@danielarndt0/cnpj-db-loader 2.2.0 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -17,6 +17,7 @@ This version focuses on the real loading workflow:
17
17
  - exact preparatory scanning for total rows and total batches before import starts
18
18
  - persisted import plans reused on resume for the same validated input and batch size
19
19
  - staged bulk loads for the large datasets through PostgreSQL COPY
20
+ - automatic materialization of `establishment_secondary_cnaes` from establishment secondary CNAE data
20
21
  - direct final-schema upserts for the smaller domain datasets
21
22
  - checkpoint-based resume by file and byte offset
22
23
  - row quarantine for invalid or constraint-breaking records without stopping the import
@@ -95,7 +96,7 @@ The import internals are now split into dedicated modules such as planner, sourc
95
96
 
96
97
  The CLI now exposes a split workflow as well: `import` runs the full pipeline, `import load` stops after staging/direct writes, `import materialize` resumes from the saved plan and pushes staged rows into the final tables, and `database cleanup ...` exposes safe maintenance commands for staging tables, simplified final materialized tables, checkpoints, and saved plans.
97
98
 
98
- Materialization progress is now checkpointed separately from file-load checkpoints, and the materializer works in resumable chunks controlled by `--materialize-batch-size`. During long final materialization steps, the CLI keeps the live progress output on a dedicated MATERIALIZING stage while reducing per-chunk checkpoint and JSONL write overhead so resumable chunks stay fast. The simplified final schema keeps raw secondary CNAE text in establishments and derives helper fields such as partner dedupe keys during materialization only when they are still stored physically in the target schema.
99
+ Materialization progress is now checkpointed separately from file-load checkpoints, and the materializer works in resumable chunks controlled by `--materialize-batch-size`. During long final materialization steps, the CLI keeps the live progress output on a dedicated MATERIALIZING stage while reducing per-chunk checkpoint and JSONL write overhead so resumable chunks stay fast. The simplified final schema keeps raw secondary CNAE text in establishments and also materializes `establishment_secondary_cnaes` so APIs can query one row per secondary CNAE without running a separate backfill script.
99
100
 
100
101
  The Federal Revenue commands write the same structured command logs and keep the remote-download phase outside the import internals. Existing completed ZIP files are skipped by default, temporary `.part` files are used while downloads are still in progress, and each reference keeps a local manifest for `status`, `retry`, `clean`, and future runner automation.
101
102
 
package/dist/cli.js CHANGED
@@ -1512,8 +1512,14 @@ function maskDatabaseLabel(url) {
1512
1512
 
1513
1513
  // src/services/database/cleanup.ts
1514
1514
  var MATERIALIZED_DATASET_TABLES = {
1515
- companies: ["simples_options", "partners", "establishments", "companies"],
1516
- establishments: ["establishments"],
1515
+ companies: [
1516
+ "simples_options",
1517
+ "partners",
1518
+ "establishment_secondary_cnaes",
1519
+ "establishments",
1520
+ "companies"
1521
+ ],
1522
+ establishments: ["establishment_secondary_cnaes", "establishments"],
1517
1523
  partners: ["partners"],
1518
1524
  simples_options: ["simples_options"]
1519
1525
  };
@@ -1557,6 +1563,22 @@ function collectMaterializedTables(dataset) {
1557
1563
  }
1558
1564
  return [...orderedTables];
1559
1565
  }
1566
+ async function tableExists3(client, tableName) {
1567
+ const result = await client.query(
1568
+ "select to_regclass(current_schema() || '.' || $1) as exists",
1569
+ [tableName]
1570
+ );
1571
+ return Boolean(result.rows[0]?.exists);
1572
+ }
1573
+ async function filterExistingTables(client, tableNames) {
1574
+ const existingTables = [];
1575
+ for (const tableName of tableNames) {
1576
+ if (await tableExists3(client, tableName)) {
1577
+ existingTables.push(tableName);
1578
+ }
1579
+ }
1580
+ return existingTables;
1581
+ }
1560
1582
  async function deleteLoadCheckpoints(client, dataset) {
1561
1583
  await ensureCheckpointTable(client);
1562
1584
  const result = dataset ? await client.query(`delete from import_checkpoints where dataset = $1`, [
@@ -1662,7 +1684,10 @@ async function cleanupDatabaseMaterializedTables(client, input2) {
1662
1684
  targetDatabase,
1663
1685
  dataset: input2.dataset
1664
1686
  });
1665
- const tableNames = collectMaterializedTables(input2.dataset);
1687
+ const tableNames = await filterExistingTables(
1688
+ client,
1689
+ collectMaterializedTables(input2.dataset)
1690
+ );
1666
1691
  if (tableNames.length > 0) {
1667
1692
  await client.query(`truncate ${tableNames.join(", ")}`);
1668
1693
  }
@@ -2261,6 +2286,7 @@ function createIndexesSql() {
2261
2286
  return [
2262
2287
  "-- Operational indexes",
2263
2288
  "create index if not exists idx_establishments_cnpj_root on establishments (cnpj_root);",
2289
+ "create index if not exists idx_establishment_secondary_cnaes_cnae_code on establishment_secondary_cnaes (cnae_code);",
2264
2290
  "create index if not exists idx_partners_cnpj_root on partners (cnpj_root);",
2265
2291
  "create index if not exists idx_import_plans_status on import_plans (status);",
2266
2292
  "create index if not exists idx_import_plans_load_status on import_plans (load_status);",
@@ -2314,6 +2340,15 @@ function createPartnersSql() {
2314
2340
  ");"
2315
2341
  ].join("\n");
2316
2342
  }
2343
+ function createEstablishmentSecondaryCnaesSql() {
2344
+ return [
2345
+ "create table if not exists establishment_secondary_cnaes (",
2346
+ " cnpj_full text not null,",
2347
+ " cnae_code text not null,",
2348
+ " primary key (cnpj_full, cnae_code)",
2349
+ ");"
2350
+ ].join("\n");
2351
+ }
2317
2352
  function createSimplesSql() {
2318
2353
  return [
2319
2354
  "create table if not exists simples_options (",
@@ -2331,6 +2366,7 @@ function createOperationalSchemaParts() {
2331
2366
  "-- Final operational tables (simplified for fast first-load materialization)",
2332
2367
  createCompaniesSql(),
2333
2368
  createEstablishmentsSql(),
2369
+ createEstablishmentSecondaryCnaesSql(),
2334
2370
  createPartnersSql(),
2335
2371
  createSimplesSql()
2336
2372
  ];
@@ -4279,6 +4315,68 @@ function buildChunkInsertSql(input2) {
4279
4315
  values: [input2.lastStagingId, input2.chunkSize]
4280
4316
  };
4281
4317
  }
4318
+ function buildEstablishmentsChunkInsertSql(input2) {
4319
+ const chunkSelectList = [
4320
+ "source.staging_id",
4321
+ ...input2.selectColumns.map((column) => `source.${column}`),
4322
+ `${buildEstablishmentCnpjFullExpression("source")} as cnpj_full`
4323
+ ].join(",\n ");
4324
+ const insertSelectList = input2.insertColumns.join(", ");
4325
+ const secondaryCnaesCtes = input2.includeSecondaryCnaesTable ? [
4326
+ ",",
4327
+ "deleted_secondary_cnaes as (",
4328
+ " delete from establishment_secondary_cnaes target",
4329
+ " using (select distinct cnpj_full from inserted_establishments) source_keys",
4330
+ " where target.cnpj_full = source_keys.cnpj_full",
4331
+ " returning 1",
4332
+ "),",
4333
+ "secondary_cnaes_source as (",
4334
+ " select distinct",
4335
+ " chunked.cnpj_full,",
4336
+ " btrim(cnae_code) as cnae_code",
4337
+ " from chunked",
4338
+ " inner join inserted_establishments inserted",
4339
+ " on inserted.cnpj_full = chunked.cnpj_full",
4340
+ " cross join lateral unnest(string_to_array(chunked.secondary_cnaes_raw, ',')) as cnae_code",
4341
+ " where chunked.secondary_cnaes_raw is not null",
4342
+ " and chunked.secondary_cnaes_raw <> ''",
4343
+ " and btrim(cnae_code) <> ''",
4344
+ "),",
4345
+ "inserted_secondary_cnaes as (",
4346
+ " insert into establishment_secondary_cnaes (cnpj_full, cnae_code)",
4347
+ " select cnpj_full, cnae_code",
4348
+ " from secondary_cnaes_source",
4349
+ " on conflict (cnpj_full, cnae_code) do nothing",
4350
+ " returning 1",
4351
+ ")"
4352
+ ] : [];
4353
+ return {
4354
+ text: [
4355
+ "with chunked as (",
4356
+ ` select
4357
+ ${chunkSelectList}`,
4358
+ " from staging_establishments source",
4359
+ " where source.staging_id > $1",
4360
+ " order by source.staging_id asc",
4361
+ " limit $2",
4362
+ "),",
4363
+ "inserted_establishments as (",
4364
+ ` insert into establishments (${input2.insertColumns.join(", ")})`,
4365
+ ` select ${insertSelectList}`,
4366
+ " from chunked",
4367
+ ...input2.conflictClause ? [input2.conflictClause] : [],
4368
+ " returning cnpj_full",
4369
+ ")",
4370
+ ...secondaryCnaesCtes,
4371
+ "select",
4372
+ " coalesce(max(staging_id), $1::bigint)::bigint as max_staging_id,",
4373
+ " count(*)::bigint as source_rows,",
4374
+ " count(*)::bigint as affected_rows",
4375
+ "from chunked;"
4376
+ ].join("\n"),
4377
+ values: [input2.lastStagingId, input2.chunkSize]
4378
+ };
4379
+ }
4282
4380
  function buildPartnersChunkInsertSql(input2) {
4283
4381
  const baseColumns = MATERIALIZATION_COLUMNS.partners;
4284
4382
  const chunkSelectList = [
@@ -4356,9 +4454,7 @@ function buildMaterializationChunkQuery(input2) {
4356
4454
  });
4357
4455
  case "establishments": {
4358
4456
  const insertColumns = input2.schemaCapabilities.includeEstablishmentCnpjFullInInsert ? [...baseColumns, "cnpj_full"] : [...baseColumns];
4359
- return buildChunkInsertSql({
4360
- stagingTable: "staging_establishments",
4361
- targetTable: "establishments",
4457
+ return buildEstablishmentsChunkInsertSql({
4362
4458
  insertColumns,
4363
4459
  selectColumns: baseColumns,
4364
4460
  conflictClause: useConflictClause ? getConflictClause(
@@ -4368,7 +4464,7 @@ function buildMaterializationChunkQuery(input2) {
4368
4464
  ) : "",
4369
4465
  lastStagingId: input2.lastStagingId,
4370
4466
  chunkSize: input2.chunkSize,
4371
- extraSelects: input2.schemaCapabilities.includeEstablishmentCnpjFullInInsert ? [`${buildEstablishmentCnpjFullExpression("source")} as cnpj_full`] : []
4467
+ includeSecondaryCnaesTable: input2.schemaCapabilities.includeEstablishmentSecondaryCnaesTable
4372
4468
  });
4373
4469
  }
4374
4470
  case "simples_options":
@@ -4819,6 +4915,57 @@ async function validateDatasetCheckpoint(input2) {
4819
4915
  targetRows
4820
4916
  };
4821
4917
  }
4918
+ async function readEstablishmentSecondaryCnaesCount(client) {
4919
+ const result = await client.query(
4920
+ `select count(*)::bigint as total_count from establishment_secondary_cnaes`
4921
+ );
4922
+ return Number.parseInt(result.rows[0]?.total_count ?? "0", 10);
4923
+ }
4924
+ async function hasEstablishmentsWithSecondaryCnaes(client) {
4925
+ const result = await client.query(
4926
+ `select exists (
4927
+ select 1
4928
+ from establishments
4929
+ where secondary_cnaes_raw is not null
4930
+ and secondary_cnaes_raw <> ''
4931
+ limit 1
4932
+ ) as exists`
4933
+ );
4934
+ return result.rows[0]?.exists ?? false;
4935
+ }
4936
+ async function backfillEstablishmentSecondaryCnaesFromFinal(input2) {
4937
+ const existingRows = await readEstablishmentSecondaryCnaesCount(input2.client);
4938
+ if (existingRows > 0) {
4939
+ return 0;
4940
+ }
4941
+ if (!await hasEstablishmentsWithSecondaryCnaes(input2.client)) {
4942
+ return 0;
4943
+ }
4944
+ const startedAt = performance4.now();
4945
+ const result = await input2.client.query(
4946
+ `insert into establishment_secondary_cnaes (cnpj_full, cnae_code)
4947
+ select distinct
4948
+ e.cnpj_full,
4949
+ btrim(cnae_code) as cnae_code
4950
+ from establishments e
4951
+ cross join lateral unnest(
4952
+ string_to_array(e.secondary_cnaes_raw, ',')
4953
+ ) as cnae_code
4954
+ where e.secondary_cnaes_raw is not null
4955
+ and e.secondary_cnaes_raw <> ''
4956
+ and btrim(cnae_code) <> ''
4957
+ on conflict (cnpj_full, cnae_code) do nothing`
4958
+ );
4959
+ const insertedRows = result.rowCount ?? 0;
4960
+ await appendJsonLinesLog(input2.progressLogPath, {
4961
+ kind: "establishment_secondary_cnaes_backfilled",
4962
+ targetTable: "establishment_secondary_cnaes",
4963
+ insertedRows,
4964
+ durationMs: performance4.now() - startedAt,
4965
+ timestamp: (/* @__PURE__ */ new Date()).toISOString()
4966
+ });
4967
+ return insertedRows;
4968
+ }
4822
4969
  async function materializeDatasetByChunks(input2) {
4823
4970
  const startedAt = performance4.now();
4824
4971
  const validationReason = "Validating the live staging row count and staging cursor before resuming materialization.";
@@ -5265,6 +5412,12 @@ async function materializeStagedDatasets(input2) {
5265
5412
  totalBatches: input2.totalBatches,
5266
5413
  completedDatasets: summary.datasets.length
5267
5414
  });
5415
+ if (dataset === "establishments" && input2.schemaCapabilities.includeEstablishmentSecondaryCnaesTable) {
5416
+ await backfillEstablishmentSecondaryCnaesFromFinal({
5417
+ client: input2.client,
5418
+ progressLogPath: input2.progressLogPath
5419
+ });
5420
+ }
5268
5421
  const tracker = input2.datasetPerformanceTrackers.get(dataset);
5269
5422
  if (tracker) {
5270
5423
  tracker.materializationDurationMs += result.durationMs;
@@ -5488,6 +5641,12 @@ function canInsertIntoColumn(rows, tableName, columnName) {
5488
5641
  }
5489
5642
  return row.is_generated.toUpperCase() !== "ALWAYS";
5490
5643
  }
5644
+ function hasRequiredColumns(rows, tableName, columnNames) {
5645
+ const availableColumns = new Set(
5646
+ rows.filter((item) => item.table_name === tableName).map((item) => item.column_name)
5647
+ );
5648
+ return columnNames.every((columnName) => availableColumns.has(columnName));
5649
+ }
5491
5650
  async function detectImportSchemaCapabilities(client) {
5492
5651
  const [columnResult, lookupConstraintResult] = await Promise.all([
5493
5652
  client.query(
@@ -5496,6 +5655,7 @@ async function detectImportSchemaCapabilities(client) {
5496
5655
  where table_schema = current_schema()
5497
5656
  and (
5498
5657
  (table_name = 'establishments' and column_name = 'cnpj_full') or
5658
+ (table_name = 'establishment_secondary_cnaes' and column_name in ('cnpj_full', 'cnae_code')) or
5499
5659
  (table_name = 'partners' and column_name = 'partner_dedupe_key')
5500
5660
  )`
5501
5661
  ),
@@ -5508,7 +5668,7 @@ async function detectImportSchemaCapabilities(client) {
5508
5668
  inner join pg_class target_table on target_table.oid = constraint_item.confrelid
5509
5669
  where constraint_item.contype = 'f'
5510
5670
  and source_namespace.nspname = current_schema()
5511
- and source_table.relname in ('companies', 'establishments', 'partners')
5671
+ and source_table.relname in ('companies', 'establishments', 'partners', 'establishment_secondary_cnaes')
5512
5672
  and target_table.relname in (
5513
5673
  'countries',
5514
5674
  'cities',
@@ -5531,6 +5691,11 @@ async function detectImportSchemaCapabilities(client) {
5531
5691
  "establishments",
5532
5692
  "cnpj_full"
5533
5693
  ),
5694
+ includeEstablishmentSecondaryCnaesTable: hasRequiredColumns(
5695
+ columnResult.rows,
5696
+ "establishment_secondary_cnaes",
5697
+ ["cnpj_full", "cnae_code"]
5698
+ ),
5534
5699
  includePartnerDedupeKeyInInsert: canInsertIntoColumn(
5535
5700
  columnResult.rows,
5536
5701
  "partners",