@danielarndt0/cnpj-db-loader 2.2.1 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -144,6 +144,7 @@ type ImportPlanRecord = {
144
144
  };
/**
 * Boolean flags describing which optional schema features the import may rely on.
 * NOTE(review): presumably the result shape of detectImportSchemaCapabilities - confirm
 * against the full declaration file.
 */
145
145
  type ImportSchemaCapabilities = {
146
146
  includeEstablishmentCnpjFullInInsert: boolean;
147
// New in 2.3.0. In dist/index.js this flag is computed by checking that
// establishment_secondary_cnaes exposes both the cnpj_full and cnae_code columns.
+ includeEstablishmentSecondaryCnaesTable: boolean;
147
148
  includePartnerDedupeKeyInInsert: boolean;
148
149
  requiresLookupReconciliation: boolean;
149
150
  };
package/dist/index.js CHANGED
@@ -1499,8 +1499,14 @@ function maskDatabaseLabel(url) {
1499
1499
 
1500
1500
  // src/services/database/cleanup.ts
// Materialized (final) tables associated with each dataset key.
// NOTE(review): presumably read by collectMaterializedTables, whose result
// cleanupDatabaseMaterializedTables passes to a single truncate statement - confirm.
1501
1501
  var MATERIALIZED_DATASET_TABLES = {
1502
- companies: ["simples_options", "partners", "establishments", "companies"],
1503
- establishments: ["establishments"],
1502
+ companies: [
1503
+ "simples_options",
1504
+ "partners",
1505
// 2.3.0 lists establishment_secondary_cnaes directly before establishments
// in every entry that contains establishments.
+ "establishment_secondary_cnaes",
1506
+ "establishments",
1507
+ "companies"
1508
+ ],
1509
+ establishments: ["establishment_secondary_cnaes", "establishments"],
1504
1510
  partners: ["partners"],
1505
1511
  simples_options: ["simples_options"]
1506
1512
  };
@@ -1544,6 +1550,22 @@ function collectMaterializedTables(dataset) {
1544
1550
  }
1545
1551
  return [...orderedTables];
1546
1552
  }
1553
+ async function tableExists3(client, tableName) {
1554
+ const result = await client.query(
1555
+ "select to_regclass(current_schema() || '.' || $1) as exists",
1556
+ [tableName]
1557
+ );
1558
+ return Boolean(result.rows[0]?.exists);
1559
+ }
/**
 * Narrows `tableNames` to the entries for which tableExists3 reports true,
 * keeping their original order. Tables are probed one at a time.
 *
 * @param client - database client forwarded to tableExists3
 * @param tableNames - candidate table names
 * @returns a new array holding only the names that exist
 */
async function filterExistingTables(client, tableNames) {
  const kept = [];
  for (const candidate of tableNames) {
    const found = await tableExists3(client, candidate);
    if (!found) {
      continue;
    }
    kept.push(candidate);
  }
  return kept;
}
1547
1569
  async function deleteLoadCheckpoints(client, dataset) {
1548
1570
  await ensureCheckpointTable(client);
1549
1571
  const result = dataset ? await client.query(`delete from import_checkpoints where dataset = $1`, [
@@ -1649,7 +1671,10 @@ async function cleanupDatabaseMaterializedTables(client, input) {
1649
1671
  targetDatabase,
1650
1672
  dataset: input.dataset
1651
1673
  });
1652
- const tableNames = collectMaterializedTables(input.dataset);
1674
+ const tableNames = await filterExistingTables(
1675
+ client,
1676
+ collectMaterializedTables(input.dataset)
1677
+ );
1653
1678
  if (tableNames.length > 0) {
1654
1679
  await client.query(`truncate ${tableNames.join(", ")}`);
1655
1680
  }
@@ -2280,6 +2305,7 @@ function createIndexesSql() {
2280
2305
  return [
2281
2306
  "-- Operational indexes",
2282
2307
  "create index if not exists idx_establishments_cnpj_root on establishments (cnpj_root);",
2308
+ "create index if not exists idx_establishment_secondary_cnaes_cnae_code on establishment_secondary_cnaes (cnae_code);",
2283
2309
  "create index if not exists idx_partners_cnpj_root on partners (cnpj_root);",
2284
2310
  "create index if not exists idx_import_plans_status on import_plans (status);",
2285
2311
  "create index if not exists idx_import_plans_load_status on import_plans (load_status);",
@@ -2333,6 +2359,15 @@ function createPartnersSql() {
2333
2359
  ");"
2334
2360
  ].join("\n");
2335
2361
  }
/**
 * Returns the idempotent DDL for establishment_secondary_cnaes: one row per
 * (cnpj_full, cnae_code) pair, with that pair as the primary key.
 *
 * @returns the `create table if not exists` statement as a single string
 */
function createEstablishmentSecondaryCnaesSql() {
  const columnDefinitions = [
    " cnpj_full text not null",
    " cnae_code text not null",
    " primary key (cnpj_full, cnae_code)"
  ];
  const header = "create table if not exists establishment_secondary_cnaes (";
  return `${header}\n${columnDefinitions.join(",\n")}\n);`;
}
2336
2371
  function createSimplesSql() {
2337
2372
  return [
2338
2373
  "create table if not exists simples_options (",
@@ -2350,6 +2385,7 @@ function createOperationalSchemaParts() {
2350
2385
  "-- Final operational tables (simplified for fast first-load materialization)",
2351
2386
  createCompaniesSql(),
2352
2387
  createEstablishmentsSql(),
2388
+ createEstablishmentSecondaryCnaesSql(),
2353
2389
  createPartnersSql(),
2354
2390
  createSimplesSql()
2355
2391
  ];
@@ -4311,6 +4347,68 @@ function buildChunkInsertSql(input) {
4311
4347
  values: [input.lastStagingId, input.chunkSize]
4312
4348
  };
4313
4349
  }
/**
 * Builds the parameterized SQL statement that copies one chunk of
 * staging_establishments rows into establishments and, when
 * input.includeSecondaryCnaesTable is truthy, also rewrites the matching rows
 * of establishment_secondary_cnaes inside the same statement.
 *
 * Fields read from input: selectColumns, insertColumns, conflictClause,
 * includeSecondaryCnaesTable, lastStagingId, chunkSize.
 *
 * Returns an object { text, values } where values is [lastStagingId, chunkSize]:
 * $1 is the exclusive lower bound on staging_id and $2 is the chunk row limit.
 *
 * NOTE(review): the delete and the insert on establishment_secondary_cnaes are
 * sibling CTEs of one statement; confirm the intended outcome when a cnpj_full
 * already has rows in that table.
 */
4350
+ function buildEstablishmentsChunkInsertSql(input) {
// The chunk always carries staging_id, every select column and a computed
// cnpj_full, whether or not cnpj_full is part of insertColumns.
4351
+ const chunkSelectList = [
4352
+ "source.staging_id",
4353
+ ...input.selectColumns.map((column) => `source.${column}`),
4354
+ `${buildEstablishmentCnpjFullExpression("source")} as cnpj_full`
4355
+ ].join(",\n ");
4356
+ const insertSelectList = input.insertColumns.join(", ");
// Optional CTEs: remove existing codes for the cnpj_full values returned by the
// establishments insert, split secondary_cnaes_raw on commas, then insert the
// trimmed, non-empty, distinct codes.
4357
+ const secondaryCnaesCtes = input.includeSecondaryCnaesTable ? [
4358
+ ",",
4359
+ "deleted_secondary_cnaes as (",
4360
+ " delete from establishment_secondary_cnaes target",
4361
+ " using (select distinct cnpj_full from inserted_establishments) source_keys",
4362
+ " where target.cnpj_full = source_keys.cnpj_full",
4363
+ " returning 1",
4364
+ "),",
4365
+ "secondary_cnaes_source as (",
4366
+ " select distinct",
4367
+ " chunked.cnpj_full,",
4368
+ " btrim(cnae_code) as cnae_code",
4369
+ " from chunked",
4370
+ " inner join inserted_establishments inserted",
4371
+ " on inserted.cnpj_full = chunked.cnpj_full",
4372
+ " cross join lateral unnest(string_to_array(chunked.secondary_cnaes_raw, ',')) as cnae_code",
4373
+ " where chunked.secondary_cnaes_raw is not null",
4374
+ " and chunked.secondary_cnaes_raw <> ''",
4375
+ " and btrim(cnae_code) <> ''",
4376
+ "),",
4377
+ "inserted_secondary_cnaes as (",
4378
+ " insert into establishment_secondary_cnaes (cnpj_full, cnae_code)",
4379
+ " select cnpj_full, cnae_code",
4380
+ " from secondary_cnaes_source",
4381
+ " on conflict (cnpj_full, cnae_code) do nothing",
4382
+ " returning 1",
4383
+ ")"
4384
+ ] : [];
4385
+ return {
4386
+ text: [
4387
+ "with chunked as (",
4388
+ ` select
4389
+ ${chunkSelectList}`,
4390
+ " from staging_establishments source",
4391
+ " where source.staging_id > $1",
4392
+ " order by source.staging_id asc",
4393
+ " limit $2",
4394
+ "),",
4395
+ "inserted_establishments as (",
4396
+ ` insert into establishments (${input.insertColumns.join(", ")})`,
4397
+ ` select ${insertSelectList}`,
4398
+ " from chunked",
// The conflict clause line is emitted only when input.conflictClause is truthy.
4399
+ ...input.conflictClause ? [input.conflictClause] : [],
4400
+ " returning cnpj_full",
4401
+ ")",
4402
+ ...secondaryCnaesCtes,
// source_rows and affected_rows are both the chunk row count; affected_rows
// does not reflect how many rows the insert actually wrote.
4403
+ "select",
4404
+ " coalesce(max(staging_id), $1::bigint)::bigint as max_staging_id,",
4405
+ " count(*)::bigint as source_rows,",
4406
+ " count(*)::bigint as affected_rows",
4407
+ "from chunked;"
4408
+ ].join("\n"),
4409
+ values: [input.lastStagingId, input.chunkSize]
4410
+ };
4411
+ }
4314
4412
  function buildPartnersChunkInsertSql(input) {
4315
4413
  const baseColumns = MATERIALIZATION_COLUMNS.partners;
4316
4414
  const chunkSelectList = [
@@ -4388,9 +4486,7 @@ function buildMaterializationChunkQuery(input) {
4388
4486
  });
4389
4487
  case "establishments": {
4390
4488
  const insertColumns = input.schemaCapabilities.includeEstablishmentCnpjFullInInsert ? [...baseColumns, "cnpj_full"] : [...baseColumns];
4391
- return buildChunkInsertSql({
4392
- stagingTable: "staging_establishments",
4393
- targetTable: "establishments",
4489
+ return buildEstablishmentsChunkInsertSql({
4394
4490
  insertColumns,
4395
4491
  selectColumns: baseColumns,
4396
4492
  conflictClause: useConflictClause ? getConflictClause(
@@ -4400,7 +4496,7 @@ function buildMaterializationChunkQuery(input) {
4400
4496
  ) : "",
4401
4497
  lastStagingId: input.lastStagingId,
4402
4498
  chunkSize: input.chunkSize,
4403
- extraSelects: input.schemaCapabilities.includeEstablishmentCnpjFullInInsert ? [`${buildEstablishmentCnpjFullExpression("source")} as cnpj_full`] : []
4499
+ includeSecondaryCnaesTable: input.schemaCapabilities.includeEstablishmentSecondaryCnaesTable
4404
4500
  });
4405
4501
  }
4406
4502
  case "simples_options":
@@ -4851,6 +4947,57 @@ async function validateDatasetCheckpoint(input) {
4851
4947
  targetRows
4852
4948
  };
4853
4949
  }
4950
+ async function readEstablishmentSecondaryCnaesCount(client) {
4951
+ const result = await client.query(
4952
+ `select count(*)::bigint as total_count from establishment_secondary_cnaes`
4953
+ );
4954
+ return Number.parseInt(result.rows[0]?.total_count ?? "0", 10);
4955
+ }
4956
+ async function hasEstablishmentsWithSecondaryCnaes(client) {
4957
+ const result = await client.query(
4958
+ `select exists (
4959
+ select 1
4960
+ from establishments
4961
+ where secondary_cnaes_raw is not null
4962
+ and secondary_cnaes_raw <> ''
4963
+ limit 1
4964
+ ) as exists`
4965
+ );
4966
+ return result.rows[0]?.exists ?? false;
4967
+ }
/**
 * Populates establishment_secondary_cnaes from establishments.secondary_cnaes_raw
 * when the target table is still empty.
 *
 * Nothing is written (and 0 is returned) when the target table already has at
 * least one row, or when no establishments row carries a non-empty
 * secondary_cnaes_raw. Otherwise every comma-separated code is trimmed,
 * de-duplicated and inserted, and one JSON-lines progress entry is appended.
 *
 * @param input - `{ client, progressLogPath }`: the database client used for
 *   all queries and the path handed to appendJsonLinesLog
 * @returns the number of rows inserted (result.rowCount, or 0 when absent)
 */
async function backfillEstablishmentSecondaryCnaesFromFinal(input) {
  // Only emptiness matters here, so probe with EXISTS instead of counting every
  // row: the table can be very large and this runs after each establishments load.
  const populatedProbe = await input.client.query(
    "select exists (select 1 from establishment_secondary_cnaes) as exists"
  );
  if (populatedProbe.rows[0]?.exists) {
    return 0;
  }
  if (!await hasEstablishmentsWithSecondaryCnaes(input.client)) {
    return 0;
  }
  const startedAt = performance4.now();
  const result = await input.client.query(
    `insert into establishment_secondary_cnaes (cnpj_full, cnae_code)
    select distinct
      e.cnpj_full,
      btrim(cnae_code) as cnae_code
    from establishments e
    cross join lateral unnest(
      string_to_array(e.secondary_cnaes_raw, ',')
    ) as cnae_code
    where e.secondary_cnaes_raw is not null
      and e.secondary_cnaes_raw <> ''
      and btrim(cnae_code) <> ''
    on conflict (cnpj_full, cnae_code) do nothing`
  );
  const insertedRows = result.rowCount ?? 0;
  await appendJsonLinesLog(input.progressLogPath, {
    kind: "establishment_secondary_cnaes_backfilled",
    targetTable: "establishment_secondary_cnaes",
    insertedRows,
    durationMs: performance4.now() - startedAt,
    timestamp: (/* @__PURE__ */ new Date()).toISOString()
  });
  return insertedRows;
}
4854
5001
  async function materializeDatasetByChunks(input) {
4855
5002
  const startedAt = performance4.now();
4856
5003
  const validationReason = "Validating the live staging row count and staging cursor before resuming materialization.";
@@ -5297,6 +5444,12 @@ async function materializeStagedDatasets(input) {
5297
5444
  totalBatches: input.totalBatches,
5298
5445
  completedDatasets: summary.datasets.length
5299
5446
  });
5447
+ if (dataset === "establishments" && input.schemaCapabilities.includeEstablishmentSecondaryCnaesTable) {
5448
+ await backfillEstablishmentSecondaryCnaesFromFinal({
5449
+ client: input.client,
5450
+ progressLogPath: input.progressLogPath
5451
+ });
5452
+ }
5300
5453
  const tracker = input.datasetPerformanceTrackers.get(dataset);
5301
5454
  if (tracker) {
5302
5455
  tracker.materializationDurationMs += result.durationMs;
@@ -5520,6 +5673,12 @@ function canInsertIntoColumn(rows, tableName, columnName) {
5520
5673
  }
5521
5674
  return row.is_generated.toUpperCase() !== "ALWAYS";
5522
5675
  }
/**
 * Tells whether every name in `columnNames` appears among the catalog rows
 * that belong to `tableName`.
 *
 * @param rows - catalog rows carrying `table_name` and `column_name`
 * @param tableName - table whose columns are inspected
 * @param columnNames - column names that must all be present
 * @returns true when no required column is missing (including when
 *   `columnNames` is empty), false otherwise
 */
function hasRequiredColumns(rows, tableName, columnNames) {
  const present = new Set();
  rows.forEach((row) => {
    if (row.table_name === tableName) {
      present.add(row.column_name);
    }
  });
  const missingAny = columnNames.some((required) => !present.has(required));
  return !missingAny;
}
5523
5682
  async function detectImportSchemaCapabilities(client) {
5524
5683
  const [columnResult, lookupConstraintResult] = await Promise.all([
5525
5684
  client.query(
@@ -5528,6 +5687,7 @@ async function detectImportSchemaCapabilities(client) {
5528
5687
  where table_schema = current_schema()
5529
5688
  and (
5530
5689
  (table_name = 'establishments' and column_name = 'cnpj_full') or
5690
+ (table_name = 'establishment_secondary_cnaes' and column_name in ('cnpj_full', 'cnae_code')) or
5531
5691
  (table_name = 'partners' and column_name = 'partner_dedupe_key')
5532
5692
  )`
5533
5693
  ),
@@ -5540,7 +5700,7 @@ async function detectImportSchemaCapabilities(client) {
5540
5700
  inner join pg_class target_table on target_table.oid = constraint_item.confrelid
5541
5701
  where constraint_item.contype = 'f'
5542
5702
  and source_namespace.nspname = current_schema()
5543
- and source_table.relname in ('companies', 'establishments', 'partners')
5703
+ and source_table.relname in ('companies', 'establishments', 'partners', 'establishment_secondary_cnaes')
5544
5704
  and target_table.relname in (
5545
5705
  'countries',
5546
5706
  'cities',
@@ -5563,6 +5723,11 @@ async function detectImportSchemaCapabilities(client) {
5563
5723
  "establishments",
5564
5724
  "cnpj_full"
5565
5725
  ),
5726
+ includeEstablishmentSecondaryCnaesTable: hasRequiredColumns(
5727
+ columnResult.rows,
5728
+ "establishment_secondary_cnaes",
5729
+ ["cnpj_full", "cnae_code"]
5730
+ ),
5566
5731
  includePartnerDedupeKeyInInsert: canInsertIntoColumn(
5567
5732
  columnResult.rows,
5568
5733
  "partners",