@danielarndt0/cnpj-db-loader 2.2.0 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -1
- package/dist/cli.js +173 -8
- package/dist/cli.js.map +1 -1
- package/dist/index.d.ts +1 -0
- package/dist/index.js +173 -8
- package/dist/index.js.map +1 -1
- package/docs/architecture.md +1 -1
- package/docs/commands.md +1 -1
- package/docs/usage.md +1 -0
- package/package.json +11 -1
package/dist/index.d.ts
CHANGED
|
@@ -144,6 +144,7 @@ type ImportPlanRecord = {
|
|
|
144
144
|
};
|
|
145
145
|
type ImportSchemaCapabilities = {
|
|
146
146
|
includeEstablishmentCnpjFullInInsert: boolean;
|
|
147
|
+
includeEstablishmentSecondaryCnaesTable: boolean;
|
|
147
148
|
includePartnerDedupeKeyInInsert: boolean;
|
|
148
149
|
requiresLookupReconciliation: boolean;
|
|
149
150
|
};
|
package/dist/index.js
CHANGED
|
@@ -1499,8 +1499,14 @@ function maskDatabaseLabel(url) {
|
|
|
1499
1499
|
|
|
1500
1500
|
// src/services/database/cleanup.ts
|
|
1501
1501
|
var MATERIALIZED_DATASET_TABLES = {
|
|
1502
|
-
companies: [
|
|
1503
|
-
|
|
1502
|
+
companies: [
|
|
1503
|
+
"simples_options",
|
|
1504
|
+
"partners",
|
|
1505
|
+
"establishment_secondary_cnaes",
|
|
1506
|
+
"establishments",
|
|
1507
|
+
"companies"
|
|
1508
|
+
],
|
|
1509
|
+
establishments: ["establishment_secondary_cnaes", "establishments"],
|
|
1504
1510
|
partners: ["partners"],
|
|
1505
1511
|
simples_options: ["simples_options"]
|
|
1506
1512
|
};
|
|
@@ -1544,6 +1550,22 @@ function collectMaterializedTables(dataset) {
|
|
|
1544
1550
|
}
|
|
1545
1551
|
return [...orderedTables];
|
|
1546
1552
|
}
|
|
1553
|
+
async function tableExists3(client, tableName) {
|
|
1554
|
+
const result = await client.query(
|
|
1555
|
+
"select to_regclass(current_schema() || '.' || $1) as exists",
|
|
1556
|
+
[tableName]
|
|
1557
|
+
);
|
|
1558
|
+
return Boolean(result.rows[0]?.exists);
|
|
1559
|
+
}
|
|
1560
|
+
async function filterExistingTables(client, tableNames) {
|
|
1561
|
+
const existingTables = [];
|
|
1562
|
+
for (const tableName of tableNames) {
|
|
1563
|
+
if (await tableExists3(client, tableName)) {
|
|
1564
|
+
existingTables.push(tableName);
|
|
1565
|
+
}
|
|
1566
|
+
}
|
|
1567
|
+
return existingTables;
|
|
1568
|
+
}
|
|
1547
1569
|
async function deleteLoadCheckpoints(client, dataset) {
|
|
1548
1570
|
await ensureCheckpointTable(client);
|
|
1549
1571
|
const result = dataset ? await client.query(`delete from import_checkpoints where dataset = $1`, [
|
|
@@ -1649,7 +1671,10 @@ async function cleanupDatabaseMaterializedTables(client, input) {
|
|
|
1649
1671
|
targetDatabase,
|
|
1650
1672
|
dataset: input.dataset
|
|
1651
1673
|
});
|
|
1652
|
-
const tableNames =
|
|
1674
|
+
const tableNames = await filterExistingTables(
|
|
1675
|
+
client,
|
|
1676
|
+
collectMaterializedTables(input.dataset)
|
|
1677
|
+
);
|
|
1653
1678
|
if (tableNames.length > 0) {
|
|
1654
1679
|
await client.query(`truncate ${tableNames.join(", ")}`);
|
|
1655
1680
|
}
|
|
@@ -2280,6 +2305,7 @@ function createIndexesSql() {
|
|
|
2280
2305
|
return [
|
|
2281
2306
|
"-- Operational indexes",
|
|
2282
2307
|
"create index if not exists idx_establishments_cnpj_root on establishments (cnpj_root);",
|
|
2308
|
+
"create index if not exists idx_establishment_secondary_cnaes_cnae_code on establishment_secondary_cnaes (cnae_code);",
|
|
2283
2309
|
"create index if not exists idx_partners_cnpj_root on partners (cnpj_root);",
|
|
2284
2310
|
"create index if not exists idx_import_plans_status on import_plans (status);",
|
|
2285
2311
|
"create index if not exists idx_import_plans_load_status on import_plans (load_status);",
|
|
@@ -2333,6 +2359,15 @@ function createPartnersSql() {
|
|
|
2333
2359
|
");"
|
|
2334
2360
|
].join("\n");
|
|
2335
2361
|
}
|
|
2362
|
+
function createEstablishmentSecondaryCnaesSql() {
|
|
2363
|
+
return [
|
|
2364
|
+
"create table if not exists establishment_secondary_cnaes (",
|
|
2365
|
+
" cnpj_full text not null,",
|
|
2366
|
+
" cnae_code text not null,",
|
|
2367
|
+
" primary key (cnpj_full, cnae_code)",
|
|
2368
|
+
");"
|
|
2369
|
+
].join("\n");
|
|
2370
|
+
}
|
|
2336
2371
|
function createSimplesSql() {
|
|
2337
2372
|
return [
|
|
2338
2373
|
"create table if not exists simples_options (",
|
|
@@ -2350,6 +2385,7 @@ function createOperationalSchemaParts() {
|
|
|
2350
2385
|
"-- Final operational tables (simplified for fast first-load materialization)",
|
|
2351
2386
|
createCompaniesSql(),
|
|
2352
2387
|
createEstablishmentsSql(),
|
|
2388
|
+
createEstablishmentSecondaryCnaesSql(),
|
|
2353
2389
|
createPartnersSql(),
|
|
2354
2390
|
createSimplesSql()
|
|
2355
2391
|
];
|
|
@@ -4311,6 +4347,68 @@ function buildChunkInsertSql(input) {
|
|
|
4311
4347
|
values: [input.lastStagingId, input.chunkSize]
|
|
4312
4348
|
};
|
|
4313
4349
|
}
|
|
4350
|
+
function buildEstablishmentsChunkInsertSql(input) {
|
|
4351
|
+
const chunkSelectList = [
|
|
4352
|
+
"source.staging_id",
|
|
4353
|
+
...input.selectColumns.map((column) => `source.${column}`),
|
|
4354
|
+
`${buildEstablishmentCnpjFullExpression("source")} as cnpj_full`
|
|
4355
|
+
].join(",\n ");
|
|
4356
|
+
const insertSelectList = input.insertColumns.join(", ");
|
|
4357
|
+
const secondaryCnaesCtes = input.includeSecondaryCnaesTable ? [
|
|
4358
|
+
",",
|
|
4359
|
+
"deleted_secondary_cnaes as (",
|
|
4360
|
+
" delete from establishment_secondary_cnaes target",
|
|
4361
|
+
" using (select distinct cnpj_full from inserted_establishments) source_keys",
|
|
4362
|
+
" where target.cnpj_full = source_keys.cnpj_full",
|
|
4363
|
+
" returning 1",
|
|
4364
|
+
"),",
|
|
4365
|
+
"secondary_cnaes_source as (",
|
|
4366
|
+
" select distinct",
|
|
4367
|
+
" chunked.cnpj_full,",
|
|
4368
|
+
" btrim(cnae_code) as cnae_code",
|
|
4369
|
+
" from chunked",
|
|
4370
|
+
" inner join inserted_establishments inserted",
|
|
4371
|
+
" on inserted.cnpj_full = chunked.cnpj_full",
|
|
4372
|
+
" cross join lateral unnest(string_to_array(chunked.secondary_cnaes_raw, ',')) as cnae_code",
|
|
4373
|
+
" where chunked.secondary_cnaes_raw is not null",
|
|
4374
|
+
" and chunked.secondary_cnaes_raw <> ''",
|
|
4375
|
+
" and btrim(cnae_code) <> ''",
|
|
4376
|
+
"),",
|
|
4377
|
+
"inserted_secondary_cnaes as (",
|
|
4378
|
+
" insert into establishment_secondary_cnaes (cnpj_full, cnae_code)",
|
|
4379
|
+
" select cnpj_full, cnae_code",
|
|
4380
|
+
" from secondary_cnaes_source",
|
|
4381
|
+
" on conflict (cnpj_full, cnae_code) do nothing",
|
|
4382
|
+
" returning 1",
|
|
4383
|
+
")"
|
|
4384
|
+
] : [];
|
|
4385
|
+
return {
|
|
4386
|
+
text: [
|
|
4387
|
+
"with chunked as (",
|
|
4388
|
+
` select
|
|
4389
|
+
${chunkSelectList}`,
|
|
4390
|
+
" from staging_establishments source",
|
|
4391
|
+
" where source.staging_id > $1",
|
|
4392
|
+
" order by source.staging_id asc",
|
|
4393
|
+
" limit $2",
|
|
4394
|
+
"),",
|
|
4395
|
+
"inserted_establishments as (",
|
|
4396
|
+
` insert into establishments (${input.insertColumns.join(", ")})`,
|
|
4397
|
+
` select ${insertSelectList}`,
|
|
4398
|
+
" from chunked",
|
|
4399
|
+
...input.conflictClause ? [input.conflictClause] : [],
|
|
4400
|
+
" returning cnpj_full",
|
|
4401
|
+
")",
|
|
4402
|
+
...secondaryCnaesCtes,
|
|
4403
|
+
"select",
|
|
4404
|
+
" coalesce(max(staging_id), $1::bigint)::bigint as max_staging_id,",
|
|
4405
|
+
" count(*)::bigint as source_rows,",
|
|
4406
|
+
" count(*)::bigint as affected_rows",
|
|
4407
|
+
"from chunked;"
|
|
4408
|
+
].join("\n"),
|
|
4409
|
+
values: [input.lastStagingId, input.chunkSize]
|
|
4410
|
+
};
|
|
4411
|
+
}
|
|
4314
4412
|
function buildPartnersChunkInsertSql(input) {
|
|
4315
4413
|
const baseColumns = MATERIALIZATION_COLUMNS.partners;
|
|
4316
4414
|
const chunkSelectList = [
|
|
@@ -4388,9 +4486,7 @@ function buildMaterializationChunkQuery(input) {
|
|
|
4388
4486
|
});
|
|
4389
4487
|
case "establishments": {
|
|
4390
4488
|
const insertColumns = input.schemaCapabilities.includeEstablishmentCnpjFullInInsert ? [...baseColumns, "cnpj_full"] : [...baseColumns];
|
|
4391
|
-
return
|
|
4392
|
-
stagingTable: "staging_establishments",
|
|
4393
|
-
targetTable: "establishments",
|
|
4489
|
+
return buildEstablishmentsChunkInsertSql({
|
|
4394
4490
|
insertColumns,
|
|
4395
4491
|
selectColumns: baseColumns,
|
|
4396
4492
|
conflictClause: useConflictClause ? getConflictClause(
|
|
@@ -4400,7 +4496,7 @@ function buildMaterializationChunkQuery(input) {
|
|
|
4400
4496
|
) : "",
|
|
4401
4497
|
lastStagingId: input.lastStagingId,
|
|
4402
4498
|
chunkSize: input.chunkSize,
|
|
4403
|
-
|
|
4499
|
+
includeSecondaryCnaesTable: input.schemaCapabilities.includeEstablishmentSecondaryCnaesTable
|
|
4404
4500
|
});
|
|
4405
4501
|
}
|
|
4406
4502
|
case "simples_options":
|
|
@@ -4851,6 +4947,57 @@ async function validateDatasetCheckpoint(input) {
|
|
|
4851
4947
|
targetRows
|
|
4852
4948
|
};
|
|
4853
4949
|
}
|
|
4950
|
+
async function readEstablishmentSecondaryCnaesCount(client) {
|
|
4951
|
+
const result = await client.query(
|
|
4952
|
+
`select count(*)::bigint as total_count from establishment_secondary_cnaes`
|
|
4953
|
+
);
|
|
4954
|
+
return Number.parseInt(result.rows[0]?.total_count ?? "0", 10);
|
|
4955
|
+
}
|
|
4956
|
+
async function hasEstablishmentsWithSecondaryCnaes(client) {
|
|
4957
|
+
const result = await client.query(
|
|
4958
|
+
`select exists (
|
|
4959
|
+
select 1
|
|
4960
|
+
from establishments
|
|
4961
|
+
where secondary_cnaes_raw is not null
|
|
4962
|
+
and secondary_cnaes_raw <> ''
|
|
4963
|
+
limit 1
|
|
4964
|
+
) as exists`
|
|
4965
|
+
);
|
|
4966
|
+
return result.rows[0]?.exists ?? false;
|
|
4967
|
+
}
|
|
4968
|
+
async function backfillEstablishmentSecondaryCnaesFromFinal(input) {
|
|
4969
|
+
const existingRows = await readEstablishmentSecondaryCnaesCount(input.client);
|
|
4970
|
+
if (existingRows > 0) {
|
|
4971
|
+
return 0;
|
|
4972
|
+
}
|
|
4973
|
+
if (!await hasEstablishmentsWithSecondaryCnaes(input.client)) {
|
|
4974
|
+
return 0;
|
|
4975
|
+
}
|
|
4976
|
+
const startedAt = performance4.now();
|
|
4977
|
+
const result = await input.client.query(
|
|
4978
|
+
`insert into establishment_secondary_cnaes (cnpj_full, cnae_code)
|
|
4979
|
+
select distinct
|
|
4980
|
+
e.cnpj_full,
|
|
4981
|
+
btrim(cnae_code) as cnae_code
|
|
4982
|
+
from establishments e
|
|
4983
|
+
cross join lateral unnest(
|
|
4984
|
+
string_to_array(e.secondary_cnaes_raw, ',')
|
|
4985
|
+
) as cnae_code
|
|
4986
|
+
where e.secondary_cnaes_raw is not null
|
|
4987
|
+
and e.secondary_cnaes_raw <> ''
|
|
4988
|
+
and btrim(cnae_code) <> ''
|
|
4989
|
+
on conflict (cnpj_full, cnae_code) do nothing`
|
|
4990
|
+
);
|
|
4991
|
+
const insertedRows = result.rowCount ?? 0;
|
|
4992
|
+
await appendJsonLinesLog(input.progressLogPath, {
|
|
4993
|
+
kind: "establishment_secondary_cnaes_backfilled",
|
|
4994
|
+
targetTable: "establishment_secondary_cnaes",
|
|
4995
|
+
insertedRows,
|
|
4996
|
+
durationMs: performance4.now() - startedAt,
|
|
4997
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
4998
|
+
});
|
|
4999
|
+
return insertedRows;
|
|
5000
|
+
}
|
|
4854
5001
|
async function materializeDatasetByChunks(input) {
|
|
4855
5002
|
const startedAt = performance4.now();
|
|
4856
5003
|
const validationReason = "Validating the live staging row count and staging cursor before resuming materialization.";
|
|
@@ -5297,6 +5444,12 @@ async function materializeStagedDatasets(input) {
|
|
|
5297
5444
|
totalBatches: input.totalBatches,
|
|
5298
5445
|
completedDatasets: summary.datasets.length
|
|
5299
5446
|
});
|
|
5447
|
+
if (dataset === "establishments" && input.schemaCapabilities.includeEstablishmentSecondaryCnaesTable) {
|
|
5448
|
+
await backfillEstablishmentSecondaryCnaesFromFinal({
|
|
5449
|
+
client: input.client,
|
|
5450
|
+
progressLogPath: input.progressLogPath
|
|
5451
|
+
});
|
|
5452
|
+
}
|
|
5300
5453
|
const tracker = input.datasetPerformanceTrackers.get(dataset);
|
|
5301
5454
|
if (tracker) {
|
|
5302
5455
|
tracker.materializationDurationMs += result.durationMs;
|
|
@@ -5520,6 +5673,12 @@ function canInsertIntoColumn(rows, tableName, columnName) {
|
|
|
5520
5673
|
}
|
|
5521
5674
|
return row.is_generated.toUpperCase() !== "ALWAYS";
|
|
5522
5675
|
}
|
|
5676
|
+
function hasRequiredColumns(rows, tableName, columnNames) {
|
|
5677
|
+
const availableColumns = new Set(
|
|
5678
|
+
rows.filter((item) => item.table_name === tableName).map((item) => item.column_name)
|
|
5679
|
+
);
|
|
5680
|
+
return columnNames.every((columnName) => availableColumns.has(columnName));
|
|
5681
|
+
}
|
|
5523
5682
|
async function detectImportSchemaCapabilities(client) {
|
|
5524
5683
|
const [columnResult, lookupConstraintResult] = await Promise.all([
|
|
5525
5684
|
client.query(
|
|
@@ -5528,6 +5687,7 @@ async function detectImportSchemaCapabilities(client) {
|
|
|
5528
5687
|
where table_schema = current_schema()
|
|
5529
5688
|
and (
|
|
5530
5689
|
(table_name = 'establishments' and column_name = 'cnpj_full') or
|
|
5690
|
+
(table_name = 'establishment_secondary_cnaes' and column_name in ('cnpj_full', 'cnae_code')) or
|
|
5531
5691
|
(table_name = 'partners' and column_name = 'partner_dedupe_key')
|
|
5532
5692
|
)`
|
|
5533
5693
|
),
|
|
@@ -5540,7 +5700,7 @@ async function detectImportSchemaCapabilities(client) {
|
|
|
5540
5700
|
inner join pg_class target_table on target_table.oid = constraint_item.confrelid
|
|
5541
5701
|
where constraint_item.contype = 'f'
|
|
5542
5702
|
and source_namespace.nspname = current_schema()
|
|
5543
|
-
and source_table.relname in ('companies', 'establishments', 'partners')
|
|
5703
|
+
and source_table.relname in ('companies', 'establishments', 'partners', 'establishment_secondary_cnaes')
|
|
5544
5704
|
and target_table.relname in (
|
|
5545
5705
|
'countries',
|
|
5546
5706
|
'cities',
|
|
@@ -5563,6 +5723,11 @@ async function detectImportSchemaCapabilities(client) {
|
|
|
5563
5723
|
"establishments",
|
|
5564
5724
|
"cnpj_full"
|
|
5565
5725
|
),
|
|
5726
|
+
includeEstablishmentSecondaryCnaesTable: hasRequiredColumns(
|
|
5727
|
+
columnResult.rows,
|
|
5728
|
+
"establishment_secondary_cnaes",
|
|
5729
|
+
["cnpj_full", "cnae_code"]
|
|
5730
|
+
),
|
|
5566
5731
|
includePartnerDedupeKeyInInsert: canInsertIntoColumn(
|
|
5567
5732
|
columnResult.rows,
|
|
5568
5733
|
"partners",
|