@danielarndt0/cnpj-db-loader 2.2.0 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -1
- package/dist/cli.js +173 -8
- package/dist/cli.js.map +1 -1
- package/dist/index.d.ts +1 -0
- package/dist/index.js +173 -8
- package/dist/index.js.map +1 -1
- package/docs/architecture.md +1 -1
- package/docs/commands.md +1 -1
- package/docs/usage.md +1 -0
- package/package.json +11 -1
package/README.md
CHANGED
|
@@ -17,6 +17,7 @@ This version focuses on the real loading workflow:
|
|
|
17
17
|
- exact preparatory scanning for total rows and total batches before import starts
|
|
18
18
|
- persisted import plans reused on resume for the same validated input and batch size
|
|
19
19
|
- staged bulk loads for the large datasets through PostgreSQL COPY
|
|
20
|
+
- automatic materialization of `establishment_secondary_cnaes` from establishment secondary CNAE data
|
|
20
21
|
- direct final-schema upserts for the smaller domain datasets
|
|
21
22
|
- checkpoint-based resume by file and byte offset
|
|
22
23
|
- row quarantine for invalid or constraint-breaking records without stopping the import
|
|
@@ -95,7 +96,7 @@ The import internals are now split into dedicated modules such as planner, sourc
|
|
|
95
96
|
|
|
96
97
|
The CLI now exposes a split workflow as well: `import` runs the full pipeline, `import load` stops after staging/direct writes, `import materialize` resumes from the saved plan and pushes staged rows into the final tables, and `database cleanup ...` exposes safe maintenance commands for staging tables, simplified final materialized tables, checkpoints, and saved plans.
|
|
97
98
|
|
|
98
|
-
Materialization progress is now checkpointed separately from file-load checkpoints, and the materializer works in resumable chunks controlled by `--materialize-batch-size`. During long final materialization steps, the CLI keeps the live progress output on a dedicated MATERIALIZING stage while reducing per-chunk checkpoint and JSONL write overhead so resumable chunks stay fast. The simplified final schema keeps raw secondary CNAE text in establishments and
|
|
99
|
+
Materialization progress is now checkpointed separately from file-load checkpoints, and the materializer works in resumable chunks controlled by `--materialize-batch-size`. During long final materialization steps, the CLI keeps the live progress output on a dedicated MATERIALIZING stage while reducing per-chunk checkpoint and JSONL write overhead so resumable chunks stay fast. The simplified final schema keeps raw secondary CNAE text in establishments and also materializes `establishment_secondary_cnaes` so APIs can query one row per secondary CNAE without running a separate backfill script.
|
|
99
100
|
|
|
100
101
|
The Federal Revenue commands write the same structured command logs and keep the remote-download phase outside the import internals. Existing completed ZIP files are skipped by default, temporary `.part` files are used while downloads are still in progress, and each reference keeps a local manifest for `status`, `retry`, `clean`, and future runner automation.
|
|
101
102
|
|
package/dist/cli.js
CHANGED
|
@@ -1512,8 +1512,14 @@ function maskDatabaseLabel(url) {
|
|
|
1512
1512
|
|
|
1513
1513
|
// src/services/database/cleanup.ts
|
|
1514
1514
|
var MATERIALIZED_DATASET_TABLES = {
|
|
1515
|
-
companies: [
|
|
1516
|
-
|
|
1515
|
+
companies: [
|
|
1516
|
+
"simples_options",
|
|
1517
|
+
"partners",
|
|
1518
|
+
"establishment_secondary_cnaes",
|
|
1519
|
+
"establishments",
|
|
1520
|
+
"companies"
|
|
1521
|
+
],
|
|
1522
|
+
establishments: ["establishment_secondary_cnaes", "establishments"],
|
|
1517
1523
|
partners: ["partners"],
|
|
1518
1524
|
simples_options: ["simples_options"]
|
|
1519
1525
|
};
|
|
@@ -1557,6 +1563,22 @@ function collectMaterializedTables(dataset) {
|
|
|
1557
1563
|
}
|
|
1558
1564
|
return [...orderedTables];
|
|
1559
1565
|
}
|
|
1566
|
+
async function tableExists3(client, tableName) {
|
|
1567
|
+
const result = await client.query(
|
|
1568
|
+
"select to_regclass(current_schema() || '.' || $1) as exists",
|
|
1569
|
+
[tableName]
|
|
1570
|
+
);
|
|
1571
|
+
return Boolean(result.rows[0]?.exists);
|
|
1572
|
+
}
|
|
1573
|
+
async function filterExistingTables(client, tableNames) {
|
|
1574
|
+
const existingTables = [];
|
|
1575
|
+
for (const tableName of tableNames) {
|
|
1576
|
+
if (await tableExists3(client, tableName)) {
|
|
1577
|
+
existingTables.push(tableName);
|
|
1578
|
+
}
|
|
1579
|
+
}
|
|
1580
|
+
return existingTables;
|
|
1581
|
+
}
|
|
1560
1582
|
async function deleteLoadCheckpoints(client, dataset) {
|
|
1561
1583
|
await ensureCheckpointTable(client);
|
|
1562
1584
|
const result = dataset ? await client.query(`delete from import_checkpoints where dataset = $1`, [
|
|
@@ -1662,7 +1684,10 @@ async function cleanupDatabaseMaterializedTables(client, input2) {
|
|
|
1662
1684
|
targetDatabase,
|
|
1663
1685
|
dataset: input2.dataset
|
|
1664
1686
|
});
|
|
1665
|
-
const tableNames =
|
|
1687
|
+
const tableNames = await filterExistingTables(
|
|
1688
|
+
client,
|
|
1689
|
+
collectMaterializedTables(input2.dataset)
|
|
1690
|
+
);
|
|
1666
1691
|
if (tableNames.length > 0) {
|
|
1667
1692
|
await client.query(`truncate ${tableNames.join(", ")}`);
|
|
1668
1693
|
}
|
|
@@ -2261,6 +2286,7 @@ function createIndexesSql() {
|
|
|
2261
2286
|
return [
|
|
2262
2287
|
"-- Operational indexes",
|
|
2263
2288
|
"create index if not exists idx_establishments_cnpj_root on establishments (cnpj_root);",
|
|
2289
|
+
"create index if not exists idx_establishment_secondary_cnaes_cnae_code on establishment_secondary_cnaes (cnae_code);",
|
|
2264
2290
|
"create index if not exists idx_partners_cnpj_root on partners (cnpj_root);",
|
|
2265
2291
|
"create index if not exists idx_import_plans_status on import_plans (status);",
|
|
2266
2292
|
"create index if not exists idx_import_plans_load_status on import_plans (load_status);",
|
|
@@ -2314,6 +2340,15 @@ function createPartnersSql() {
|
|
|
2314
2340
|
");"
|
|
2315
2341
|
].join("\n");
|
|
2316
2342
|
}
|
|
2343
|
+
function createEstablishmentSecondaryCnaesSql() {
|
|
2344
|
+
return [
|
|
2345
|
+
"create table if not exists establishment_secondary_cnaes (",
|
|
2346
|
+
" cnpj_full text not null,",
|
|
2347
|
+
" cnae_code text not null,",
|
|
2348
|
+
" primary key (cnpj_full, cnae_code)",
|
|
2349
|
+
");"
|
|
2350
|
+
].join("\n");
|
|
2351
|
+
}
|
|
2317
2352
|
function createSimplesSql() {
|
|
2318
2353
|
return [
|
|
2319
2354
|
"create table if not exists simples_options (",
|
|
@@ -2331,6 +2366,7 @@ function createOperationalSchemaParts() {
|
|
|
2331
2366
|
"-- Final operational tables (simplified for fast first-load materialization)",
|
|
2332
2367
|
createCompaniesSql(),
|
|
2333
2368
|
createEstablishmentsSql(),
|
|
2369
|
+
createEstablishmentSecondaryCnaesSql(),
|
|
2334
2370
|
createPartnersSql(),
|
|
2335
2371
|
createSimplesSql()
|
|
2336
2372
|
];
|
|
@@ -4279,6 +4315,68 @@ function buildChunkInsertSql(input2) {
|
|
|
4279
4315
|
values: [input2.lastStagingId, input2.chunkSize]
|
|
4280
4316
|
};
|
|
4281
4317
|
}
|
|
4318
|
+
function buildEstablishmentsChunkInsertSql(input2) {
|
|
4319
|
+
const chunkSelectList = [
|
|
4320
|
+
"source.staging_id",
|
|
4321
|
+
...input2.selectColumns.map((column) => `source.${column}`),
|
|
4322
|
+
`${buildEstablishmentCnpjFullExpression("source")} as cnpj_full`
|
|
4323
|
+
].join(",\n ");
|
|
4324
|
+
const insertSelectList = input2.insertColumns.join(", ");
|
|
4325
|
+
const secondaryCnaesCtes = input2.includeSecondaryCnaesTable ? [
|
|
4326
|
+
",",
|
|
4327
|
+
"deleted_secondary_cnaes as (",
|
|
4328
|
+
" delete from establishment_secondary_cnaes target",
|
|
4329
|
+
" using (select distinct cnpj_full from inserted_establishments) source_keys",
|
|
4330
|
+
" where target.cnpj_full = source_keys.cnpj_full",
|
|
4331
|
+
" returning 1",
|
|
4332
|
+
"),",
|
|
4333
|
+
"secondary_cnaes_source as (",
|
|
4334
|
+
" select distinct",
|
|
4335
|
+
" chunked.cnpj_full,",
|
|
4336
|
+
" btrim(cnae_code) as cnae_code",
|
|
4337
|
+
" from chunked",
|
|
4338
|
+
" inner join inserted_establishments inserted",
|
|
4339
|
+
" on inserted.cnpj_full = chunked.cnpj_full",
|
|
4340
|
+
" cross join lateral unnest(string_to_array(chunked.secondary_cnaes_raw, ',')) as cnae_code",
|
|
4341
|
+
" where chunked.secondary_cnaes_raw is not null",
|
|
4342
|
+
" and chunked.secondary_cnaes_raw <> ''",
|
|
4343
|
+
" and btrim(cnae_code) <> ''",
|
|
4344
|
+
"),",
|
|
4345
|
+
"inserted_secondary_cnaes as (",
|
|
4346
|
+
" insert into establishment_secondary_cnaes (cnpj_full, cnae_code)",
|
|
4347
|
+
" select cnpj_full, cnae_code",
|
|
4348
|
+
" from secondary_cnaes_source",
|
|
4349
|
+
" on conflict (cnpj_full, cnae_code) do nothing",
|
|
4350
|
+
" returning 1",
|
|
4351
|
+
")"
|
|
4352
|
+
] : [];
|
|
4353
|
+
return {
|
|
4354
|
+
text: [
|
|
4355
|
+
"with chunked as (",
|
|
4356
|
+
` select
|
|
4357
|
+
${chunkSelectList}`,
|
|
4358
|
+
" from staging_establishments source",
|
|
4359
|
+
" where source.staging_id > $1",
|
|
4360
|
+
" order by source.staging_id asc",
|
|
4361
|
+
" limit $2",
|
|
4362
|
+
"),",
|
|
4363
|
+
"inserted_establishments as (",
|
|
4364
|
+
` insert into establishments (${input2.insertColumns.join(", ")})`,
|
|
4365
|
+
` select ${insertSelectList}`,
|
|
4366
|
+
" from chunked",
|
|
4367
|
+
...input2.conflictClause ? [input2.conflictClause] : [],
|
|
4368
|
+
" returning cnpj_full",
|
|
4369
|
+
")",
|
|
4370
|
+
...secondaryCnaesCtes,
|
|
4371
|
+
"select",
|
|
4372
|
+
" coalesce(max(staging_id), $1::bigint)::bigint as max_staging_id,",
|
|
4373
|
+
" count(*)::bigint as source_rows,",
|
|
4374
|
+
" count(*)::bigint as affected_rows",
|
|
4375
|
+
"from chunked;"
|
|
4376
|
+
].join("\n"),
|
|
4377
|
+
values: [input2.lastStagingId, input2.chunkSize]
|
|
4378
|
+
};
|
|
4379
|
+
}
|
|
4282
4380
|
function buildPartnersChunkInsertSql(input2) {
|
|
4283
4381
|
const baseColumns = MATERIALIZATION_COLUMNS.partners;
|
|
4284
4382
|
const chunkSelectList = [
|
|
@@ -4356,9 +4454,7 @@ function buildMaterializationChunkQuery(input2) {
|
|
|
4356
4454
|
});
|
|
4357
4455
|
case "establishments": {
|
|
4358
4456
|
const insertColumns = input2.schemaCapabilities.includeEstablishmentCnpjFullInInsert ? [...baseColumns, "cnpj_full"] : [...baseColumns];
|
|
4359
|
-
return
|
|
4360
|
-
stagingTable: "staging_establishments",
|
|
4361
|
-
targetTable: "establishments",
|
|
4457
|
+
return buildEstablishmentsChunkInsertSql({
|
|
4362
4458
|
insertColumns,
|
|
4363
4459
|
selectColumns: baseColumns,
|
|
4364
4460
|
conflictClause: useConflictClause ? getConflictClause(
|
|
@@ -4368,7 +4464,7 @@ function buildMaterializationChunkQuery(input2) {
|
|
|
4368
4464
|
) : "",
|
|
4369
4465
|
lastStagingId: input2.lastStagingId,
|
|
4370
4466
|
chunkSize: input2.chunkSize,
|
|
4371
|
-
|
|
4467
|
+
includeSecondaryCnaesTable: input2.schemaCapabilities.includeEstablishmentSecondaryCnaesTable
|
|
4372
4468
|
});
|
|
4373
4469
|
}
|
|
4374
4470
|
case "simples_options":
|
|
@@ -4819,6 +4915,57 @@ async function validateDatasetCheckpoint(input2) {
|
|
|
4819
4915
|
targetRows
|
|
4820
4916
|
};
|
|
4821
4917
|
}
|
|
4918
|
+
async function readEstablishmentSecondaryCnaesCount(client) {
|
|
4919
|
+
const result = await client.query(
|
|
4920
|
+
`select count(*)::bigint as total_count from establishment_secondary_cnaes`
|
|
4921
|
+
);
|
|
4922
|
+
return Number.parseInt(result.rows[0]?.total_count ?? "0", 10);
|
|
4923
|
+
}
|
|
4924
|
+
async function hasEstablishmentsWithSecondaryCnaes(client) {
|
|
4925
|
+
const result = await client.query(
|
|
4926
|
+
`select exists (
|
|
4927
|
+
select 1
|
|
4928
|
+
from establishments
|
|
4929
|
+
where secondary_cnaes_raw is not null
|
|
4930
|
+
and secondary_cnaes_raw <> ''
|
|
4931
|
+
limit 1
|
|
4932
|
+
) as exists`
|
|
4933
|
+
);
|
|
4934
|
+
return result.rows[0]?.exists ?? false;
|
|
4935
|
+
}
|
|
4936
|
+
async function backfillEstablishmentSecondaryCnaesFromFinal(input2) {
|
|
4937
|
+
const existingRows = await readEstablishmentSecondaryCnaesCount(input2.client);
|
|
4938
|
+
if (existingRows > 0) {
|
|
4939
|
+
return 0;
|
|
4940
|
+
}
|
|
4941
|
+
if (!await hasEstablishmentsWithSecondaryCnaes(input2.client)) {
|
|
4942
|
+
return 0;
|
|
4943
|
+
}
|
|
4944
|
+
const startedAt = performance4.now();
|
|
4945
|
+
const result = await input2.client.query(
|
|
4946
|
+
`insert into establishment_secondary_cnaes (cnpj_full, cnae_code)
|
|
4947
|
+
select distinct
|
|
4948
|
+
e.cnpj_full,
|
|
4949
|
+
btrim(cnae_code) as cnae_code
|
|
4950
|
+
from establishments e
|
|
4951
|
+
cross join lateral unnest(
|
|
4952
|
+
string_to_array(e.secondary_cnaes_raw, ',')
|
|
4953
|
+
) as cnae_code
|
|
4954
|
+
where e.secondary_cnaes_raw is not null
|
|
4955
|
+
and e.secondary_cnaes_raw <> ''
|
|
4956
|
+
and btrim(cnae_code) <> ''
|
|
4957
|
+
on conflict (cnpj_full, cnae_code) do nothing`
|
|
4958
|
+
);
|
|
4959
|
+
const insertedRows = result.rowCount ?? 0;
|
|
4960
|
+
await appendJsonLinesLog(input2.progressLogPath, {
|
|
4961
|
+
kind: "establishment_secondary_cnaes_backfilled",
|
|
4962
|
+
targetTable: "establishment_secondary_cnaes",
|
|
4963
|
+
insertedRows,
|
|
4964
|
+
durationMs: performance4.now() - startedAt,
|
|
4965
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
4966
|
+
});
|
|
4967
|
+
return insertedRows;
|
|
4968
|
+
}
|
|
4822
4969
|
async function materializeDatasetByChunks(input2) {
|
|
4823
4970
|
const startedAt = performance4.now();
|
|
4824
4971
|
const validationReason = "Validating the live staging row count and staging cursor before resuming materialization.";
|
|
@@ -5265,6 +5412,12 @@ async function materializeStagedDatasets(input2) {
|
|
|
5265
5412
|
totalBatches: input2.totalBatches,
|
|
5266
5413
|
completedDatasets: summary.datasets.length
|
|
5267
5414
|
});
|
|
5415
|
+
if (dataset === "establishments" && input2.schemaCapabilities.includeEstablishmentSecondaryCnaesTable) {
|
|
5416
|
+
await backfillEstablishmentSecondaryCnaesFromFinal({
|
|
5417
|
+
client: input2.client,
|
|
5418
|
+
progressLogPath: input2.progressLogPath
|
|
5419
|
+
});
|
|
5420
|
+
}
|
|
5268
5421
|
const tracker = input2.datasetPerformanceTrackers.get(dataset);
|
|
5269
5422
|
if (tracker) {
|
|
5270
5423
|
tracker.materializationDurationMs += result.durationMs;
|
|
@@ -5488,6 +5641,12 @@ function canInsertIntoColumn(rows, tableName, columnName) {
|
|
|
5488
5641
|
}
|
|
5489
5642
|
return row.is_generated.toUpperCase() !== "ALWAYS";
|
|
5490
5643
|
}
|
|
5644
|
+
function hasRequiredColumns(rows, tableName, columnNames) {
|
|
5645
|
+
const availableColumns = new Set(
|
|
5646
|
+
rows.filter((item) => item.table_name === tableName).map((item) => item.column_name)
|
|
5647
|
+
);
|
|
5648
|
+
return columnNames.every((columnName) => availableColumns.has(columnName));
|
|
5649
|
+
}
|
|
5491
5650
|
async function detectImportSchemaCapabilities(client) {
|
|
5492
5651
|
const [columnResult, lookupConstraintResult] = await Promise.all([
|
|
5493
5652
|
client.query(
|
|
@@ -5496,6 +5655,7 @@ async function detectImportSchemaCapabilities(client) {
|
|
|
5496
5655
|
where table_schema = current_schema()
|
|
5497
5656
|
and (
|
|
5498
5657
|
(table_name = 'establishments' and column_name = 'cnpj_full') or
|
|
5658
|
+
(table_name = 'establishment_secondary_cnaes' and column_name in ('cnpj_full', 'cnae_code')) or
|
|
5499
5659
|
(table_name = 'partners' and column_name = 'partner_dedupe_key')
|
|
5500
5660
|
)`
|
|
5501
5661
|
),
|
|
@@ -5508,7 +5668,7 @@ async function detectImportSchemaCapabilities(client) {
|
|
|
5508
5668
|
inner join pg_class target_table on target_table.oid = constraint_item.confrelid
|
|
5509
5669
|
where constraint_item.contype = 'f'
|
|
5510
5670
|
and source_namespace.nspname = current_schema()
|
|
5511
|
-
and source_table.relname in ('companies', 'establishments', 'partners')
|
|
5671
|
+
and source_table.relname in ('companies', 'establishments', 'partners', 'establishment_secondary_cnaes')
|
|
5512
5672
|
and target_table.relname in (
|
|
5513
5673
|
'countries',
|
|
5514
5674
|
'cities',
|
|
@@ -5531,6 +5691,11 @@ async function detectImportSchemaCapabilities(client) {
|
|
|
5531
5691
|
"establishments",
|
|
5532
5692
|
"cnpj_full"
|
|
5533
5693
|
),
|
|
5694
|
+
includeEstablishmentSecondaryCnaesTable: hasRequiredColumns(
|
|
5695
|
+
columnResult.rows,
|
|
5696
|
+
"establishment_secondary_cnaes",
|
|
5697
|
+
["cnpj_full", "cnae_code"]
|
|
5698
|
+
),
|
|
5534
5699
|
includePartnerDedupeKeyInInsert: canInsertIntoColumn(
|
|
5535
5700
|
columnResult.rows,
|
|
5536
5701
|
"partners",
|