@danielarndt0/cnpj-db-loader 2.4.0-beta.2 → 2.4.0-beta.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -8411,6 +8411,18 @@ var STAGING_TABLE_BY_DATASET3 = {
8411
8411
  partners: "staging_partners",
8412
8412
  simples_options: "staging_simples_options"
8413
8413
  };
8414
+ var STEP_ORDER = [
8415
+ "setup",
8416
+ "load-domains",
8417
+ "load-companies",
8418
+ "load-establishments",
8419
+ "load-partners",
8420
+ "load-simples",
8421
+ "materialize",
8422
+ "materialize-secondary-cnaes",
8423
+ "indexes",
8424
+ "analyze"
8425
+ ];
8414
8426
  function quoteSqlLiteral(value) {
8415
8427
  return `'${value.replace(/'/g, "''")}'`;
8416
8428
  }
@@ -8428,6 +8440,9 @@ function receitaCopyCommand(tableName, columns, filePath) {
8428
8440
  const normalizedFilePath = normalizePathForPsql(filePath);
8429
8441
  return `\\copy ${tableName} (${columns.join(", ")}) from ${quoteSqlLiteral(normalizedFilePath)} with (format csv, header false, delimiter ';', quote '"', escape '"')`;
8430
8442
  }
8443
+ function echo(message) {
8444
+ return `\\echo ${quoteSqlLiteral(message)}`;
8445
+ }
8431
8446
  function datasetColumns(dataset) {
8432
8447
  return DATASET_LAYOUTS[dataset].fields.map((field) => field.columnName);
8433
8448
  }
@@ -8454,7 +8469,7 @@ function partnerDedupeExpression(alias) {
8454
8469
  function materializeCompaniesSql() {
8455
8470
  const columns = companiesLayout.fields.map((field) => field.columnName);
8456
8471
  return [
8457
- "\\echo 'Materializing companies...'",
8472
+ echo("[materialize] Materializing companies..."),
8458
8473
  "with source as (",
8459
8474
  " select",
8460
8475
  ` ${columns.map((column) => `source.${column}`).join(",\n ")},`,
@@ -8468,7 +8483,8 @@ function materializeCompaniesSql() {
8468
8483
  `select ${columns.join(", ")}`,
8469
8484
  "from deduped",
8470
8485
  "on conflict (cnpj_root) do update set",
8471
- ` ${updateAssignments(columns, ["cnpj_root"])};`
8486
+ ` ${updateAssignments(columns, ["cnpj_root"])};`,
8487
+ echo("[materialize] Companies materialization completed.")
8472
8488
  ].join("\n");
8473
8489
  }
8474
8490
  function materializeEstablishmentsSql() {
@@ -8477,7 +8493,7 @@ function materializeEstablishmentsSql() {
8477
8493
  );
8478
8494
  const insertColumns = [...baseColumns, "cnpj_full"];
8479
8495
  return [
8480
- "\\echo 'Materializing establishments and secondary CNAEs...'",
8496
+ echo("[materialize] Materializing establishments..."),
8481
8497
  "with source as (",
8482
8498
  " select",
8483
8499
  ` ${baseColumns.map((column) => `source.${column}`).join(",\n ")},`,
@@ -8487,14 +8503,29 @@ function materializeEstablishmentsSql() {
8487
8503
  "),",
8488
8504
  "deduped as (",
8489
8505
  " select * from source where dedupe_rank = 1",
8506
+ ")",
8507
+ `insert into establishments (${insertColumns.join(", ")})`,
8508
+ `select ${insertColumns.join(", ")}`,
8509
+ "from deduped",
8510
+ "on conflict (cnpj_full) do update set",
8511
+ ` ${updateAssignments(insertColumns, ["cnpj_root", "cnpj_order", "cnpj_check_digits", "cnpj_full"])};`,
8512
+ echo("[materialize] Establishments materialization completed.")
8513
+ ].join("\n");
8514
+ }
8515
+ function materializeSecondaryCnaesSql() {
8516
+ return [
8517
+ echo(
8518
+ "[materialize-secondary-cnaes] Materializing establishment secondary CNAEs..."
8519
+ ),
8520
+ "with source as (",
8521
+ " select",
8522
+ " staging.cnpj_root || staging.cnpj_order || staging.cnpj_check_digits as cnpj_full,",
8523
+ " staging.secondary_cnaes_raw,",
8524
+ " row_number() over (partition by staging.cnpj_root || staging.cnpj_order || staging.cnpj_check_digits order by staging.staging_id desc) as dedupe_rank",
8525
+ " from staging_establishments staging",
8490
8526
  "),",
8491
- "upserted as (",
8492
- ` insert into establishments (${insertColumns.join(", ")})`,
8493
- ` select ${insertColumns.join(", ")}`,
8494
- " from deduped",
8495
- " on conflict (cnpj_full) do update set",
8496
- ` ${updateAssignments(insertColumns, ["cnpj_root", "cnpj_order", "cnpj_check_digits", "cnpj_full"])}`,
8497
- " returning cnpj_full",
8527
+ "deduped as (",
8528
+ " select * from source where dedupe_rank = 1",
8498
8529
  "),",
8499
8530
  "deleted_secondary_cnaes as (",
8500
8531
  " delete from establishment_secondary_cnaes target",
@@ -8515,14 +8546,17 @@ function materializeEstablishmentsSql() {
8515
8546
  "insert into establishment_secondary_cnaes (cnpj_full, cnae_code)",
8516
8547
  "select cnpj_full, cnae_code",
8517
8548
  "from secondary_cnaes_source",
8518
- "on conflict (cnpj_full, cnae_code) do nothing;"
8549
+ "on conflict (cnpj_full, cnae_code) do nothing;",
8550
+ echo(
8551
+ "[materialize-secondary-cnaes] Secondary CNAEs materialization completed."
8552
+ )
8519
8553
  ].join("\n");
8520
8554
  }
8521
8555
  function materializePartnersSql() {
8522
8556
  const baseColumns = partnersLayout.fields.map((field) => field.columnName);
8523
8557
  const insertColumns = [...baseColumns, "partner_dedupe_key"];
8524
8558
  return [
8525
- "\\echo 'Materializing partners...'",
8559
+ echo("[materialize] Materializing partners..."),
8526
8560
  "with source as (",
8527
8561
  " select",
8528
8562
  ` ${baseColumns.map((column) => `source.${column}`).join(",\n ")},`,
@@ -8542,13 +8576,14 @@ function materializePartnersSql() {
8542
8576
  `select ${insertColumns.join(", ")}`,
8543
8577
  "from deduped",
8544
8578
  "on conflict (partner_dedupe_key) do update set",
8545
- ` ${updateAssignments(insertColumns, ["partner_dedupe_key"])};`
8579
+ ` ${updateAssignments(insertColumns, ["partner_dedupe_key"])};`,
8580
+ echo("[materialize] Partners materialization completed.")
8546
8581
  ].join("\n");
8547
8582
  }
8548
8583
  function materializeSimplesSql() {
8549
8584
  const columns = simplesLayout.fields.map((field) => field.columnName);
8550
8585
  return [
8551
- "\\echo 'Materializing simples options...'",
8586
+ echo("[materialize] Materializing simples options..."),
8552
8587
  "with source as (",
8553
8588
  " select",
8554
8589
  ` ${columns.map((column) => `source.${column}`).join(",\n ")},`,
@@ -8562,7 +8597,8 @@ function materializeSimplesSql() {
8562
8597
  `select ${columns.join(", ")}`,
8563
8598
  "from deduped",
8564
8599
  "on conflict (cnpj_root) do update set",
8565
- ` ${updateAssignments(columns, ["cnpj_root"])};`
8600
+ ` ${updateAssignments(columns, ["cnpj_root"])};`,
8601
+ echo("[materialize] Simples options materialization completed.")
8566
8602
  ].join("\n");
8567
8603
  }
8568
8604
  function copyDomainSql(dataset, files) {
@@ -8572,12 +8608,20 @@ function copyDomainSql(dataset, files) {
8572
8608
  const columns = datasetColumns(dataset);
8573
8609
  const tempTable = `tmp_hybrid_${dataset}`;
8574
8610
  const lines = [
8575
- `\\echo 'Loading ${dataset} lookup data...'`,
8611
+ echo(`[load-domains] Loading ${dataset} lookup data...`),
8576
8612
  `drop table if exists ${tempTable};`,
8577
8613
  `create temporary table ${tempTable} (code text, description text);`
8578
8614
  ];
8579
- for (const file of files) {
8580
- lines.push(csvCopyCommand(tempTable, columns, file.absolutePath));
8615
+ for (const [index, file] of files.entries()) {
8616
+ lines.push(
8617
+ echo(
8618
+ `[load-domains] Loading ${dataset} file ${index + 1} of ${files.length}: ${file.relativePath}`
8619
+ ),
8620
+ csvCopyCommand(tempTable, columns, file.absolutePath),
8621
+ echo(
8622
+ `[load-domains] Loaded ${dataset} file ${index + 1} of ${files.length}.`
8623
+ )
8624
+ );
8581
8625
  }
8582
8626
  lines.push(
8583
8627
  `insert into ${dataset} (${columns.join(", ")})`,
@@ -8598,12 +8642,17 @@ function copyStagingSql(dataset, files) {
8598
8642
  return [];
8599
8643
  }
8600
8644
  const columns = datasetColumns(dataset);
8601
- return [
8602
- `\\echo 'Loading ${dataset} staging data...'`,
8603
- ...files.map(
8604
- (file) => csvCopyCommand(tableName, columns, file.absolutePath)
8605
- )
8606
- ];
8645
+ const lines = [echo(`[load-${dataset}] Loading ${dataset} staging data...`)];
8646
+ for (const [index, file] of files.entries()) {
8647
+ lines.push(
8648
+ echo(
8649
+ `[load-${dataset}] Loading file ${index + 1} of ${files.length}: ${file.relativePath}`
8650
+ ),
8651
+ csvCopyCommand(tableName, columns, file.absolutePath),
8652
+ echo(`[load-${dataset}] Loaded file ${index + 1} of ${files.length}.`)
8653
+ );
8654
+ }
8655
+ return lines;
8607
8656
  }
8608
8657
  function csvFilesByDataset(files) {
8609
8658
  const grouped = {};
@@ -8629,7 +8678,9 @@ function rawTableName(dataset) {
8629
8678
  function createRawTempTableSql(dataset) {
8630
8679
  const columns = DATASET_LAYOUTS[dataset].fields.map((field) => ` ${quoteIdentifier(field.columnName)} text`).join(",\n");
8631
8680
  return [
8681
+ "set client_min_messages to warning;",
8632
8682
  `drop table if exists ${rawTableName(dataset)};`,
8683
+ "reset client_min_messages;",
8633
8684
  `create temporary table ${rawTableName(dataset)} (`,
8634
8685
  columns,
8635
8686
  ");"
@@ -8711,11 +8762,21 @@ function rawDomainSql(dataset, files) {
8711
8762
  const columns = layout.fields.map((field) => field.columnName);
8712
8763
  const tableName = rawTableName(dataset);
8713
8764
  const lines = [
8714
- `\\echo 'Loading ${dataset} lookup data directly from sanitized Receita files...'`,
8765
+ echo(
8766
+ `[load-domains] Loading ${dataset} lookup data directly from sanitized Receita files...`
8767
+ ),
8715
8768
  createRawTempTableSql(dataset)
8716
8769
  ];
8717
- for (const file of files) {
8718
- lines.push(receitaCopyCommand(tableName, columns, file.absolutePath));
8770
+ for (const [index, file] of files.entries()) {
8771
+ lines.push(
8772
+ echo(
8773
+ `[load-domains] Loading ${dataset} file ${index + 1} of ${files.length}: ${file.relativePath}`
8774
+ ),
8775
+ receitaCopyCommand(tableName, columns, file.absolutePath),
8776
+ echo(
8777
+ `[load-domains] Loaded ${dataset} file ${index + 1} of ${files.length}.`
8778
+ )
8779
+ );
8719
8780
  }
8720
8781
  lines.push(
8721
8782
  `insert into ${dataset} (${columns.join(", ")})`,
@@ -8725,7 +8786,8 @@ function rawDomainSql(dataset, files) {
8725
8786
  `from ${tableName}`,
8726
8787
  "where nullif(btrim(code), '') is not null",
8727
8788
  "order by code",
8728
- "on conflict (code) do update set description = excluded.description;"
8789
+ "on conflict (code) do update set description = excluded.description;",
8790
+ echo(`[load-domains] ${dataset} lookup data completed.`)
8729
8791
  );
8730
8792
  return lines;
8731
8793
  }
@@ -8744,70 +8806,363 @@ function rawStagingSql(dataset, files) {
8744
8806
  const expressions = layout.fields.map(
8745
8807
  (field) => ` ${fieldExpression(dataset, field, alias)} as ${field.columnName}`
8746
8808
  );
8809
+ const stepName = loadStepName(dataset);
8747
8810
  const lines = [
8748
- `\\echo 'Loading ${dataset} staging data directly from sanitized Receita files...'`,
8811
+ echo(
8812
+ `[${stepName}] Loading ${dataset} staging data directly from sanitized Receita files...`
8813
+ ),
8814
+ `truncate table ${targetTable} restart identity;`,
8749
8815
  createRawTempTableSql(dataset)
8750
8816
  ];
8751
- for (const file of files) {
8752
- lines.push(receitaCopyCommand(tableName, columns, file.absolutePath));
8817
+ for (const [index, file] of files.entries()) {
8818
+ lines.push(
8819
+ echo(
8820
+ `[${stepName}] Loading file ${index + 1} of ${files.length}: ${file.relativePath}`
8821
+ ),
8822
+ receitaCopyCommand(tableName, columns, file.absolutePath),
8823
+ echo(`[${stepName}] Loaded file ${index + 1} of ${files.length}.`)
8824
+ );
8753
8825
  }
8754
8826
  lines.push(
8827
+ echo(
8828
+ `[${stepName}] Transforming ${dataset} raw rows into ${targetTable}...`
8829
+ ),
8755
8830
  `insert into ${targetTable} (${columns.join(", ")})`,
8756
8831
  "select",
8757
8832
  expressions.join(",\n"),
8758
- `from ${tableName} ${alias};`
8833
+ `from ${tableName} ${alias};`,
8834
+ echo(`[${stepName}] ${dataset} staging load completed.`)
8759
8835
  );
8760
8836
  return lines;
8761
8837
  }
8762
- function generatePostgresDirectImportScript(input) {
8763
- const grouped = csvFilesByDataset(input.files);
8764
- const lines = [
8765
- "-- CNPJ DB Loader hybrid PostgreSQL import script",
8766
- "-- Generated from PostgreSQL-ready CSV files exported by cnpj-db-loader postgres export-csv.",
8767
- "-- Execute with psql, for example:",
8768
- '-- psql "postgres://postgres:postgres@localhost:5432/cnpj" -f import-postgres-direct.sql',
8769
- "",
8838
+ function loadStepName(dataset) {
8839
+ switch (dataset) {
8840
+ case "companies":
8841
+ return "load-companies";
8842
+ case "establishments":
8843
+ return "load-establishments";
8844
+ case "partners":
8845
+ return "load-partners";
8846
+ case "simples_options":
8847
+ return "load-simples";
8848
+ default:
8849
+ return `load-${dataset}`;
8850
+ }
8851
+ }
8852
+ function scriptHeader(title, sourceEncoding) {
8853
+ return [
8854
+ `-- ${title}`,
8855
+ "-- Generated by cnpj-db-loader postgres generate-script.",
8770
8856
  "\\set ON_ERROR_STOP on",
8771
- "\\echo 'Starting CNPJ DB Loader hybrid PostgreSQL import...'",
8772
- "",
8773
- "begin;",
8774
- "",
8775
- "-- Keep the final schema and seed data managed by sql/schema.sql.",
8776
- "-- This script only resets staging tables and then upserts final data.",
8777
- "truncate table staging_companies restart identity;",
8778
- "truncate table staging_establishments restart identity;",
8779
- "truncate table staging_partners restart identity;",
8780
- "truncate table staging_simples_options restart identity;",
8857
+ ...sourceEncoding ? [
8858
+ echo(
8859
+ `Using source file encoding ${sourceEncoding} for psql copy operations...`
8860
+ ),
8861
+ `set client_encoding to ${quoteSqlLiteral(sourceEncoding)};`
8862
+ ] : [],
8781
8863
  ""
8782
8864
  ];
8783
- for (const dataset of DOMAIN_DATASETS) {
8784
- lines.push(...copyDomainSql(dataset, grouped[dataset] ?? []), "");
8865
+ }
8866
+ function wrapTransaction(lines, mode, shouldWrap) {
8867
+ if (!shouldWrap || mode !== "phase") {
8868
+ return [...lines];
8785
8869
  }
8786
- for (const dataset of STAGING_DATASETS) {
8787
- lines.push(...copyStagingSql(dataset, grouped[dataset] ?? []), "");
8870
+ return ["begin;", "", ...lines, "", "commit;"];
8871
+ }
8872
+ function buildStepScript(title, body, input, wrapInPhaseTransaction) {
8873
+ return [
8874
+ ...scriptHeader(title, input.sourceEncoding),
8875
+ ...wrapTransaction(body, input.transactionMode, wrapInPhaseTransaction),
8876
+ ""
8877
+ ].join("\n");
8878
+ }
8879
+ function includeSet(input) {
8880
+ const selected = new Set(input.include);
8881
+ if (input.skipIndexes) {
8882
+ selected.delete("indexes");
8788
8883
  }
8789
- lines.push(...materializationAndAnalyzeSql());
8790
- return lines.join("\n");
8884
+ if (input.skipAnalyze) {
8885
+ selected.delete("analyze");
8886
+ }
8887
+ return selected;
8888
+ }
8889
+ function hasAnyFinalMaterialization(selected) {
8890
+ return selected.has("companies") || selected.has("establishments") || selected.has("partners") || selected.has("simples");
8891
+ }
8892
+ function materializeSql(selected) {
8893
+ const lines = [echo("[materialize] Starting final table materialization...")];
8894
+ if (selected.has("companies")) {
8895
+ lines.push(materializeCompaniesSql(), "");
8896
+ }
8897
+ if (selected.has("establishments")) {
8898
+ lines.push(materializeEstablishmentsSql(), "");
8899
+ }
8900
+ if (selected.has("partners")) {
8901
+ lines.push(materializePartnersSql(), "");
8902
+ }
8903
+ if (selected.has("simples")) {
8904
+ lines.push(materializeSimplesSql(), "");
8905
+ }
8906
+ lines.push(echo("[materialize] Final table materialization completed."));
8907
+ return lines;
8908
+ }
8909
+ function indexesSql() {
8910
+ return [
8911
+ echo(
8912
+ "[indexes] No additional index operations are generated in this beta."
8913
+ ),
8914
+ "-- Indexes are expected to be managed by the schema generated by cnpj-db-loader schema generate.",
8915
+ "-- A future fast-rebuild mode may generate DROP/CREATE INDEX operations here."
8916
+ ];
8917
+ }
8918
+ function analyzeSql(selected) {
8919
+ const tables = /* @__PURE__ */ new Set();
8920
+ if (selected.has("companies")) {
8921
+ tables.add("companies");
8922
+ }
8923
+ if (selected.has("establishments")) {
8924
+ tables.add("establishments");
8925
+ }
8926
+ if (selected.has("secondary-cnaes")) {
8927
+ tables.add("establishment_secondary_cnaes");
8928
+ }
8929
+ if (selected.has("partners")) {
8930
+ tables.add("partners");
8931
+ }
8932
+ if (selected.has("simples")) {
8933
+ tables.add("simples_options");
8934
+ }
8935
+ if (selected.has("domains")) {
8936
+ for (const dataset of DOMAIN_DATASETS) {
8937
+ tables.add(dataset);
8938
+ }
8939
+ }
8940
+ return [
8941
+ echo("[analyze] Refreshing planner statistics..."),
8942
+ ...[...tables].map((table) => `analyze ${table};`),
8943
+ echo("[analyze] Planner statistics refreshed.")
8944
+ ];
8791
8945
  }
8792
- function generatePostgresSanitizedDirectImportScript(input) {
8946
+ function step(name, file, dependsOn, included) {
8947
+ return { name, file, dependsOn, included };
8948
+ }
8949
+ function generatePostgresDirectScriptFiles(input) {
8793
8950
  const grouped = directFilesByDataset(input.files);
8794
- const lines = [
8795
- "-- CNPJ DB Loader direct PostgreSQL import script",
8951
+ const selected = includeSet(input);
8952
+ if (!DOMAIN_DATASETS.some((dataset) => (grouped[dataset] ?? []).length > 0)) {
8953
+ selected.delete("domains");
8954
+ }
8955
+ if ((grouped.companies ?? []).length === 0) {
8956
+ selected.delete("companies");
8957
+ }
8958
+ if ((grouped.establishments ?? []).length === 0) {
8959
+ selected.delete("establishments");
8960
+ selected.delete("secondary-cnaes");
8961
+ }
8962
+ if ((grouped.partners ?? []).length === 0) {
8963
+ selected.delete("partners");
8964
+ }
8965
+ if ((grouped.simples_options ?? []).length === 0) {
8966
+ selected.delete("simples");
8967
+ }
8968
+ const scripts = {};
8969
+ const steps = [];
8970
+ const setupIncluded = true;
8971
+ steps.push(step("setup", "setup.sql", [], setupIncluded));
8972
+ scripts["setup.sql"] = [
8973
+ ...scriptHeader(
8974
+ "CNPJ DB Loader PostgreSQL direct import setup",
8975
+ input.sourceEncoding
8976
+ ),
8977
+ echo("[setup] Preparing PostgreSQL direct import session..."),
8978
+ "-- The database schema must be applied before running these scripts.",
8979
+ "-- This setup script configures the psql session used by the generated orchestrator.",
8980
+ echo("[setup] Setup completed."),
8981
+ ""
8982
+ ].join("\n");
8983
+ const domainsIncluded = selected.has("domains") && DOMAIN_DATASETS.some((dataset) => (grouped[dataset] ?? []).length > 0);
8984
+ steps.push(
8985
+ step("load-domains", "load-domains.sql", ["setup"], domainsIncluded)
8986
+ );
8987
+ if (domainsIncluded) {
8988
+ const lines = [echo("[load-domains] Starting domain tables load...")];
8989
+ for (const dataset of DOMAIN_DATASETS) {
8990
+ lines.push(...rawDomainSql(dataset, grouped[dataset] ?? []), "");
8991
+ }
8992
+ lines.push(echo("[load-domains] Domain tables load completed."));
8993
+ scripts["load-domains.sql"] = buildStepScript(
8994
+ "CNPJ DB Loader PostgreSQL direct import domains step",
8995
+ lines,
8996
+ input,
8997
+ true
8998
+ );
8999
+ }
9000
+ const datasetSteps = [
9001
+ {
9002
+ dataset: "companies",
9003
+ name: "load-companies",
9004
+ file: "load-companies.sql",
9005
+ include: "companies"
9006
+ },
9007
+ {
9008
+ dataset: "establishments",
9009
+ name: "load-establishments",
9010
+ file: "load-establishments.sql",
9011
+ include: "establishments"
9012
+ },
9013
+ {
9014
+ dataset: "partners",
9015
+ name: "load-partners",
9016
+ file: "load-partners.sql",
9017
+ include: "partners"
9018
+ },
9019
+ {
9020
+ dataset: "simples_options",
9021
+ name: "load-simples",
9022
+ file: "load-simples.sql",
9023
+ include: "simples"
9024
+ }
9025
+ ];
9026
+ for (const item of datasetSteps) {
9027
+ const files = grouped[item.dataset] ?? [];
9028
+ const included = selected.has(item.include) && files.length > 0;
9029
+ steps.push(step(item.name, item.file, ["setup"], included));
9030
+ if (included) {
9031
+ scripts[item.file] = buildStepScript(
9032
+ `CNPJ DB Loader PostgreSQL direct import ${item.name} step`,
9033
+ rawStagingSql(item.dataset, files),
9034
+ input,
9035
+ true
9036
+ );
9037
+ }
9038
+ }
9039
+ const materializeIncluded = hasAnyFinalMaterialization(selected);
9040
+ steps.push(
9041
+ step(
9042
+ "materialize",
9043
+ "materialize.sql",
9044
+ datasetSteps.filter((item) => selected.has(item.include)).map((item) => item.name),
9045
+ materializeIncluded
9046
+ )
9047
+ );
9048
+ if (materializeIncluded) {
9049
+ scripts["materialize.sql"] = buildStepScript(
9050
+ "CNPJ DB Loader PostgreSQL direct import materialization step",
9051
+ materializeSql(selected),
9052
+ input,
9053
+ true
9054
+ );
9055
+ }
9056
+ const secondaryIncluded = selected.has("secondary-cnaes") && selected.has("establishments");
9057
+ steps.push(
9058
+ step(
9059
+ "materialize-secondary-cnaes",
9060
+ "materialize-secondary-cnaes.sql",
9061
+ ["load-establishments"],
9062
+ secondaryIncluded
9063
+ )
9064
+ );
9065
+ if (secondaryIncluded) {
9066
+ scripts["materialize-secondary-cnaes.sql"] = buildStepScript(
9067
+ "CNPJ DB Loader PostgreSQL direct import secondary CNAEs step",
9068
+ [materializeSecondaryCnaesSql()],
9069
+ input,
9070
+ true
9071
+ );
9072
+ }
9073
+ const indexesIncluded = selected.has("indexes");
9074
+ steps.push(
9075
+ step(
9076
+ "indexes",
9077
+ "indexes.sql",
9078
+ materializeIncluded ? ["materialize"] : ["setup"],
9079
+ indexesIncluded
9080
+ )
9081
+ );
9082
+ if (indexesIncluded) {
9083
+ scripts["indexes.sql"] = buildStepScript(
9084
+ "CNPJ DB Loader PostgreSQL direct import indexes step",
9085
+ indexesSql(),
9086
+ input,
9087
+ true
9088
+ );
9089
+ }
9090
+ const analyzeIncluded = selected.has("analyze");
9091
+ const analyzeDependencies = [
9092
+ ...domainsIncluded ? ["load-domains"] : [],
9093
+ ...materializeIncluded ? ["materialize"] : [],
9094
+ ...secondaryIncluded ? ["materialize-secondary-cnaes"] : []
9095
+ ];
9096
+ steps.push(
9097
+ step(
9098
+ "analyze",
9099
+ "analyze.sql",
9100
+ analyzeDependencies.length > 0 ? analyzeDependencies : ["setup"],
9101
+ analyzeIncluded
9102
+ )
9103
+ );
9104
+ if (analyzeIncluded) {
9105
+ scripts["analyze.sql"] = buildStepScript(
9106
+ "CNPJ DB Loader PostgreSQL direct import analyze step",
9107
+ analyzeSql(selected),
9108
+ input,
9109
+ true
9110
+ );
9111
+ }
9112
+ const orchestratorLines = [
9113
+ "-- CNPJ DB Loader direct PostgreSQL import orchestrator",
8796
9114
  "-- Generated from sanitized Receita files by cnpj-db-loader postgres generate-script.",
8797
- "-- This path avoids rewriting the dataset into a second CSV tree.",
8798
9115
  "-- Execute with psql, for example:",
8799
- '-- psql "postgres://postgres:postgres@localhost:5432/cnpj" -f import-postgres-direct.sql',
9116
+ '-- psql -d "postgres://postgres:postgres@localhost:5432/cnpj" -f import-postgres-direct.sql',
8800
9117
  "",
8801
9118
  "\\set ON_ERROR_STOP on",
8802
- `\\echo 'Using source file encoding ${input.sourceEncoding} for psql copy operations...'`,
9119
+ echo(
9120
+ `Using source file encoding ${input.sourceEncoding} for psql copy operations...`
9121
+ ),
8803
9122
  `set client_encoding to ${quoteSqlLiteral(input.sourceEncoding)};`,
8804
- "\\echo 'Starting CNPJ DB Loader direct PostgreSQL import from sanitized files...'",
9123
+ echo(
9124
+ `Starting CNPJ DB Loader direct PostgreSQL import using transaction mode ${input.transactionMode}...`
9125
+ ),
9126
+ "",
9127
+ ...input.transactionMode === "single" ? ["begin;", ""] : []
9128
+ ];
9129
+ for (const name of STEP_ORDER) {
9130
+ const currentStep = steps.find((item) => item.name === name);
9131
+ if (!currentStep?.included) {
9132
+ continue;
9133
+ }
9134
+ orchestratorLines.push(
9135
+ echo(
9136
+ `[orchestrator] Running ${currentStep.name} (${currentStep.file})...`
9137
+ ),
9138
+ `\\ir ${currentStep.file}`,
9139
+ echo(`[orchestrator] Completed ${currentStep.name}.`),
9140
+ ""
9141
+ );
9142
+ }
9143
+ orchestratorLines.push(
9144
+ ...input.transactionMode === "single" ? ["commit;", ""] : [],
9145
+ echo("CNPJ DB Loader hybrid PostgreSQL import completed."),
9146
+ ""
9147
+ );
9148
+ scripts["import-postgres-direct.sql"] = orchestratorLines.join("\n");
9149
+ return { scripts, steps };
9150
+ }
9151
+ function generatePostgresDirectImportScript(input) {
9152
+ const grouped = csvFilesByDataset(input.files);
9153
+ const lines = [
9154
+ "-- CNPJ DB Loader hybrid PostgreSQL import script",
9155
+ "-- Generated from PostgreSQL-ready CSV files exported by cnpj-db-loader postgres export-csv.",
9156
+ "-- Execute with psql, for example:",
9157
+ '-- psql -d "postgres://postgres:postgres@localhost:5432/cnpj" -f import-postgres-direct.sql',
9158
+ "",
9159
+ "\\set ON_ERROR_STOP on",
9160
+ echo("Starting CNPJ DB Loader hybrid PostgreSQL import..."),
8805
9161
  "",
8806
9162
  "begin;",
8807
9163
  "",
8808
9164
  "-- Keep the final schema and seed data managed by sql/schema.sql.",
8809
- "-- This script copies sanitized Receita files into temporary raw tables,",
8810
- "-- transforms values inside PostgreSQL, resets staging tables and upserts final data.",
9165
+ "-- This script only resets staging tables and then upserts final data.",
8811
9166
  "truncate table staging_companies restart identity;",
8812
9167
  "truncate table staging_establishments restart identity;",
8813
9168
  "truncate table staging_partners restart identity;",
@@ -8815,10 +9170,10 @@ function generatePostgresSanitizedDirectImportScript(input) {
8815
9170
  ""
8816
9171
  ];
8817
9172
  for (const dataset of DOMAIN_DATASETS) {
8818
- lines.push(...rawDomainSql(dataset, grouped[dataset] ?? []), "");
9173
+ lines.push(...copyDomainSql(dataset, grouped[dataset] ?? []), "");
8819
9174
  }
8820
9175
  for (const dataset of STAGING_DATASETS) {
8821
- lines.push(...rawStagingSql(dataset, grouped[dataset] ?? []), "");
9176
+ lines.push(...copyStagingSql(dataset, grouped[dataset] ?? []), "");
8822
9177
  }
8823
9178
  lines.push(...materializationAndAnalyzeSql());
8824
9179
  return lines.join("\n");
@@ -8829,11 +9184,13 @@ function materializationAndAnalyzeSql() {
8829
9184
  "",
8830
9185
  materializeEstablishmentsSql(),
8831
9186
  "",
9187
+ materializeSecondaryCnaesSql(),
9188
+ "",
8832
9189
  materializePartnersSql(),
8833
9190
  "",
8834
9191
  materializeSimplesSql(),
8835
9192
  "",
8836
- "\\echo 'Refreshing planner statistics...'",
9193
+ echo("Refreshing planner statistics..."),
8837
9194
  "analyze companies;",
8838
9195
  "analyze establishments;",
8839
9196
  "analyze establishment_secondary_cnaes;",
@@ -8848,7 +9205,7 @@ function materializationAndAnalyzeSql() {
8848
9205
  "",
8849
9206
  "commit;",
8850
9207
  "",
8851
- "\\echo 'CNPJ DB Loader hybrid PostgreSQL import completed.'",
9208
+ echo("CNPJ DB Loader hybrid PostgreSQL import completed."),
8852
9209
  ""
8853
9210
  ];
8854
9211
  }
@@ -9055,6 +9412,29 @@ async function exportPostgresCsvDataset(inputPath, options = {}) {
9055
9412
  import { mkdir as mkdir9, stat as stat7, writeFile as writeFile6 } from "fs/promises";
9056
9413
  import path17 from "path";
9057
9414
  var DEFAULT_SOURCE_ENCODING = "UTF8";
9415
+ var DEFAULT_TRANSACTION_MODE = "single";
9416
+ var ALL_INCLUDE_TARGETS = [
9417
+ "domains",
9418
+ "companies",
9419
+ "establishments",
9420
+ "partners",
9421
+ "simples",
9422
+ "secondary-cnaes",
9423
+ "indexes",
9424
+ "analyze"
9425
+ ];
9426
+ var INCLUDE_TARGETS_BY_DATASET = {
9427
+ companies: "companies",
9428
+ establishments: "establishments",
9429
+ partners: "partners",
9430
+ simples_options: "simples",
9431
+ countries: "domains",
9432
+ cities: "domains",
9433
+ partner_qualifications: "domains",
9434
+ legal_natures: "domains",
9435
+ reasons: "domains",
9436
+ cnaes: "domains"
9437
+ };
9058
9438
  function defaultPostgresDirectOutputPath(inputPath) {
9059
9439
  const baseName = path17.basename(inputPath);
9060
9440
  if (baseName.toLowerCase() === "sanitized") {
@@ -9063,7 +9443,7 @@ function defaultPostgresDirectOutputPath(inputPath) {
9063
9443
  return path17.join(path17.dirname(inputPath), `${baseName}-postgres-direct`);
9064
9444
  }
9065
9445
  function inferNextStep5(scriptPath) {
9066
- return `psql "postgres://postgres:postgres@localhost:5432/cnpj" -f ${scriptPath.replace(/\\/g, "/")}`;
9446
+ return `psql -d "postgres://postgres:postgres@localhost:5432/cnpj" -f ${scriptPath.replace(/\\/g, "/")}`;
9067
9447
  }
9068
9448
  function normalizeSourceEncoding(value) {
9069
9449
  const encoding = (value ?? DEFAULT_SOURCE_ENCODING).trim();
@@ -9074,6 +9454,41 @@ function normalizeSourceEncoding(value) {
9074
9454
  }
9075
9455
  return encoding.toUpperCase();
9076
9456
  }
9457
+ function normalizeTransactionMode(value) {
9458
+ const mode = value ?? DEFAULT_TRANSACTION_MODE;
9459
+ if (!["single", "phase", "none"].includes(mode)) {
9460
+ throw new ValidationError(
9461
+ `Invalid transaction mode: ${String(value)}. Use single, phase or none.`
9462
+ );
9463
+ }
9464
+ return mode;
9465
+ }
9466
+ function isIncludeTarget(value) {
9467
+ return ALL_INCLUDE_TARGETS.includes(value);
9468
+ }
9469
+ function normalizeIncludeTargets(include, dataset) {
9470
+ if (include && include.length > 0) {
9471
+ const unique = [...new Set(include)];
9472
+ const invalid = unique.filter((item) => !isIncludeTarget(item));
9473
+ if (invalid.length > 0) {
9474
+ throw new ValidationError(
9475
+ `Invalid include target(s): ${invalid.join(", ")}. Use ${ALL_INCLUDE_TARGETS.join(", ")}.`
9476
+ );
9477
+ }
9478
+ return unique;
9479
+ }
9480
+ if (dataset) {
9481
+ const target = INCLUDE_TARGETS_BY_DATASET[dataset];
9482
+ if (!target) {
9483
+ return [];
9484
+ }
9485
+ if (target === "establishments") {
9486
+ return ["establishments", "secondary-cnaes", "analyze"];
9487
+ }
9488
+ return [target, "analyze"];
9489
+ }
9490
+ return [...ALL_INCLUDE_TARGETS];
9491
+ }
9077
9492
  async function generatePostgresDirectScript(inputPath, options = {}) {
9078
9493
  if (options.dataset && !isImportDatasetType(options.dataset)) {
9079
9494
  throw new ValidationError(`Unsupported dataset type: ${options.dataset}.`);
@@ -9089,6 +9504,10 @@ async function generatePostgresDirectScript(inputPath, options = {}) {
9089
9504
  options.outputPath ?? defaultPostgresDirectOutputPath(validatedPath)
9090
9505
  );
9091
9506
  const sourceEncoding = normalizeSourceEncoding(options.sourceEncoding);
9507
+ const transactionMode = normalizeTransactionMode(options.transactionMode);
9508
+ const include = normalizeIncludeTargets(options.include, options.dataset);
9509
+ const skipIndexes = options.skipIndexes ?? false;
9510
+ const skipAnalyze = options.skipAnalyze ?? false;
9092
9511
  const inspected = await inspectFiles(validatedPath);
9093
9512
  const recognizedFiles = inspected.entries.filter((entry) => entry.entryKind === "file").flatMap((entry) => {
9094
9513
  if (!isImportDatasetType(entry.inferredType)) {
@@ -9116,7 +9535,11 @@ async function generatePostgresDirectScript(inputPath, options = {}) {
9116
9535
  outputPath,
9117
9536
  totalFiles: recognizedFiles.length,
9118
9537
  datasets,
9119
- sourceEncoding
9538
+ sourceEncoding,
9539
+ transactionMode,
9540
+ include,
9541
+ skipIndexes,
9542
+ skipAnalyze
9120
9543
  });
9121
9544
  await mkdir9(outputPath, { recursive: true });
9122
9545
  const sourceFiles = [];
@@ -9152,11 +9575,21 @@ async function generatePostgresDirectScript(inputPath, options = {}) {
9152
9575
  }
9153
9576
  const scriptName = options.scriptName ?? "import-postgres-direct.sql";
9154
9577
  const scriptPath = path17.join(outputPath, scriptName);
9155
- const script = generatePostgresSanitizedDirectImportScript({
9578
+ const generated = generatePostgresDirectScriptFiles({
9156
9579
  files: sourceFiles,
9157
- sourceEncoding
9580
+ sourceEncoding,
9581
+ transactionMode,
9582
+ include,
9583
+ skipIndexes,
9584
+ skipAnalyze
9158
9585
  });
9159
- await writeFile6(scriptPath, script, "utf8");
9586
+ const scriptFiles = [];
9587
+ for (const [fileName, script] of Object.entries(generated.scripts)) {
9588
+ const outputFileName = fileName === "import-postgres-direct.sql" ? scriptName : fileName;
9589
+ const outputFilePath = path17.join(outputPath, outputFileName);
9590
+ await writeFile6(outputFilePath, script, "utf8");
9591
+ scriptFiles.push(outputFilePath);
9592
+ }
9160
9593
  const manifestPath = path17.join(outputPath, "manifest.json");
9161
9594
  const summaryDatasets = [...summariesByDataset.values()].sort(
9162
9595
  (left, right) => IMPORT_ORDER.indexOf(left.dataset) - IMPORT_ORDER.indexOf(right.dataset)
@@ -9168,13 +9601,19 @@ async function generatePostgresDirectScript(inputPath, options = {}) {
9168
9601
  const manifest = {
9169
9602
  generatedAt: (/* @__PURE__ */ new Date()).toISOString(),
9170
9603
  mode: "direct-sanitized-script",
9604
+ transactionMode,
9605
+ include,
9606
+ skipIndexes,
9607
+ skipAnalyze,
9171
9608
  inputPath: path17.resolve(inputPath),
9172
9609
  validatedPath,
9173
9610
  outputPath,
9174
9611
  scriptPath,
9612
+ scriptFiles,
9175
9613
  sourceEncoding,
9176
9614
  totalFiles: sourceFiles.length,
9177
9615
  totalBytes,
9616
+ steps: generated.steps,
9178
9617
  datasets: summaryDatasets
9179
9618
  };
9180
9619
  await writeFile6(
@@ -9197,15 +9636,19 @@ async function generatePostgresDirectScript(inputPath, options = {}) {
9197
9636
  scriptPath,
9198
9637
  manifestPath,
9199
9638
  sourceEncoding,
9639
+ transactionMode,
9200
9640
  totalFiles: sourceFiles.length,
9201
9641
  totalBytes,
9202
9642
  datasets: summaryDatasets,
9643
+ scriptFiles,
9644
+ steps: generated.steps,
9203
9645
  warnings: [
9204
9646
  ...validation.ok ? [] : validation.errors,
9205
9647
  "This script imports sanitized Receita files directly with psql \\copy. It avoids rewriting the full dataset into a second CSV tree.",
9206
- "The generated script expects the database schema generated by cnpj-db-loader to be applied before execution.",
9648
+ "The generated scripts expect the database schema generated by cnpj-db-loader to be applied before execution.",
9207
9649
  "The direct PostgreSQL script now defaults to UTF8 because the sanitize command writes clean UTF-8 files.",
9208
- "Use --source-encoding WIN1252 or LATIN1 only when generating scripts for legacy sanitized files produced by older loader versions."
9650
+ "Use --source-encoding WIN1252 or LATIN1 only when generating scripts for legacy sanitized files produced by older loader versions.",
9651
+ "The generated import is now modular. Use import-postgres-direct.sql as the orchestrator or run individual phase scripts manually."
9209
9652
  ],
9210
9653
  nextStep: inferNextStep5(scriptPath)
9211
9654
  };