@danielarndt0/cnpj-db-loader 2.4.0-beta.1 → 2.4.0-beta.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -7821,81 +7821,264 @@ function isRecognizedSanitizeEntry(entry) {
7821
7821
  return entry.entryKind === "file" && entry.inferredType !== "zip-archive" && entry.inferredType !== "unknown";
7822
7822
  }
7823
7823
 
7824
+ // src/services/sanitize/encoding.ts
7825
+ import { StringDecoder } from "string_decoder";
7826
+ var WINDOWS_1252_C1_MAP = {
7827
+ 128: "\u20AC",
7828
+ 130: "\u201A",
7829
+ 131: "\u0192",
7830
+ 132: "\u201E",
7831
+ 133: "\u2026",
7832
+ 134: "\u2020",
7833
+ 135: "\u2021",
7834
+ 136: "\u02C6",
7835
+ 137: "\u2030",
7836
+ 138: "\u0160",
7837
+ 139: "\u2039",
7838
+ 140: "\u0152",
7839
+ 142: "\u017D",
7840
+ 145: "\u2018",
7841
+ 146: "\u2019",
7842
+ 147: "\u201C",
7843
+ 148: "\u201D",
7844
+ 149: "\u2022",
7845
+ 150: "\u2013",
7846
+ 151: "\u2014",
7847
+ 152: "\u02DC",
7848
+ 153: "\u2122",
7849
+ 154: "\u0161",
7850
+ 155: "\u203A",
7851
+ 156: "\u0153",
7852
+ 158: "\u017E",
7853
+ 159: "\u0178"
7854
+ };
7855
+ function normalizeSanitizeSourceEncoding(value) {
7856
+ const normalized = (value ?? "WIN1252").trim().toUpperCase().replace(/_/g, "-");
7857
+ switch (normalized) {
7858
+ case "WIN1252":
7859
+ case "WINDOWS-1252":
7860
+ case "CP1252":
7861
+ return "WIN1252";
7862
+ case "LATIN1":
7863
+ case "LATIN-1":
7864
+ case "ISO-8859-1":
7865
+ case "ISO8859-1":
7866
+ return "LATIN1";
7867
+ case "UTF8":
7868
+ case "UTF-8":
7869
+ return "UTF8";
7870
+ default:
7871
+ throw new ValidationError(
7872
+ `Unsupported sanitize source encoding: ${value}. Supported values: WIN1252, LATIN1, UTF8.`
7873
+ );
7874
+ }
7875
+ }
7876
+ function isAllowedControlCodePoint(codePoint) {
7877
+ return codePoint === 9 || codePoint === 10 || codePoint === 13;
7878
+ }
7879
+ function isProblematicControlCodePoint(codePoint) {
7880
+ if (isAllowedControlCodePoint(codePoint)) {
7881
+ return false;
7882
+ }
7883
+ return codePoint >= 0 && codePoint <= 31 || codePoint === 127 || codePoint >= 128 && codePoint <= 159 || codePoint === 65279;
7884
+ }
7885
+ function sanitizeDecodedText(text) {
7886
+ const output2 = [];
7887
+ let invalidBytesRemoved = 0;
7888
+ let controlCharsRemoved = 0;
7889
+ for (const char of text) {
7890
+ const codePoint = char.codePointAt(0);
7891
+ if (codePoint === 65533) {
7892
+ invalidBytesRemoved += 1;
7893
+ continue;
7894
+ }
7895
+ if (isProblematicControlCodePoint(codePoint)) {
7896
+ controlCharsRemoved += 1;
7897
+ continue;
7898
+ }
7899
+ output2.push(char);
7900
+ }
7901
+ return {
7902
+ text: output2.join(""),
7903
+ invalidBytesRemoved,
7904
+ controlCharsRemoved
7905
+ };
7906
+ }
7907
+ var SanitizeEncodingNormalizer = class {
7908
+ constructor(sourceEncoding) {
7909
+ this.sourceEncoding = sourceEncoding;
7910
+ this.utf8Decoder = sourceEncoding === "UTF8" ? new StringDecoder("utf8") : void 0;
7911
+ }
7912
+ sourceEncoding;
7913
+ utf8Decoder;
7914
+ normalizeChunk(chunk) {
7915
+ if (this.sourceEncoding === "UTF8") {
7916
+ const decoded = this.utf8Decoder.write(chunk);
7917
+ const sanitized = sanitizeDecodedText(decoded);
7918
+ const nulBytesRemoved = [...decoded].filter(
7919
+ (char) => char === "\0"
7920
+ ).length;
7921
+ return {
7922
+ ...sanitized,
7923
+ nulBytesRemoved
7924
+ };
7925
+ }
7926
+ return this.normalizeSingleByteChunk(chunk);
7927
+ }
7928
+ flush() {
7929
+ if (!this.utf8Decoder) {
7930
+ return {
7931
+ text: "",
7932
+ nulBytesRemoved: 0,
7933
+ invalidBytesRemoved: 0,
7934
+ controlCharsRemoved: 0
7935
+ };
7936
+ }
7937
+ const decoded = this.utf8Decoder.end();
7938
+ const sanitized = sanitizeDecodedText(decoded);
7939
+ const nulBytesRemoved = [...decoded].filter((char) => char === "\0").length;
7940
+ return {
7941
+ ...sanitized,
7942
+ nulBytesRemoved
7943
+ };
7944
+ }
7945
+ normalizeSingleByteChunk(chunk) {
7946
+ const output2 = [];
7947
+ let nulBytesRemoved = 0;
7948
+ let invalidBytesRemoved = 0;
7949
+ let controlCharsRemoved = 0;
7950
+ for (const byte of chunk) {
7951
+ if (byte === 0) {
7952
+ nulBytesRemoved += 1;
7953
+ continue;
7954
+ }
7955
+ if (byte < 32 || byte === 127) {
7956
+ if (isAllowedControlCodePoint(byte)) {
7957
+ output2.push(String.fromCharCode(byte));
7958
+ } else {
7959
+ controlCharsRemoved += 1;
7960
+ }
7961
+ continue;
7962
+ }
7963
+ if (byte >= 128 && byte <= 159) {
7964
+ if (this.sourceEncoding === "WIN1252") {
7965
+ const mapped = WINDOWS_1252_C1_MAP[byte];
7966
+ if (mapped === void 0) {
7967
+ invalidBytesRemoved += 1;
7968
+ } else {
7969
+ output2.push(mapped);
7970
+ }
7971
+ } else {
7972
+ controlCharsRemoved += 1;
7973
+ }
7974
+ continue;
7975
+ }
7976
+ output2.push(String.fromCharCode(byte));
7977
+ }
7978
+ return {
7979
+ text: output2.join(""),
7980
+ nulBytesRemoved,
7981
+ invalidBytesRemoved,
7982
+ controlCharsRemoved
7983
+ };
7984
+ }
7985
+ };
7986
+
7824
7987
  // src/services/sanitize/runner.ts
7825
7988
  import { createReadStream as createReadStream2, createWriteStream as createWriteStream2 } from "fs";
7826
7989
  import { mkdir as mkdir7 } from "fs/promises";
7827
7990
  import path13 from "path";
7828
- function stripNulBytes(chunk) {
7829
- let removed = 0;
7830
- for (let index = 0; index < chunk.length; index += 1) {
7831
- if (chunk[index] === 0) {
7832
- removed += 1;
7833
- }
7991
+ async function writeUtf8(output2, value) {
7992
+ if (value.length === 0) {
7993
+ return;
7834
7994
  }
7835
- if (removed === 0) {
7836
- return { buffer: chunk, removed: 0 };
7995
+ if (!output2.write(value, "utf8")) {
7996
+ await new Promise((resolve2, reject) => {
7997
+ output2.once("drain", resolve2);
7998
+ output2.once("error", reject);
7999
+ });
7837
8000
  }
7838
- const sanitized = Buffer.allocUnsafe(chunk.length - removed);
7839
- let outputIndex = 0;
7840
- for (let index = 0; index < chunk.length; index += 1) {
7841
- const value = chunk[index];
7842
- if (value !== 0) {
7843
- sanitized[outputIndex] = value;
7844
- outputIndex += 1;
8001
+ }
8002
+ function countNewlines(value) {
8003
+ let count = 0;
8004
+ for (let index = 0; index < value.length; index += 1) {
8005
+ if (value[index] === "\n") {
8006
+ count += 1;
7845
8007
  }
7846
8008
  }
7847
- return { buffer: sanitized, removed };
8009
+ return count;
7848
8010
  }
7849
- async function sanitizeDatasetFile(plan, onChunk) {
8011
+ async function sanitizeDatasetFile(plan, onChunk, options = {}) {
7850
8012
  await mkdir7(path13.dirname(plan.outputPath), { recursive: true });
8013
+ const sourceEncoding = normalizeSanitizeSourceEncoding(
8014
+ options.sourceEncoding
8015
+ );
8016
+ const normalizer = new SanitizeEncodingNormalizer(sourceEncoding);
7851
8017
  const input2 = createReadStream2(plan.absolutePath);
7852
- const output2 = createWriteStream2(plan.outputPath);
8018
+ const output2 = createWriteStream2(plan.outputPath, { encoding: "utf8" });
7853
8019
  let totalBytesRead = 0;
7854
8020
  let totalBytesWritten = 0;
7855
8021
  let nulBytesRemoved = 0;
8022
+ let invalidBytesRemoved = 0;
8023
+ let controlCharsRemoved = 0;
7856
8024
  let lineCount = 0;
7857
- let sawAnyByte = false;
7858
- let lastByteWasNewline = false;
8025
+ let sawAnyCharacter = false;
8026
+ let lastCharacterWasNewline = false;
8027
+ const processText = async (text) => {
8028
+ if (text.length === 0) {
8029
+ return;
8030
+ }
8031
+ sawAnyCharacter = true;
8032
+ lineCount += countNewlines(text);
8033
+ lastCharacterWasNewline = text.endsWith("\n");
8034
+ totalBytesWritten += Buffer.byteLength(text, "utf8");
8035
+ await writeUtf8(output2, text);
8036
+ };
7859
8037
  try {
7860
8038
  for await (const chunk of input2) {
7861
8039
  const chunkBuffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
7862
8040
  totalBytesRead += chunkBuffer.length;
7863
- const { buffer, removed } = stripNulBytes(chunkBuffer);
7864
- nulBytesRemoved += removed;
7865
- sawAnyByte = sawAnyByte || buffer.length > 0;
7866
- for (let index = 0; index < buffer.length; index += 1) {
7867
- if (buffer[index] === 10) {
7868
- lineCount += 1;
7869
- }
7870
- }
7871
- if (buffer.length > 0) {
7872
- lastByteWasNewline = buffer[buffer.length - 1] === 10;
7873
- }
7874
- totalBytesWritten += buffer.length;
7875
- output2.write(buffer);
8041
+ const normalized = normalizer.normalizeChunk(chunkBuffer);
8042
+ nulBytesRemoved += normalized.nulBytesRemoved;
8043
+ invalidBytesRemoved += normalized.invalidBytesRemoved;
8044
+ controlCharsRemoved += normalized.controlCharsRemoved;
8045
+ await processText(normalized.text);
7876
8046
  onChunk?.({
7877
8047
  bytesProcessed: chunkBuffer.length,
7878
8048
  fileBytesProcessed: totalBytesRead,
7879
8049
  currentFileSize: plan.fileSize,
7880
8050
  processedRows: lineCount,
7881
- nulBytesRemoved
8051
+ nulBytesRemoved,
8052
+ invalidBytesRemoved,
8053
+ controlCharsRemoved
7882
8054
  });
7883
8055
  }
7884
- if (sawAnyByte && !lastByteWasNewline) {
8056
+ const flushed = normalizer.flush();
8057
+ nulBytesRemoved += flushed.nulBytesRemoved;
8058
+ invalidBytesRemoved += flushed.invalidBytesRemoved;
8059
+ controlCharsRemoved += flushed.controlCharsRemoved;
8060
+ await processText(flushed.text);
8061
+ if (sawAnyCharacter && !lastCharacterWasNewline) {
7885
8062
  lineCount += 1;
7886
8063
  }
7887
8064
  } finally {
7888
8065
  input2.close();
7889
8066
  output2.end();
7890
- await new Promise((resolve2) => output2.on("finish", () => resolve2()));
8067
+ await new Promise((resolve2, reject) => {
8068
+ output2.on("finish", () => resolve2());
8069
+ output2.on("error", (error) => reject(error));
8070
+ });
7891
8071
  }
7892
8072
  return {
7893
8073
  plan,
7894
8074
  totalBytesRead,
7895
8075
  totalBytesWritten,
8076
+ sourceEncoding,
7896
8077
  nulBytesRemoved,
8078
+ invalidBytesRemoved,
8079
+ controlCharsRemoved,
7897
8080
  lineCount,
7898
- changed: nulBytesRemoved > 0 || totalBytesRead !== totalBytesWritten
8081
+ changed: nulBytesRemoved > 0 || invalidBytesRemoved > 0 || controlCharsRemoved > 0 || totalBytesRead !== totalBytesWritten
7899
8082
  };
7900
8083
  }
7901
8084
 
@@ -7958,40 +8141,54 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
7958
8141
  "No recognized validated dataset files were found for sanitization."
7959
8142
  );
7960
8143
  }
8144
+ const sourceEncoding = normalizeSanitizeSourceEncoding(
8145
+ options.sourceEncoding
8146
+ );
7961
8147
  options.onProgress?.({
7962
8148
  kind: "start",
7963
8149
  validatedPath,
7964
8150
  outputPath,
7965
8151
  totalFiles: plan.totalFiles,
7966
8152
  totalBytes: plan.totalBytes,
7967
- datasets: plan.datasets
8153
+ datasets: plan.datasets,
8154
+ sourceEncoding
7968
8155
  });
7969
8156
  let processedFiles = 0;
7970
8157
  let processedRows = 0;
7971
8158
  let processedBytes = 0;
7972
8159
  let nulBytesRemoved = 0;
8160
+ let invalidBytesRemoved = 0;
8161
+ let controlCharsRemoved = 0;
7973
8162
  let changedFiles = 0;
7974
8163
  const fileSummaries = [];
7975
8164
  for (const [index, filePlan] of plan.files.entries()) {
7976
- const fileResult = await sanitizeDatasetFile(filePlan, (chunk) => {
7977
- options.onProgress?.({
7978
- kind: "progress",
7979
- currentFileDisplayPath: filePlan.displayPath,
7980
- fileIndex: index + 1,
7981
- totalFiles: plan.totalFiles,
7982
- bytesProcessed: processedBytes + chunk.fileBytesProcessed,
7983
- totalBytes: plan.totalBytes,
7984
- fileBytesProcessed: chunk.fileBytesProcessed,
7985
- currentFileSize: chunk.currentFileSize,
7986
- processedRows: processedRows + chunk.processedRows,
7987
- nulBytesRemoved: nulBytesRemoved + chunk.nulBytesRemoved,
7988
- changedFiles
7989
- });
7990
- });
8165
+ const fileResult = await sanitizeDatasetFile(
8166
+ filePlan,
8167
+ (chunk) => {
8168
+ options.onProgress?.({
8169
+ kind: "progress",
8170
+ currentFileDisplayPath: filePlan.displayPath,
8171
+ fileIndex: index + 1,
8172
+ totalFiles: plan.totalFiles,
8173
+ bytesProcessed: processedBytes + chunk.fileBytesProcessed,
8174
+ totalBytes: plan.totalBytes,
8175
+ fileBytesProcessed: chunk.fileBytesProcessed,
8176
+ currentFileSize: chunk.currentFileSize,
8177
+ processedRows: processedRows + chunk.processedRows,
8178
+ nulBytesRemoved: nulBytesRemoved + chunk.nulBytesRemoved,
8179
+ invalidBytesRemoved: invalidBytesRemoved + chunk.invalidBytesRemoved,
8180
+ controlCharsRemoved: controlCharsRemoved + chunk.controlCharsRemoved,
8181
+ changedFiles
8182
+ });
8183
+ },
8184
+ { sourceEncoding }
8185
+ );
7991
8186
  processedFiles += 1;
7992
8187
  processedRows += fileResult.lineCount;
7993
8188
  processedBytes += fileResult.totalBytesRead;
7994
8189
  nulBytesRemoved += fileResult.nulBytesRemoved;
8190
+ invalidBytesRemoved += fileResult.invalidBytesRemoved;
8191
+ controlCharsRemoved += fileResult.controlCharsRemoved;
7995
8192
  changedFiles += fileResult.changed ? 1 : 0;
7996
8193
  fileSummaries.push({
7997
8194
  dataset: filePlan.dataset,
@@ -7999,7 +8196,9 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
7999
8196
  outputPath: filePlan.outputPath,
8000
8197
  lineCount: fileResult.lineCount,
8001
8198
  changed: fileResult.changed,
8002
- nulBytesRemoved: fileResult.nulBytesRemoved
8199
+ nulBytesRemoved: fileResult.nulBytesRemoved,
8200
+ invalidBytesRemoved: fileResult.invalidBytesRemoved,
8201
+ controlCharsRemoved: fileResult.controlCharsRemoved
8003
8202
  });
8004
8203
  }
8005
8204
  options.onProgress?.({
@@ -8007,6 +8206,8 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
8007
8206
  totalFiles: plan.totalFiles,
8008
8207
  processedRows,
8009
8208
  nulBytesRemoved,
8209
+ invalidBytesRemoved,
8210
+ controlCharsRemoved,
8010
8211
  changedFiles,
8011
8212
  totalBytes: plan.totalBytes
8012
8213
  });
@@ -8018,13 +8219,17 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
8018
8219
  totalBytes: plan.totalBytes,
8019
8220
  processedFiles,
8020
8221
  processedRows,
8222
+ sourceEncoding,
8021
8223
  nulBytesRemoved,
8224
+ invalidBytesRemoved,
8225
+ controlCharsRemoved,
8022
8226
  changedFiles,
8023
8227
  unchangedFiles: plan.totalFiles - changedFiles,
8024
8228
  datasets: plan.datasets,
8025
8229
  files: fileSummaries,
8026
8230
  warnings: [
8027
- "Sanitization prepares a clean dataset tree for import by removing known low-level byte issues such as NUL bytes before PostgreSQL loading begins.",
8231
+ "Sanitization now writes UTF-8 output and removes invalid bytes plus problematic control characters before PostgreSQL loading begins.",
8232
+ "The PostgreSQL direct import path can use --source-encoding UTF8 when reading files generated by this sanitization command.",
8028
8233
  "The import command still keeps quarantine and row-level recovery for unexpected issues, but sanitizing first reduces the amount of slow fallback work during import."
8029
8234
  ],
8030
8235
  nextStep: inferNextStep3(outputPath)
@@ -8174,6 +8379,18 @@ var STAGING_TABLE_BY_DATASET3 = {
8174
8379
  partners: "staging_partners",
8175
8380
  simples_options: "staging_simples_options"
8176
8381
  };
8382
+ var STEP_ORDER = [
8383
+ "setup",
8384
+ "load-domains",
8385
+ "load-companies",
8386
+ "load-establishments",
8387
+ "load-partners",
8388
+ "load-simples",
8389
+ "materialize",
8390
+ "materialize-secondary-cnaes",
8391
+ "indexes",
8392
+ "analyze"
8393
+ ];
8177
8394
  function quoteSqlLiteral(value) {
8178
8395
  return `'${value.replace(/'/g, "''")}'`;
8179
8396
  }
@@ -8191,6 +8408,9 @@ function receitaCopyCommand(tableName, columns, filePath) {
8191
8408
  const normalizedFilePath = normalizePathForPsql(filePath);
8192
8409
  return `\\copy ${tableName} (${columns.join(", ")}) from ${quoteSqlLiteral(normalizedFilePath)} with (format csv, header false, delimiter ';', quote '"', escape '"')`;
8193
8410
  }
8411
+ function echo(message) {
8412
+ return `\\echo ${quoteSqlLiteral(message)}`;
8413
+ }
8194
8414
  function datasetColumns(dataset) {
8195
8415
  return DATASET_LAYOUTS[dataset].fields.map((field) => field.columnName);
8196
8416
  }
@@ -8217,7 +8437,7 @@ function partnerDedupeExpression(alias) {
8217
8437
  function materializeCompaniesSql() {
8218
8438
  const columns = companiesLayout.fields.map((field) => field.columnName);
8219
8439
  return [
8220
- "\\echo 'Materializing companies...'",
8440
+ echo("[materialize] Materializing companies..."),
8221
8441
  "with source as (",
8222
8442
  " select",
8223
8443
  ` ${columns.map((column) => `source.${column}`).join(",\n ")},`,
@@ -8231,7 +8451,8 @@ function materializeCompaniesSql() {
8231
8451
  `select ${columns.join(", ")}`,
8232
8452
  "from deduped",
8233
8453
  "on conflict (cnpj_root) do update set",
8234
- ` ${updateAssignments(columns, ["cnpj_root"])};`
8454
+ ` ${updateAssignments(columns, ["cnpj_root"])};`,
8455
+ echo("[materialize] Companies materialization completed.")
8235
8456
  ].join("\n");
8236
8457
  }
8237
8458
  function materializeEstablishmentsSql() {
@@ -8240,7 +8461,7 @@ function materializeEstablishmentsSql() {
8240
8461
  );
8241
8462
  const insertColumns = [...baseColumns, "cnpj_full"];
8242
8463
  return [
8243
- "\\echo 'Materializing establishments and secondary CNAEs...'",
8464
+ echo("[materialize] Materializing establishments..."),
8244
8465
  "with source as (",
8245
8466
  " select",
8246
8467
  ` ${baseColumns.map((column) => `source.${column}`).join(",\n ")},`,
@@ -8250,14 +8471,29 @@ function materializeEstablishmentsSql() {
8250
8471
  "),",
8251
8472
  "deduped as (",
8252
8473
  " select * from source where dedupe_rank = 1",
8474
+ ")",
8475
+ `insert into establishments (${insertColumns.join(", ")})`,
8476
+ `select ${insertColumns.join(", ")}`,
8477
+ "from deduped",
8478
+ "on conflict (cnpj_full) do update set",
8479
+ ` ${updateAssignments(insertColumns, ["cnpj_root", "cnpj_order", "cnpj_check_digits", "cnpj_full"])};`,
8480
+ echo("[materialize] Establishments materialization completed.")
8481
+ ].join("\n");
8482
+ }
8483
+ function materializeSecondaryCnaesSql() {
8484
+ return [
8485
+ echo(
8486
+ "[materialize-secondary-cnaes] Materializing establishment secondary CNAEs..."
8487
+ ),
8488
+ "with source as (",
8489
+ " select",
8490
+ " staging.cnpj_root || staging.cnpj_order || staging.cnpj_check_digits as cnpj_full,",
8491
+ " staging.secondary_cnaes_raw,",
8492
+ " row_number() over (partition by staging.cnpj_root || staging.cnpj_order || staging.cnpj_check_digits order by staging.staging_id desc) as dedupe_rank",
8493
+ " from staging_establishments staging",
8253
8494
  "),",
8254
- "upserted as (",
8255
- ` insert into establishments (${insertColumns.join(", ")})`,
8256
- ` select ${insertColumns.join(", ")}`,
8257
- " from deduped",
8258
- " on conflict (cnpj_full) do update set",
8259
- ` ${updateAssignments(insertColumns, ["cnpj_root", "cnpj_order", "cnpj_check_digits", "cnpj_full"])}`,
8260
- " returning cnpj_full",
8495
+ "deduped as (",
8496
+ " select * from source where dedupe_rank = 1",
8261
8497
  "),",
8262
8498
  "deleted_secondary_cnaes as (",
8263
8499
  " delete from establishment_secondary_cnaes target",
@@ -8278,14 +8514,17 @@ function materializeEstablishmentsSql() {
8278
8514
  "insert into establishment_secondary_cnaes (cnpj_full, cnae_code)",
8279
8515
  "select cnpj_full, cnae_code",
8280
8516
  "from secondary_cnaes_source",
8281
- "on conflict (cnpj_full, cnae_code) do nothing;"
8517
+ "on conflict (cnpj_full, cnae_code) do nothing;",
8518
+ echo(
8519
+ "[materialize-secondary-cnaes] Secondary CNAEs materialization completed."
8520
+ )
8282
8521
  ].join("\n");
8283
8522
  }
8284
8523
  function materializePartnersSql() {
8285
8524
  const baseColumns = partnersLayout.fields.map((field) => field.columnName);
8286
8525
  const insertColumns = [...baseColumns, "partner_dedupe_key"];
8287
8526
  return [
8288
- "\\echo 'Materializing partners...'",
8527
+ echo("[materialize] Materializing partners..."),
8289
8528
  "with source as (",
8290
8529
  " select",
8291
8530
  ` ${baseColumns.map((column) => `source.${column}`).join(",\n ")},`,
@@ -8305,13 +8544,14 @@ function materializePartnersSql() {
8305
8544
  `select ${insertColumns.join(", ")}`,
8306
8545
  "from deduped",
8307
8546
  "on conflict (partner_dedupe_key) do update set",
8308
- ` ${updateAssignments(insertColumns, ["partner_dedupe_key"])};`
8547
+ ` ${updateAssignments(insertColumns, ["partner_dedupe_key"])};`,
8548
+ echo("[materialize] Partners materialization completed.")
8309
8549
  ].join("\n");
8310
8550
  }
8311
8551
  function materializeSimplesSql() {
8312
8552
  const columns = simplesLayout.fields.map((field) => field.columnName);
8313
8553
  return [
8314
- "\\echo 'Materializing simples options...'",
8554
+ echo("[materialize] Materializing simples options..."),
8315
8555
  "with source as (",
8316
8556
  " select",
8317
8557
  ` ${columns.map((column) => `source.${column}`).join(",\n ")},`,
@@ -8325,7 +8565,8 @@ function materializeSimplesSql() {
8325
8565
  `select ${columns.join(", ")}`,
8326
8566
  "from deduped",
8327
8567
  "on conflict (cnpj_root) do update set",
8328
- ` ${updateAssignments(columns, ["cnpj_root"])};`
8568
+ ` ${updateAssignments(columns, ["cnpj_root"])};`,
8569
+ echo("[materialize] Simples options materialization completed.")
8329
8570
  ].join("\n");
8330
8571
  }
8331
8572
  function copyDomainSql(dataset, files) {
@@ -8335,12 +8576,20 @@ function copyDomainSql(dataset, files) {
8335
8576
  const columns = datasetColumns(dataset);
8336
8577
  const tempTable = `tmp_hybrid_${dataset}`;
8337
8578
  const lines = [
8338
- `\\echo 'Loading ${dataset} lookup data...'`,
8579
+ echo(`[load-domains] Loading ${dataset} lookup data...`),
8339
8580
  `drop table if exists ${tempTable};`,
8340
8581
  `create temporary table ${tempTable} (code text, description text);`
8341
8582
  ];
8342
- for (const file of files) {
8343
- lines.push(csvCopyCommand(tempTable, columns, file.absolutePath));
8583
+ for (const [index, file] of files.entries()) {
8584
+ lines.push(
8585
+ echo(
8586
+ `[load-domains] Loading ${dataset} file ${index + 1} of ${files.length}: ${file.relativePath}`
8587
+ ),
8588
+ csvCopyCommand(tempTable, columns, file.absolutePath),
8589
+ echo(
8590
+ `[load-domains] Loaded ${dataset} file ${index + 1} of ${files.length}.`
8591
+ )
8592
+ );
8344
8593
  }
8345
8594
  lines.push(
8346
8595
  `insert into ${dataset} (${columns.join(", ")})`,
@@ -8361,12 +8610,17 @@ function copyStagingSql(dataset, files) {
8361
8610
  return [];
8362
8611
  }
8363
8612
  const columns = datasetColumns(dataset);
8364
- return [
8365
- `\\echo 'Loading ${dataset} staging data...'`,
8366
- ...files.map(
8367
- (file) => csvCopyCommand(tableName, columns, file.absolutePath)
8368
- )
8369
- ];
8613
+ const lines = [echo(`[load-${dataset}] Loading ${dataset} staging data...`)];
8614
+ for (const [index, file] of files.entries()) {
8615
+ lines.push(
8616
+ echo(
8617
+ `[load-${dataset}] Loading file ${index + 1} of ${files.length}: ${file.relativePath}`
8618
+ ),
8619
+ csvCopyCommand(tableName, columns, file.absolutePath),
8620
+ echo(`[load-${dataset}] Loaded file ${index + 1} of ${files.length}.`)
8621
+ );
8622
+ }
8623
+ return lines;
8370
8624
  }
8371
8625
  function csvFilesByDataset(files) {
8372
8626
  const grouped = {};
@@ -8392,7 +8646,9 @@ function rawTableName(dataset) {
8392
8646
  function createRawTempTableSql(dataset) {
8393
8647
  const columns = DATASET_LAYOUTS[dataset].fields.map((field) => ` ${quoteIdentifier(field.columnName)} text`).join(",\n");
8394
8648
  return [
8649
+ "set client_min_messages to warning;",
8395
8650
  `drop table if exists ${rawTableName(dataset)};`,
8651
+ "reset client_min_messages;",
8396
8652
  `create temporary table ${rawTableName(dataset)} (`,
8397
8653
  columns,
8398
8654
  ");"
@@ -8474,11 +8730,21 @@ function rawDomainSql(dataset, files) {
8474
8730
  const columns = layout.fields.map((field) => field.columnName);
8475
8731
  const tableName = rawTableName(dataset);
8476
8732
  const lines = [
8477
- `\\echo 'Loading ${dataset} lookup data directly from sanitized Receita files...'`,
8733
+ echo(
8734
+ `[load-domains] Loading ${dataset} lookup data directly from sanitized Receita files...`
8735
+ ),
8478
8736
  createRawTempTableSql(dataset)
8479
8737
  ];
8480
- for (const file of files) {
8481
- lines.push(receitaCopyCommand(tableName, columns, file.absolutePath));
8738
+ for (const [index, file] of files.entries()) {
8739
+ lines.push(
8740
+ echo(
8741
+ `[load-domains] Loading ${dataset} file ${index + 1} of ${files.length}: ${file.relativePath}`
8742
+ ),
8743
+ receitaCopyCommand(tableName, columns, file.absolutePath),
8744
+ echo(
8745
+ `[load-domains] Loaded ${dataset} file ${index + 1} of ${files.length}.`
8746
+ )
8747
+ );
8482
8748
  }
8483
8749
  lines.push(
8484
8750
  `insert into ${dataset} (${columns.join(", ")})`,
@@ -8488,7 +8754,8 @@ function rawDomainSql(dataset, files) {
8488
8754
  `from ${tableName}`,
8489
8755
  "where nullif(btrim(code), '') is not null",
8490
8756
  "order by code",
8491
- "on conflict (code) do update set description = excluded.description;"
8757
+ "on conflict (code) do update set description = excluded.description;",
8758
+ echo(`[load-domains] ${dataset} lookup data completed.`)
8492
8759
  );
8493
8760
  return lines;
8494
8761
  }
@@ -8507,70 +8774,363 @@ function rawStagingSql(dataset, files) {
8507
8774
  const expressions = layout.fields.map(
8508
8775
  (field) => ` ${fieldExpression(dataset, field, alias)} as ${field.columnName}`
8509
8776
  );
8777
+ const stepName = loadStepName(dataset);
8510
8778
  const lines = [
8511
- `\\echo 'Loading ${dataset} staging data directly from sanitized Receita files...'`,
8779
+ echo(
8780
+ `[${stepName}] Loading ${dataset} staging data directly from sanitized Receita files...`
8781
+ ),
8782
+ `truncate table ${targetTable} restart identity;`,
8512
8783
  createRawTempTableSql(dataset)
8513
8784
  ];
8514
- for (const file of files) {
8515
- lines.push(receitaCopyCommand(tableName, columns, file.absolutePath));
8785
+ for (const [index, file] of files.entries()) {
8786
+ lines.push(
8787
+ echo(
8788
+ `[${stepName}] Loading file ${index + 1} of ${files.length}: ${file.relativePath}`
8789
+ ),
8790
+ receitaCopyCommand(tableName, columns, file.absolutePath),
8791
+ echo(`[${stepName}] Loaded file ${index + 1} of ${files.length}.`)
8792
+ );
8516
8793
  }
8517
8794
  lines.push(
8795
+ echo(
8796
+ `[${stepName}] Transforming ${dataset} raw rows into ${targetTable}...`
8797
+ ),
8518
8798
  `insert into ${targetTable} (${columns.join(", ")})`,
8519
8799
  "select",
8520
8800
  expressions.join(",\n"),
8521
- `from ${tableName} ${alias};`
8801
+ `from ${tableName} ${alias};`,
8802
+ echo(`[${stepName}] ${dataset} staging load completed.`)
8522
8803
  );
8523
8804
  return lines;
8524
8805
  }
8525
- function generatePostgresDirectImportScript(input2) {
8526
- const grouped = csvFilesByDataset(input2.files);
8527
- const lines = [
8528
- "-- CNPJ DB Loader hybrid PostgreSQL import script",
8529
- "-- Generated from PostgreSQL-ready CSV files exported by cnpj-db-loader postgres export-csv.",
8530
- "-- Execute with psql, for example:",
8531
- '-- psql "postgres://postgres:postgres@localhost:5432/cnpj" -f import-postgres-direct.sql',
8532
- "",
8806
+ function loadStepName(dataset) {
8807
+ switch (dataset) {
8808
+ case "companies":
8809
+ return "load-companies";
8810
+ case "establishments":
8811
+ return "load-establishments";
8812
+ case "partners":
8813
+ return "load-partners";
8814
+ case "simples_options":
8815
+ return "load-simples";
8816
+ default:
8817
+ return `load-${dataset}`;
8818
+ }
8819
+ }
8820
+ function scriptHeader(title, sourceEncoding) {
8821
+ return [
8822
+ `-- ${title}`,
8823
+ "-- Generated by cnpj-db-loader postgres generate-script.",
8533
8824
  "\\set ON_ERROR_STOP on",
8534
- "\\echo 'Starting CNPJ DB Loader hybrid PostgreSQL import...'",
8535
- "",
8536
- "begin;",
8537
- "",
8538
- "-- Keep the final schema and seed data managed by sql/schema.sql.",
8539
- "-- This script only resets staging tables and then upserts final data.",
8540
- "truncate table staging_companies restart identity;",
8541
- "truncate table staging_establishments restart identity;",
8542
- "truncate table staging_partners restart identity;",
8543
- "truncate table staging_simples_options restart identity;",
8825
+ ...sourceEncoding ? [
8826
+ echo(
8827
+ `Using source file encoding ${sourceEncoding} for psql copy operations...`
8828
+ ),
8829
+ `set client_encoding to ${quoteSqlLiteral(sourceEncoding)};`
8830
+ ] : [],
8544
8831
  ""
8545
8832
  ];
8546
- for (const dataset of DOMAIN_DATASETS) {
8547
- lines.push(...copyDomainSql(dataset, grouped[dataset] ?? []), "");
8833
+ }
8834
+ function wrapTransaction(lines, mode, shouldWrap) {
8835
+ if (!shouldWrap || mode !== "phase") {
8836
+ return [...lines];
8548
8837
  }
8549
- for (const dataset of STAGING_DATASETS) {
8550
- lines.push(...copyStagingSql(dataset, grouped[dataset] ?? []), "");
8838
+ return ["begin;", "", ...lines, "", "commit;"];
8839
+ }
8840
+ function buildStepScript(title, body, input2, wrapInPhaseTransaction) {
8841
+ return [
8842
+ ...scriptHeader(title, input2.sourceEncoding),
8843
+ ...wrapTransaction(body, input2.transactionMode, wrapInPhaseTransaction),
8844
+ ""
8845
+ ].join("\n");
8846
+ }
8847
+ function includeSet(input2) {
8848
+ const selected = new Set(input2.include);
8849
+ if (input2.skipIndexes) {
8850
+ selected.delete("indexes");
8551
8851
  }
8552
- lines.push(...materializationAndAnalyzeSql());
8553
- return lines.join("\n");
8852
+ if (input2.skipAnalyze) {
8853
+ selected.delete("analyze");
8854
+ }
8855
+ return selected;
8856
+ }
8857
+ function hasAnyFinalMaterialization(selected) {
8858
+ return selected.has("companies") || selected.has("establishments") || selected.has("partners") || selected.has("simples");
8554
8859
  }
8555
- function generatePostgresSanitizedDirectImportScript(input2) {
8860
+ function materializeSql(selected) {
8861
+ const lines = [echo("[materialize] Starting final table materialization...")];
8862
+ if (selected.has("companies")) {
8863
+ lines.push(materializeCompaniesSql(), "");
8864
+ }
8865
+ if (selected.has("establishments")) {
8866
+ lines.push(materializeEstablishmentsSql(), "");
8867
+ }
8868
+ if (selected.has("partners")) {
8869
+ lines.push(materializePartnersSql(), "");
8870
+ }
8871
+ if (selected.has("simples")) {
8872
+ lines.push(materializeSimplesSql(), "");
8873
+ }
8874
+ lines.push(echo("[materialize] Final table materialization completed."));
8875
+ return lines;
8876
+ }
8877
+ function indexesSql() {
8878
+ return [
8879
+ echo(
8880
+ "[indexes] No additional index operations are generated in this beta."
8881
+ ),
8882
+ "-- Indexes are expected to be managed by the schema generated by cnpj-db-loader schema generate.",
8883
+ "-- A future fast-rebuild mode may generate DROP/CREATE INDEX operations here."
8884
+ ];
8885
+ }
8886
+ function analyzeSql(selected) {
8887
+ const tables = /* @__PURE__ */ new Set();
8888
+ if (selected.has("companies")) {
8889
+ tables.add("companies");
8890
+ }
8891
+ if (selected.has("establishments")) {
8892
+ tables.add("establishments");
8893
+ }
8894
+ if (selected.has("secondary-cnaes")) {
8895
+ tables.add("establishment_secondary_cnaes");
8896
+ }
8897
+ if (selected.has("partners")) {
8898
+ tables.add("partners");
8899
+ }
8900
+ if (selected.has("simples")) {
8901
+ tables.add("simples_options");
8902
+ }
8903
+ if (selected.has("domains")) {
8904
+ for (const dataset of DOMAIN_DATASETS) {
8905
+ tables.add(dataset);
8906
+ }
8907
+ }
8908
+ return [
8909
+ echo("[analyze] Refreshing planner statistics..."),
8910
+ ...[...tables].map((table) => `analyze ${table};`),
8911
+ echo("[analyze] Planner statistics refreshed.")
8912
+ ];
8913
+ }
8914
+ function step(name, file, dependsOn, included) {
8915
+ return { name, file, dependsOn, included };
8916
+ }
8917
+ function generatePostgresDirectScriptFiles(input2) {
8556
8918
  const grouped = directFilesByDataset(input2.files);
8557
- const lines = [
8558
- "-- CNPJ DB Loader direct PostgreSQL import script",
8919
+ const selected = includeSet(input2);
8920
+ if (!DOMAIN_DATASETS.some((dataset) => (grouped[dataset] ?? []).length > 0)) {
8921
+ selected.delete("domains");
8922
+ }
8923
+ if ((grouped.companies ?? []).length === 0) {
8924
+ selected.delete("companies");
8925
+ }
8926
+ if ((grouped.establishments ?? []).length === 0) {
8927
+ selected.delete("establishments");
8928
+ selected.delete("secondary-cnaes");
8929
+ }
8930
+ if ((grouped.partners ?? []).length === 0) {
8931
+ selected.delete("partners");
8932
+ }
8933
+ if ((grouped.simples_options ?? []).length === 0) {
8934
+ selected.delete("simples");
8935
+ }
8936
+ const scripts = {};
8937
+ const steps = [];
8938
+ const setupIncluded = true;
8939
+ steps.push(step("setup", "setup.sql", [], setupIncluded));
8940
+ scripts["setup.sql"] = [
8941
+ ...scriptHeader(
8942
+ "CNPJ DB Loader PostgreSQL direct import setup",
8943
+ input2.sourceEncoding
8944
+ ),
8945
+ echo("[setup] Preparing PostgreSQL direct import session..."),
8946
+ "-- The database schema must be applied before running these scripts.",
8947
+ "-- This setup script configures the psql session used by the generated orchestrator.",
8948
+ echo("[setup] Setup completed."),
8949
+ ""
8950
+ ].join("\n");
8951
+ const domainsIncluded = selected.has("domains") && DOMAIN_DATASETS.some((dataset) => (grouped[dataset] ?? []).length > 0);
8952
+ steps.push(
8953
+ step("load-domains", "load-domains.sql", ["setup"], domainsIncluded)
8954
+ );
8955
+ if (domainsIncluded) {
8956
+ const lines = [echo("[load-domains] Starting domain tables load...")];
8957
+ for (const dataset of DOMAIN_DATASETS) {
8958
+ lines.push(...rawDomainSql(dataset, grouped[dataset] ?? []), "");
8959
+ }
8960
+ lines.push(echo("[load-domains] Domain tables load completed."));
8961
+ scripts["load-domains.sql"] = buildStepScript(
8962
+ "CNPJ DB Loader PostgreSQL direct import domains step",
8963
+ lines,
8964
+ input2,
8965
+ true
8966
+ );
8967
+ }
8968
+ const datasetSteps = [
8969
+ {
8970
+ dataset: "companies",
8971
+ name: "load-companies",
8972
+ file: "load-companies.sql",
8973
+ include: "companies"
8974
+ },
8975
+ {
8976
+ dataset: "establishments",
8977
+ name: "load-establishments",
8978
+ file: "load-establishments.sql",
8979
+ include: "establishments"
8980
+ },
8981
+ {
8982
+ dataset: "partners",
8983
+ name: "load-partners",
8984
+ file: "load-partners.sql",
8985
+ include: "partners"
8986
+ },
8987
+ {
8988
+ dataset: "simples_options",
8989
+ name: "load-simples",
8990
+ file: "load-simples.sql",
8991
+ include: "simples"
8992
+ }
8993
+ ];
8994
+ for (const item of datasetSteps) {
8995
+ const files = grouped[item.dataset] ?? [];
8996
+ const included = selected.has(item.include) && files.length > 0;
8997
+ steps.push(step(item.name, item.file, ["setup"], included));
8998
+ if (included) {
8999
+ scripts[item.file] = buildStepScript(
9000
+ `CNPJ DB Loader PostgreSQL direct import ${item.name} step`,
9001
+ rawStagingSql(item.dataset, files),
9002
+ input2,
9003
+ true
9004
+ );
9005
+ }
9006
+ }
9007
+ const materializeIncluded = hasAnyFinalMaterialization(selected);
9008
+ steps.push(
9009
+ step(
9010
+ "materialize",
9011
+ "materialize.sql",
9012
+ datasetSteps.filter((item) => selected.has(item.include)).map((item) => item.name),
9013
+ materializeIncluded
9014
+ )
9015
+ );
9016
+ if (materializeIncluded) {
9017
+ scripts["materialize.sql"] = buildStepScript(
9018
+ "CNPJ DB Loader PostgreSQL direct import materialization step",
9019
+ materializeSql(selected),
9020
+ input2,
9021
+ true
9022
+ );
9023
+ }
9024
+ const secondaryIncluded = selected.has("secondary-cnaes") && selected.has("establishments");
9025
+ steps.push(
9026
+ step(
9027
+ "materialize-secondary-cnaes",
9028
+ "materialize-secondary-cnaes.sql",
9029
+ ["load-establishments"],
9030
+ secondaryIncluded
9031
+ )
9032
+ );
9033
+ if (secondaryIncluded) {
9034
+ scripts["materialize-secondary-cnaes.sql"] = buildStepScript(
9035
+ "CNPJ DB Loader PostgreSQL direct import secondary CNAEs step",
9036
+ [materializeSecondaryCnaesSql()],
9037
+ input2,
9038
+ true
9039
+ );
9040
+ }
9041
+ const indexesIncluded = selected.has("indexes");
9042
+ steps.push(
9043
+ step(
9044
+ "indexes",
9045
+ "indexes.sql",
9046
+ materializeIncluded ? ["materialize"] : ["setup"],
9047
+ indexesIncluded
9048
+ )
9049
+ );
9050
+ if (indexesIncluded) {
9051
+ scripts["indexes.sql"] = buildStepScript(
9052
+ "CNPJ DB Loader PostgreSQL direct import indexes step",
9053
+ indexesSql(),
9054
+ input2,
9055
+ true
9056
+ );
9057
+ }
9058
+ const analyzeIncluded = selected.has("analyze");
9059
+ const analyzeDependencies = [
9060
+ ...domainsIncluded ? ["load-domains"] : [],
9061
+ ...materializeIncluded ? ["materialize"] : [],
9062
+ ...secondaryIncluded ? ["materialize-secondary-cnaes"] : []
9063
+ ];
9064
+ steps.push(
9065
+ step(
9066
+ "analyze",
9067
+ "analyze.sql",
9068
+ analyzeDependencies.length > 0 ? analyzeDependencies : ["setup"],
9069
+ analyzeIncluded
9070
+ )
9071
+ );
9072
+ if (analyzeIncluded) {
9073
+ scripts["analyze.sql"] = buildStepScript(
9074
+ "CNPJ DB Loader PostgreSQL direct import analyze step",
9075
+ analyzeSql(selected),
9076
+ input2,
9077
+ true
9078
+ );
9079
+ }
9080
+ const orchestratorLines = [
9081
+ "-- CNPJ DB Loader direct PostgreSQL import orchestrator",
8559
9082
  "-- Generated from sanitized Receita files by cnpj-db-loader postgres generate-script.",
8560
- "-- This path avoids rewriting the dataset into a second CSV tree.",
8561
9083
  "-- Execute with psql, for example:",
8562
- '-- psql "postgres://postgres:postgres@localhost:5432/cnpj" -f import-postgres-direct.sql',
9084
+ '-- psql -d "postgres://postgres:postgres@localhost:5432/cnpj" -f import-postgres-direct.sql',
8563
9085
  "",
8564
9086
  "\\set ON_ERROR_STOP on",
8565
- `\\echo 'Using source file encoding ${input2.sourceEncoding} for psql copy operations...'`,
9087
+ echo(
9088
+ `Using source file encoding ${input2.sourceEncoding} for psql copy operations...`
9089
+ ),
8566
9090
  `set client_encoding to ${quoteSqlLiteral(input2.sourceEncoding)};`,
8567
- "\\echo 'Starting CNPJ DB Loader direct PostgreSQL import from sanitized files...'",
9091
+ echo(
9092
+ `Starting CNPJ DB Loader direct PostgreSQL import using transaction mode ${input2.transactionMode}...`
9093
+ ),
9094
+ "",
9095
+ ...input2.transactionMode === "single" ? ["begin;", ""] : []
9096
+ ];
9097
+ for (const name of STEP_ORDER) {
9098
+ const currentStep = steps.find((item) => item.name === name);
9099
+ if (!currentStep?.included) {
9100
+ continue;
9101
+ }
9102
+ orchestratorLines.push(
9103
+ echo(
9104
+ `[orchestrator] Running ${currentStep.name} (${currentStep.file})...`
9105
+ ),
9106
+ `\\ir ${currentStep.file}`,
9107
+ echo(`[orchestrator] Completed ${currentStep.name}.`),
9108
+ ""
9109
+ );
9110
+ }
9111
+ orchestratorLines.push(
9112
+ ...input2.transactionMode === "single" ? ["commit;", ""] : [],
9113
+ echo("CNPJ DB Loader hybrid PostgreSQL import completed."),
9114
+ ""
9115
+ );
9116
+ scripts["import-postgres-direct.sql"] = orchestratorLines.join("\n");
9117
+ return { scripts, steps };
9118
+ }
9119
+ function generatePostgresDirectImportScript(input2) {
9120
+ const grouped = csvFilesByDataset(input2.files);
9121
+ const lines = [
9122
+ "-- CNPJ DB Loader hybrid PostgreSQL import script",
9123
+ "-- Generated from PostgreSQL-ready CSV files exported by cnpj-db-loader postgres export-csv.",
9124
+ "-- Execute with psql, for example:",
9125
+ '-- psql -d "postgres://postgres:postgres@localhost:5432/cnpj" -f import-postgres-direct.sql',
9126
+ "",
9127
+ "\\set ON_ERROR_STOP on",
9128
+ echo("Starting CNPJ DB Loader hybrid PostgreSQL import..."),
8568
9129
  "",
8569
9130
  "begin;",
8570
9131
  "",
8571
9132
  "-- Keep the final schema and seed data managed by sql/schema.sql.",
8572
- "-- This script copies sanitized Receita files into temporary raw tables,",
8573
- "-- transforms values inside PostgreSQL, resets staging tables and upserts final data.",
9133
+ "-- This script only resets staging tables and then upserts final data.",
8574
9134
  "truncate table staging_companies restart identity;",
8575
9135
  "truncate table staging_establishments restart identity;",
8576
9136
  "truncate table staging_partners restart identity;",
@@ -8578,10 +9138,10 @@ function generatePostgresSanitizedDirectImportScript(input2) {
8578
9138
  ""
8579
9139
  ];
8580
9140
  for (const dataset of DOMAIN_DATASETS) {
8581
- lines.push(...rawDomainSql(dataset, grouped[dataset] ?? []), "");
9141
+ lines.push(...copyDomainSql(dataset, grouped[dataset] ?? []), "");
8582
9142
  }
8583
9143
  for (const dataset of STAGING_DATASETS) {
8584
- lines.push(...rawStagingSql(dataset, grouped[dataset] ?? []), "");
9144
+ lines.push(...copyStagingSql(dataset, grouped[dataset] ?? []), "");
8585
9145
  }
8586
9146
  lines.push(...materializationAndAnalyzeSql());
8587
9147
  return lines.join("\n");
@@ -8592,11 +9152,13 @@ function materializationAndAnalyzeSql() {
8592
9152
  "",
8593
9153
  materializeEstablishmentsSql(),
8594
9154
  "",
9155
+ materializeSecondaryCnaesSql(),
9156
+ "",
8595
9157
  materializePartnersSql(),
8596
9158
  "",
8597
9159
  materializeSimplesSql(),
8598
9160
  "",
8599
- "\\echo 'Refreshing planner statistics...'",
9161
+ echo("Refreshing planner statistics..."),
8600
9162
  "analyze companies;",
8601
9163
  "analyze establishments;",
8602
9164
  "analyze establishment_secondary_cnaes;",
@@ -8611,7 +9173,7 @@ function materializationAndAnalyzeSql() {
8611
9173
  "",
8612
9174
  "commit;",
8613
9175
  "",
8614
- "\\echo 'CNPJ DB Loader hybrid PostgreSQL import completed.'",
9176
+ echo("CNPJ DB Loader hybrid PostgreSQL import completed."),
8615
9177
  ""
8616
9178
  ];
8617
9179
  }
@@ -8817,7 +9379,30 @@ async function exportPostgresCsvDataset(inputPath, options = {}) {
8817
9379
  // src/services/postgres-direct/generator.ts
8818
9380
  import { mkdir as mkdir9, stat as stat7, writeFile as writeFile6 } from "fs/promises";
8819
9381
  import path17 from "path";
8820
- var DEFAULT_SOURCE_ENCODING = "WIN1252";
9382
+ var DEFAULT_SOURCE_ENCODING = "UTF8";
9383
+ var DEFAULT_TRANSACTION_MODE = "single";
9384
+ var ALL_INCLUDE_TARGETS = [
9385
+ "domains",
9386
+ "companies",
9387
+ "establishments",
9388
+ "partners",
9389
+ "simples",
9390
+ "secondary-cnaes",
9391
+ "indexes",
9392
+ "analyze"
9393
+ ];
9394
+ var INCLUDE_TARGETS_BY_DATASET = {
9395
+ companies: "companies",
9396
+ establishments: "establishments",
9397
+ partners: "partners",
9398
+ simples_options: "simples",
9399
+ countries: "domains",
9400
+ cities: "domains",
9401
+ partner_qualifications: "domains",
9402
+ legal_natures: "domains",
9403
+ reasons: "domains",
9404
+ cnaes: "domains"
9405
+ };
8821
9406
  function defaultPostgresDirectOutputPath(inputPath) {
8822
9407
  const baseName = path17.basename(inputPath);
8823
9408
  if (baseName.toLowerCase() === "sanitized") {
@@ -8826,17 +9411,52 @@ function defaultPostgresDirectOutputPath(inputPath) {
8826
9411
  return path17.join(path17.dirname(inputPath), `${baseName}-postgres-direct`);
8827
9412
  }
8828
9413
  function inferNextStep5(scriptPath) {
8829
- return `psql "postgres://postgres:postgres@localhost:5432/cnpj" -f ${scriptPath.replace(/\\/g, "/")}`;
9414
+ return `psql -d "postgres://postgres:postgres@localhost:5432/cnpj" -f ${scriptPath.replace(/\\/g, "/")}`;
8830
9415
  }
8831
9416
  function normalizeSourceEncoding(value) {
8832
9417
  const encoding = (value ?? DEFAULT_SOURCE_ENCODING).trim();
8833
9418
  if (!/^[A-Za-z0-9_-]+$/.test(encoding)) {
8834
9419
  throw new ValidationError(
8835
- `Invalid source encoding: ${value}. Use a PostgreSQL client encoding name such as WIN1252 or UTF8.`
9420
+ `Invalid source encoding: ${value}. Use a PostgreSQL client encoding name such as UTF8, WIN1252 or LATIN1.`
8836
9421
  );
8837
9422
  }
8838
9423
  return encoding.toUpperCase();
8839
9424
  }
9425
+ function normalizeTransactionMode(value) {
9426
+ const mode = value ?? DEFAULT_TRANSACTION_MODE;
9427
+ if (!["single", "phase", "none"].includes(mode)) {
9428
+ throw new ValidationError(
9429
+ `Invalid transaction mode: ${String(value)}. Use single, phase or none.`
9430
+ );
9431
+ }
9432
+ return mode;
9433
+ }
9434
+ function isIncludeTarget(value) {
9435
+ return ALL_INCLUDE_TARGETS.includes(value);
9436
+ }
9437
+ function normalizeIncludeTargets(include, dataset) {
9438
+ if (include && include.length > 0) {
9439
+ const unique = [...new Set(include)];
9440
+ const invalid = unique.filter((item) => !isIncludeTarget(item));
9441
+ if (invalid.length > 0) {
9442
+ throw new ValidationError(
9443
+ `Invalid include target(s): ${invalid.join(", ")}. Use ${ALL_INCLUDE_TARGETS.join(", ")}.`
9444
+ );
9445
+ }
9446
+ return unique;
9447
+ }
9448
+ if (dataset) {
9449
+ const target = INCLUDE_TARGETS_BY_DATASET[dataset];
9450
+ if (!target) {
9451
+ return [];
9452
+ }
9453
+ if (target === "establishments") {
9454
+ return ["establishments", "secondary-cnaes", "analyze"];
9455
+ }
9456
+ return [target, "analyze"];
9457
+ }
9458
+ return [...ALL_INCLUDE_TARGETS];
9459
+ }
8840
9460
  async function generatePostgresDirectScript(inputPath, options = {}) {
8841
9461
  if (options.dataset && !isImportDatasetType(options.dataset)) {
8842
9462
  throw new ValidationError(`Unsupported dataset type: ${options.dataset}.`);
@@ -8852,6 +9472,10 @@ async function generatePostgresDirectScript(inputPath, options = {}) {
8852
9472
  options.outputPath ?? defaultPostgresDirectOutputPath(validatedPath)
8853
9473
  );
8854
9474
  const sourceEncoding = normalizeSourceEncoding(options.sourceEncoding);
9475
+ const transactionMode = normalizeTransactionMode(options.transactionMode);
9476
+ const include = normalizeIncludeTargets(options.include, options.dataset);
9477
+ const skipIndexes = options.skipIndexes ?? false;
9478
+ const skipAnalyze = options.skipAnalyze ?? false;
8855
9479
  const inspected = await inspectFiles(validatedPath);
8856
9480
  const recognizedFiles = inspected.entries.filter((entry) => entry.entryKind === "file").flatMap((entry) => {
8857
9481
  if (!isImportDatasetType(entry.inferredType)) {
@@ -8879,7 +9503,11 @@ async function generatePostgresDirectScript(inputPath, options = {}) {
8879
9503
  outputPath,
8880
9504
  totalFiles: recognizedFiles.length,
8881
9505
  datasets,
8882
- sourceEncoding
9506
+ sourceEncoding,
9507
+ transactionMode,
9508
+ include,
9509
+ skipIndexes,
9510
+ skipAnalyze
8883
9511
  });
8884
9512
  await mkdir9(outputPath, { recursive: true });
8885
9513
  const sourceFiles = [];
@@ -8915,11 +9543,21 @@ async function generatePostgresDirectScript(inputPath, options = {}) {
8915
9543
  }
8916
9544
  const scriptName = options.scriptName ?? "import-postgres-direct.sql";
8917
9545
  const scriptPath = path17.join(outputPath, scriptName);
8918
- const script = generatePostgresSanitizedDirectImportScript({
9546
+ const generated = generatePostgresDirectScriptFiles({
8919
9547
  files: sourceFiles,
8920
- sourceEncoding
9548
+ sourceEncoding,
9549
+ transactionMode,
9550
+ include,
9551
+ skipIndexes,
9552
+ skipAnalyze
8921
9553
  });
8922
- await writeFile6(scriptPath, script, "utf8");
9554
+ const scriptFiles = [];
9555
+ for (const [fileName, script] of Object.entries(generated.scripts)) {
9556
+ const outputFileName = fileName === "import-postgres-direct.sql" ? scriptName : fileName;
9557
+ const outputFilePath = path17.join(outputPath, outputFileName);
9558
+ await writeFile6(outputFilePath, script, "utf8");
9559
+ scriptFiles.push(outputFilePath);
9560
+ }
8923
9561
  const manifestPath = path17.join(outputPath, "manifest.json");
8924
9562
  const summaryDatasets = [...summariesByDataset.values()].sort(
8925
9563
  (left, right) => IMPORT_ORDER.indexOf(left.dataset) - IMPORT_ORDER.indexOf(right.dataset)
@@ -8931,13 +9569,19 @@ async function generatePostgresDirectScript(inputPath, options = {}) {
8931
9569
  const manifest = {
8932
9570
  generatedAt: (/* @__PURE__ */ new Date()).toISOString(),
8933
9571
  mode: "direct-sanitized-script",
9572
+ transactionMode,
9573
+ include,
9574
+ skipIndexes,
9575
+ skipAnalyze,
8934
9576
  inputPath: path17.resolve(inputPath),
8935
9577
  validatedPath,
8936
9578
  outputPath,
8937
9579
  scriptPath,
9580
+ scriptFiles,
8938
9581
  sourceEncoding,
8939
9582
  totalFiles: sourceFiles.length,
8940
9583
  totalBytes,
9584
+ steps: generated.steps,
8941
9585
  datasets: summaryDatasets
8942
9586
  };
8943
9587
  await writeFile6(
@@ -8960,14 +9604,19 @@ async function generatePostgresDirectScript(inputPath, options = {}) {
8960
9604
  scriptPath,
8961
9605
  manifestPath,
8962
9606
  sourceEncoding,
9607
+ transactionMode,
8963
9608
  totalFiles: sourceFiles.length,
8964
9609
  totalBytes,
8965
9610
  datasets: summaryDatasets,
9611
+ scriptFiles,
9612
+ steps: generated.steps,
8966
9613
  warnings: [
8967
9614
  ...validation.ok ? [] : validation.errors,
8968
9615
  "This script imports sanitized Receita files directly with psql \\copy. It avoids rewriting the full dataset into a second CSV tree.",
8969
- "The generated script expects the database schema generated by cnpj-db-loader to be applied before execution.",
8970
- "Use --source-encoding UTF8 only if your sanitized files are already UTF-8. The default WIN1252 matches the usual Receita file encoding."
9616
+ "The generated scripts expect the database schema generated by cnpj-db-loader to be applied before execution.",
9617
+ "The direct PostgreSQL script now defaults to UTF8 because the sanitize command writes clean UTF-8 files.",
9618
+ "Use --source-encoding WIN1252 or LATIN1 only when generating scripts for legacy sanitized files produced by older loader versions.",
9619
+ "The generated import is now modular. Use import-postgres-direct.sql as the orchestrator or run individual phase scripts manually."
8971
9620
  ],
8972
9621
  nextStep: inferNextStep5(scriptPath)
8973
9622
  };
@@ -9271,9 +9920,23 @@ function printSanitizeSummary(summary, logFilePath) {
9271
9920
  console.log(
9272
9921
  formatKeyValue("Processed bytes", formatBytes(summary.totalBytes))
9273
9922
  );
9923
+ console.log(formatKeyValue("Source encoding", summary.sourceEncoding));
9924
+ console.log(formatKeyValue("Output encoding", "UTF8"));
9274
9925
  console.log(
9275
9926
  formatKeyValue("Removed NUL bytes", formatCount(summary.nulBytesRemoved))
9276
9927
  );
9928
+ console.log(
9929
+ formatKeyValue(
9930
+ "Removed invalid bytes",
9931
+ formatCount(summary.invalidBytesRemoved)
9932
+ )
9933
+ );
9934
+ console.log(
9935
+ formatKeyValue(
9936
+ "Removed control chars",
9937
+ formatCount(summary.controlCharsRemoved)
9938
+ )
9939
+ );
9277
9940
  console.log(formatKeyValue("Changed files", summary.changedFiles));
9278
9941
  console.log(formatKeyValue("Unchanged files", summary.unchangedFiles));
9279
9942
  if (summary.datasets.length > 0) {
@@ -9601,6 +10264,16 @@ function printPostgresDirectScriptSummary(summary, logFilePath) {
9601
10264
  console.log(formatKeyValue("Generated script", summary.scriptPath));
9602
10265
  console.log(formatKeyValue("Manifest", summary.manifestPath));
9603
10266
  console.log(formatKeyValue("Source encoding", summary.sourceEncoding));
10267
+ console.log(formatKeyValue("Transaction mode", summary.transactionMode));
10268
+ console.log(
10269
+ formatKeyValue("Generated SQL files", summary.scriptFiles.length)
10270
+ );
10271
+ console.log(
10272
+ formatKeyValue(
10273
+ "Included steps",
10274
+ summary.steps.filter((step2) => step2.included).map((step2) => step2.name).join(", ")
10275
+ )
10276
+ );
9604
10277
  console.log(formatKeyValue("Source files", summary.totalFiles));
9605
10278
  console.log(formatKeyValue("Source bytes", formatBytes(summary.totalBytes)));
9606
10279
  if (summary.datasets.length > 0) {
@@ -10053,8 +10726,9 @@ function createSanitizeProgressReporter() {
10053
10726
  `Validated: ${shortPath(event.validatedPath)}`,
10054
10727
  `Output: ${shortPath(event.outputPath)}`,
10055
10728
  `Datasets: ${event.datasets.join(" > ")}`,
10729
+ `Source encoding: ${event.sourceEncoding} > UTF8`,
10056
10730
  `Files: 0/${formatCount(event.totalFiles)} | Bytes: ${formatBytes(0)} / ${formatBytes(event.totalBytes)}`,
10057
- `Rows counted: ${formatCount(0)} | NUL removed: ${formatCount(0)}`,
10731
+ `Rows: ${formatCount(0)} | NUL: ${formatCount(0)} | Invalid bytes: ${formatCount(0)} | Controls: ${formatCount(0)}`,
10058
10732
  `Current: waiting...`
10059
10733
  ];
10060
10734
  renderBlock([
@@ -10070,8 +10744,9 @@ function createSanitizeProgressReporter() {
10070
10744
  currentLines[1] ?? "",
10071
10745
  currentLines[2] ?? "",
10072
10746
  currentLines[3] ?? "",
10747
+ currentLines[4] ?? "",
10073
10748
  `Files: ${formatCount(event.fileIndex)}/${formatCount(event.totalFiles)} | Bytes: ${formatBytes(event.bytesProcessed)} / ${formatBytes(event.totalBytes)}`,
10074
- `Rows counted: ${formatCount(event.processedRows)} | NUL removed: ${formatCount(event.nulBytesRemoved)} | Changed files: ${formatCount(event.changedFiles)}`,
10749
+ `Rows: ${formatCount(event.processedRows)} | NUL: ${formatCount(event.nulBytesRemoved)} | Invalid bytes: ${formatCount(event.invalidBytesRemoved)} | Controls: ${formatCount(event.controlCharsRemoved)} | Changed: ${formatCount(event.changedFiles)}`,
10075
10750
  `Current: ${shortPath(event.currentFileDisplayPath)}`
10076
10751
  ];
10077
10752
  renderBlock([
@@ -10091,6 +10766,18 @@ function createSanitizeProgressReporter() {
10091
10766
  console.log(
10092
10767
  formatKeyValue("Removed NUL bytes", formatCount(event.nulBytesRemoved))
10093
10768
  );
10769
+ console.log(
10770
+ formatKeyValue(
10771
+ "Removed invalid bytes",
10772
+ formatCount(event.invalidBytesRemoved)
10773
+ )
10774
+ );
10775
+ console.log(
10776
+ formatKeyValue(
10777
+ "Removed control chars",
10778
+ formatCount(event.controlCharsRemoved)
10779
+ )
10780
+ );
10094
10781
  console.log(
10095
10782
  formatKeyValue("Changed files", formatCount(event.changedFiles))
10096
10783
  );
@@ -10277,6 +10964,14 @@ function createPostgresDirectScriptProgressReporter() {
10277
10964
  console.log(formatKeyValue("Validated path", event.validatedPath));
10278
10965
  console.log(formatKeyValue("Output path", event.outputPath));
10279
10966
  console.log(formatKeyValue("Source encoding", event.sourceEncoding));
10967
+ console.log(formatKeyValue("Transaction mode", event.transactionMode));
10968
+ console.log(formatKeyValue("Included steps", event.include.join(", ")));
10969
+ console.log(
10970
+ formatKeyValue("Skip indexes", event.skipIndexes ? "yes" : "no")
10971
+ );
10972
+ console.log(
10973
+ formatKeyValue("Skip analyze", event.skipAnalyze ? "yes" : "no")
10974
+ );
10280
10975
  console.log(formatKeyValue("Files queued", event.totalFiles));
10281
10976
  return;
10282
10977
  }
@@ -11203,8 +11898,14 @@ function registerPostgresCommands(program) {
11203
11898
  "Generated psql script file name. Defaults to import-postgres-direct.sql."
11204
11899
  ).option(
11205
11900
  "--source-encoding <encoding>",
11206
- "PostgreSQL client encoding used while reading sanitized Receita files. Defaults to WIN1252."
11207
- ).option("-f, --force", "Skip the confirmation prompt.").description(
11901
+ "PostgreSQL client encoding used while reading sanitized Receita files. Defaults to UTF8."
11902
+ ).option(
11903
+ "--transaction-mode <mode>",
11904
+ "Transaction mode for generated scripts: single, phase or none. Defaults to single."
11905
+ ).option(
11906
+ "--include <items>",
11907
+ "Comma-separated steps to include: domains,companies,establishments,partners,simples,secondary-cnaes,indexes,analyze."
11908
+ ).option("--skip-indexes", "Do not generate the indexes step.").option("--skip-analyze", "Do not generate the analyze step.").option("-f, --force", "Skip the confirmation prompt.").description(
11208
11909
  "Generate a direct psql import script that loads sanitized Receita files without rewriting them into new CSV files."
11209
11910
  ).action(
11210
11911
  async (input2, options) => {
@@ -11233,6 +11934,18 @@ function registerPostgresCommands(program) {
11233
11934
  if (options.sourceEncoding) {
11234
11935
  generateOptions.sourceEncoding = options.sourceEncoding;
11235
11936
  }
11937
+ if (options.transactionMode) {
11938
+ generateOptions.transactionMode = options.transactionMode;
11939
+ }
11940
+ if (options.include) {
11941
+ generateOptions.include = options.include.split(",").map((item) => item.trim()).filter(Boolean);
11942
+ }
11943
+ if (options.skipIndexes) {
11944
+ generateOptions.skipIndexes = true;
11945
+ }
11946
+ if (options.skipAnalyze) {
11947
+ generateOptions.skipAnalyze = true;
11948
+ }
11236
11949
  const summary = await generatePostgresDirectScript(
11237
11950
  input2,
11238
11951
  generateOptions
@@ -11353,6 +12066,9 @@ function registerSanitizeCommands(program) {
11353
12066
  ).option(
11354
12067
  "--dataset <dataset>",
11355
12068
  "Sanitize only one validated dataset block (for example: establishments or companies)."
12069
+ ).option(
12070
+ "--source-encoding <encoding>",
12071
+ "Source file encoding used while reading Receita files. Defaults to WIN1252 and writes clean UTF-8 output."
11356
12072
  ).option("-f, --force", "Skip the confirmation prompt.").description(
11357
12073
  "Prepare a sanitized dataset tree before import by removing known low-level byte issues such as NUL bytes."
11358
12074
  ).action(
@@ -11376,6 +12092,9 @@ function registerSanitizeCommands(program) {
11376
12092
  if (options.dataset) {
11377
12093
  sanitizeOptions.dataset = options.dataset;
11378
12094
  }
12095
+ if (options.sourceEncoding) {
12096
+ sanitizeOptions.sourceEncoding = options.sourceEncoding;
12097
+ }
11379
12098
  const summary = await sanitizeInputDirectory(input2, sanitizeOptions);
11380
12099
  const logFilePath = await writeCommandLog("sanitize", summary);
11381
12100
  printSanitizeSummary(summary, logFilePath);