@danielarndt0/cnpj-db-loader 2.4.0-beta.1 → 2.4.0-beta.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -7853,81 +7853,264 @@ function isRecognizedSanitizeEntry(entry) {
7853
7853
  return entry.entryKind === "file" && entry.inferredType !== "zip-archive" && entry.inferredType !== "unknown";
7854
7854
  }
7855
7855
 
7856
+ // src/services/sanitize/encoding.ts
7857
+ import { StringDecoder } from "string_decoder";
7858
+ var WINDOWS_1252_C1_MAP = {
7859
+ 128: "\u20AC",
7860
+ 130: "\u201A",
7861
+ 131: "\u0192",
7862
+ 132: "\u201E",
7863
+ 133: "\u2026",
7864
+ 134: "\u2020",
7865
+ 135: "\u2021",
7866
+ 136: "\u02C6",
7867
+ 137: "\u2030",
7868
+ 138: "\u0160",
7869
+ 139: "\u2039",
7870
+ 140: "\u0152",
7871
+ 142: "\u017D",
7872
+ 145: "\u2018",
7873
+ 146: "\u2019",
7874
+ 147: "\u201C",
7875
+ 148: "\u201D",
7876
+ 149: "\u2022",
7877
+ 150: "\u2013",
7878
+ 151: "\u2014",
7879
+ 152: "\u02DC",
7880
+ 153: "\u2122",
7881
+ 154: "\u0161",
7882
+ 155: "\u203A",
7883
+ 156: "\u0153",
7884
+ 158: "\u017E",
7885
+ 159: "\u0178"
7886
+ };
7887
+ function normalizeSanitizeSourceEncoding(value) {
7888
+ const normalized = (value ?? "WIN1252").trim().toUpperCase().replace(/_/g, "-");
7889
+ switch (normalized) {
7890
+ case "WIN1252":
7891
+ case "WINDOWS-1252":
7892
+ case "CP1252":
7893
+ return "WIN1252";
7894
+ case "LATIN1":
7895
+ case "LATIN-1":
7896
+ case "ISO-8859-1":
7897
+ case "ISO8859-1":
7898
+ return "LATIN1";
7899
+ case "UTF8":
7900
+ case "UTF-8":
7901
+ return "UTF8";
7902
+ default:
7903
+ throw new ValidationError(
7904
+ `Unsupported sanitize source encoding: ${value}. Supported values: WIN1252, LATIN1, UTF8.`
7905
+ );
7906
+ }
7907
+ }
7908
+ function isAllowedControlCodePoint(codePoint) {
7909
+ return codePoint === 9 || codePoint === 10 || codePoint === 13;
7910
+ }
7911
+ function isProblematicControlCodePoint(codePoint) {
7912
+ if (isAllowedControlCodePoint(codePoint)) {
7913
+ return false;
7914
+ }
7915
+ return codePoint >= 0 && codePoint <= 31 || codePoint === 127 || codePoint >= 128 && codePoint <= 159 || codePoint === 65279;
7916
+ }
7917
+ function sanitizeDecodedText(text) {
7918
+ const output = [];
7919
+ let invalidBytesRemoved = 0;
7920
+ let controlCharsRemoved = 0;
7921
+ for (const char of text) {
7922
+ const codePoint = char.codePointAt(0);
7923
+ if (codePoint === 65533) {
7924
+ invalidBytesRemoved += 1;
7925
+ continue;
7926
+ }
7927
+ if (isProblematicControlCodePoint(codePoint)) {
7928
+ controlCharsRemoved += 1;
7929
+ continue;
7930
+ }
7931
+ output.push(char);
7932
+ }
7933
+ return {
7934
+ text: output.join(""),
7935
+ invalidBytesRemoved,
7936
+ controlCharsRemoved
7937
+ };
7938
+ }
7939
+ var SanitizeEncodingNormalizer = class {
7940
+ constructor(sourceEncoding) {
7941
+ this.sourceEncoding = sourceEncoding;
7942
+ this.utf8Decoder = sourceEncoding === "UTF8" ? new StringDecoder("utf8") : void 0;
7943
+ }
7944
+ sourceEncoding;
7945
+ utf8Decoder;
7946
+ normalizeChunk(chunk) {
7947
+ if (this.sourceEncoding === "UTF8") {
7948
+ const decoded = this.utf8Decoder.write(chunk);
7949
+ const sanitized = sanitizeDecodedText(decoded);
7950
+ const nulBytesRemoved = [...decoded].filter(
7951
+ (char) => char === "\0"
7952
+ ).length;
7953
+ return {
7954
+ ...sanitized,
7955
+ nulBytesRemoved
7956
+ };
7957
+ }
7958
+ return this.normalizeSingleByteChunk(chunk);
7959
+ }
7960
+ flush() {
7961
+ if (!this.utf8Decoder) {
7962
+ return {
7963
+ text: "",
7964
+ nulBytesRemoved: 0,
7965
+ invalidBytesRemoved: 0,
7966
+ controlCharsRemoved: 0
7967
+ };
7968
+ }
7969
+ const decoded = this.utf8Decoder.end();
7970
+ const sanitized = sanitizeDecodedText(decoded);
7971
+ const nulBytesRemoved = [...decoded].filter((char) => char === "\0").length;
7972
+ return {
7973
+ ...sanitized,
7974
+ nulBytesRemoved
7975
+ };
7976
+ }
7977
+ normalizeSingleByteChunk(chunk) {
7978
+ const output = [];
7979
+ let nulBytesRemoved = 0;
7980
+ let invalidBytesRemoved = 0;
7981
+ let controlCharsRemoved = 0;
7982
+ for (const byte of chunk) {
7983
+ if (byte === 0) {
7984
+ nulBytesRemoved += 1;
7985
+ continue;
7986
+ }
7987
+ if (byte < 32 || byte === 127) {
7988
+ if (isAllowedControlCodePoint(byte)) {
7989
+ output.push(String.fromCharCode(byte));
7990
+ } else {
7991
+ controlCharsRemoved += 1;
7992
+ }
7993
+ continue;
7994
+ }
7995
+ if (byte >= 128 && byte <= 159) {
7996
+ if (this.sourceEncoding === "WIN1252") {
7997
+ const mapped = WINDOWS_1252_C1_MAP[byte];
7998
+ if (mapped === void 0) {
7999
+ invalidBytesRemoved += 1;
8000
+ } else {
8001
+ output.push(mapped);
8002
+ }
8003
+ } else {
8004
+ controlCharsRemoved += 1;
8005
+ }
8006
+ continue;
8007
+ }
8008
+ output.push(String.fromCharCode(byte));
8009
+ }
8010
+ return {
8011
+ text: output.join(""),
8012
+ nulBytesRemoved,
8013
+ invalidBytesRemoved,
8014
+ controlCharsRemoved
8015
+ };
8016
+ }
8017
+ };
8018
+
7856
8019
  // src/services/sanitize/runner.ts
7857
8020
  import { createReadStream as createReadStream2, createWriteStream as createWriteStream2 } from "fs";
7858
8021
  import { mkdir as mkdir7 } from "fs/promises";
7859
8022
  import path13 from "path";
7860
- function stripNulBytes(chunk) {
7861
- let removed = 0;
7862
- for (let index = 0; index < chunk.length; index += 1) {
7863
- if (chunk[index] === 0) {
7864
- removed += 1;
7865
- }
8023
+ async function writeUtf8(output, value) {
8024
+ if (value.length === 0) {
8025
+ return;
7866
8026
  }
7867
- if (removed === 0) {
7868
- return { buffer: chunk, removed: 0 };
8027
+ if (!output.write(value, "utf8")) {
8028
+ await new Promise((resolve, reject) => {
8029
+ output.once("drain", resolve);
8030
+ output.once("error", reject);
8031
+ });
7869
8032
  }
7870
- const sanitized = Buffer.allocUnsafe(chunk.length - removed);
7871
- let outputIndex = 0;
7872
- for (let index = 0; index < chunk.length; index += 1) {
7873
- const value = chunk[index];
7874
- if (value !== 0) {
7875
- sanitized[outputIndex] = value;
7876
- outputIndex += 1;
8033
+ }
8034
+ function countNewlines(value) {
8035
+ let count = 0;
8036
+ for (let index = 0; index < value.length; index += 1) {
8037
+ if (value[index] === "\n") {
8038
+ count += 1;
7877
8039
  }
7878
8040
  }
7879
- return { buffer: sanitized, removed };
8041
+ return count;
7880
8042
  }
7881
- async function sanitizeDatasetFile(plan, onChunk) {
8043
+ async function sanitizeDatasetFile(plan, onChunk, options = {}) {
7882
8044
  await mkdir7(path13.dirname(plan.outputPath), { recursive: true });
8045
+ const sourceEncoding = normalizeSanitizeSourceEncoding(
8046
+ options.sourceEncoding
8047
+ );
8048
+ const normalizer = new SanitizeEncodingNormalizer(sourceEncoding);
7883
8049
  const input = createReadStream2(plan.absolutePath);
7884
- const output = createWriteStream2(plan.outputPath);
8050
+ const output = createWriteStream2(plan.outputPath, { encoding: "utf8" });
7885
8051
  let totalBytesRead = 0;
7886
8052
  let totalBytesWritten = 0;
7887
8053
  let nulBytesRemoved = 0;
8054
+ let invalidBytesRemoved = 0;
8055
+ let controlCharsRemoved = 0;
7888
8056
  let lineCount = 0;
7889
- let sawAnyByte = false;
7890
- let lastByteWasNewline = false;
8057
+ let sawAnyCharacter = false;
8058
+ let lastCharacterWasNewline = false;
8059
+ const processText = async (text) => {
8060
+ if (text.length === 0) {
8061
+ return;
8062
+ }
8063
+ sawAnyCharacter = true;
8064
+ lineCount += countNewlines(text);
8065
+ lastCharacterWasNewline = text.endsWith("\n");
8066
+ totalBytesWritten += Buffer.byteLength(text, "utf8");
8067
+ await writeUtf8(output, text);
8068
+ };
7891
8069
  try {
7892
8070
  for await (const chunk of input) {
7893
8071
  const chunkBuffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
7894
8072
  totalBytesRead += chunkBuffer.length;
7895
- const { buffer, removed } = stripNulBytes(chunkBuffer);
7896
- nulBytesRemoved += removed;
7897
- sawAnyByte = sawAnyByte || buffer.length > 0;
7898
- for (let index = 0; index < buffer.length; index += 1) {
7899
- if (buffer[index] === 10) {
7900
- lineCount += 1;
7901
- }
7902
- }
7903
- if (buffer.length > 0) {
7904
- lastByteWasNewline = buffer[buffer.length - 1] === 10;
7905
- }
7906
- totalBytesWritten += buffer.length;
7907
- output.write(buffer);
8073
+ const normalized = normalizer.normalizeChunk(chunkBuffer);
8074
+ nulBytesRemoved += normalized.nulBytesRemoved;
8075
+ invalidBytesRemoved += normalized.invalidBytesRemoved;
8076
+ controlCharsRemoved += normalized.controlCharsRemoved;
8077
+ await processText(normalized.text);
7908
8078
  onChunk?.({
7909
8079
  bytesProcessed: chunkBuffer.length,
7910
8080
  fileBytesProcessed: totalBytesRead,
7911
8081
  currentFileSize: plan.fileSize,
7912
8082
  processedRows: lineCount,
7913
- nulBytesRemoved
8083
+ nulBytesRemoved,
8084
+ invalidBytesRemoved,
8085
+ controlCharsRemoved
7914
8086
  });
7915
8087
  }
7916
- if (sawAnyByte && !lastByteWasNewline) {
8088
+ const flushed = normalizer.flush();
8089
+ nulBytesRemoved += flushed.nulBytesRemoved;
8090
+ invalidBytesRemoved += flushed.invalidBytesRemoved;
8091
+ controlCharsRemoved += flushed.controlCharsRemoved;
8092
+ await processText(flushed.text);
8093
+ if (sawAnyCharacter && !lastCharacterWasNewline) {
7917
8094
  lineCount += 1;
7918
8095
  }
7919
8096
  } finally {
7920
8097
  input.close();
7921
8098
  output.end();
7922
- await new Promise((resolve) => output.on("finish", () => resolve()));
8099
+ await new Promise((resolve, reject) => {
8100
+ output.on("finish", () => resolve());
8101
+ output.on("error", (error) => reject(error));
8102
+ });
7923
8103
  }
7924
8104
  return {
7925
8105
  plan,
7926
8106
  totalBytesRead,
7927
8107
  totalBytesWritten,
8108
+ sourceEncoding,
7928
8109
  nulBytesRemoved,
8110
+ invalidBytesRemoved,
8111
+ controlCharsRemoved,
7929
8112
  lineCount,
7930
- changed: nulBytesRemoved > 0 || totalBytesRead !== totalBytesWritten
8113
+ changed: nulBytesRemoved > 0 || invalidBytesRemoved > 0 || controlCharsRemoved > 0 || totalBytesRead !== totalBytesWritten
7931
8114
  };
7932
8115
  }
7933
8116
 
@@ -7990,40 +8173,54 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
7990
8173
  "No recognized validated dataset files were found for sanitization."
7991
8174
  );
7992
8175
  }
8176
+ const sourceEncoding = normalizeSanitizeSourceEncoding(
8177
+ options.sourceEncoding
8178
+ );
7993
8179
  options.onProgress?.({
7994
8180
  kind: "start",
7995
8181
  validatedPath,
7996
8182
  outputPath,
7997
8183
  totalFiles: plan.totalFiles,
7998
8184
  totalBytes: plan.totalBytes,
7999
- datasets: plan.datasets
8185
+ datasets: plan.datasets,
8186
+ sourceEncoding
8000
8187
  });
8001
8188
  let processedFiles = 0;
8002
8189
  let processedRows = 0;
8003
8190
  let processedBytes = 0;
8004
8191
  let nulBytesRemoved = 0;
8192
+ let invalidBytesRemoved = 0;
8193
+ let controlCharsRemoved = 0;
8005
8194
  let changedFiles = 0;
8006
8195
  const fileSummaries = [];
8007
8196
  for (const [index, filePlan] of plan.files.entries()) {
8008
- const fileResult = await sanitizeDatasetFile(filePlan, (chunk) => {
8009
- options.onProgress?.({
8010
- kind: "progress",
8011
- currentFileDisplayPath: filePlan.displayPath,
8012
- fileIndex: index + 1,
8013
- totalFiles: plan.totalFiles,
8014
- bytesProcessed: processedBytes + chunk.fileBytesProcessed,
8015
- totalBytes: plan.totalBytes,
8016
- fileBytesProcessed: chunk.fileBytesProcessed,
8017
- currentFileSize: chunk.currentFileSize,
8018
- processedRows: processedRows + chunk.processedRows,
8019
- nulBytesRemoved: nulBytesRemoved + chunk.nulBytesRemoved,
8020
- changedFiles
8021
- });
8022
- });
8197
+ const fileResult = await sanitizeDatasetFile(
8198
+ filePlan,
8199
+ (chunk) => {
8200
+ options.onProgress?.({
8201
+ kind: "progress",
8202
+ currentFileDisplayPath: filePlan.displayPath,
8203
+ fileIndex: index + 1,
8204
+ totalFiles: plan.totalFiles,
8205
+ bytesProcessed: processedBytes + chunk.fileBytesProcessed,
8206
+ totalBytes: plan.totalBytes,
8207
+ fileBytesProcessed: chunk.fileBytesProcessed,
8208
+ currentFileSize: chunk.currentFileSize,
8209
+ processedRows: processedRows + chunk.processedRows,
8210
+ nulBytesRemoved: nulBytesRemoved + chunk.nulBytesRemoved,
8211
+ invalidBytesRemoved: invalidBytesRemoved + chunk.invalidBytesRemoved,
8212
+ controlCharsRemoved: controlCharsRemoved + chunk.controlCharsRemoved,
8213
+ changedFiles
8214
+ });
8215
+ },
8216
+ { sourceEncoding }
8217
+ );
8023
8218
  processedFiles += 1;
8024
8219
  processedRows += fileResult.lineCount;
8025
8220
  processedBytes += fileResult.totalBytesRead;
8026
8221
  nulBytesRemoved += fileResult.nulBytesRemoved;
8222
+ invalidBytesRemoved += fileResult.invalidBytesRemoved;
8223
+ controlCharsRemoved += fileResult.controlCharsRemoved;
8027
8224
  changedFiles += fileResult.changed ? 1 : 0;
8028
8225
  fileSummaries.push({
8029
8226
  dataset: filePlan.dataset,
@@ -8031,7 +8228,9 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
8031
8228
  outputPath: filePlan.outputPath,
8032
8229
  lineCount: fileResult.lineCount,
8033
8230
  changed: fileResult.changed,
8034
- nulBytesRemoved: fileResult.nulBytesRemoved
8231
+ nulBytesRemoved: fileResult.nulBytesRemoved,
8232
+ invalidBytesRemoved: fileResult.invalidBytesRemoved,
8233
+ controlCharsRemoved: fileResult.controlCharsRemoved
8035
8234
  });
8036
8235
  }
8037
8236
  options.onProgress?.({
@@ -8039,6 +8238,8 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
8039
8238
  totalFiles: plan.totalFiles,
8040
8239
  processedRows,
8041
8240
  nulBytesRemoved,
8241
+ invalidBytesRemoved,
8242
+ controlCharsRemoved,
8042
8243
  changedFiles,
8043
8244
  totalBytes: plan.totalBytes
8044
8245
  });
@@ -8050,13 +8251,17 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
8050
8251
  totalBytes: plan.totalBytes,
8051
8252
  processedFiles,
8052
8253
  processedRows,
8254
+ sourceEncoding,
8053
8255
  nulBytesRemoved,
8256
+ invalidBytesRemoved,
8257
+ controlCharsRemoved,
8054
8258
  changedFiles,
8055
8259
  unchangedFiles: plan.totalFiles - changedFiles,
8056
8260
  datasets: plan.datasets,
8057
8261
  files: fileSummaries,
8058
8262
  warnings: [
8059
- "Sanitization prepares a clean dataset tree for import by removing known low-level byte issues such as NUL bytes before PostgreSQL loading begins.",
8263
+ "Sanitization now writes UTF-8 output and removes invalid bytes plus problematic control characters before PostgreSQL loading begins.",
8264
+ "The PostgreSQL direct import path can use --source-encoding UTF8 when reading files generated by this sanitization command.",
8060
8265
  "The import command still keeps quarantine and row-level recovery for unexpected issues, but sanitizing first reduces the amount of slow fallback work during import."
8061
8266
  ],
8062
8267
  nextStep: inferNextStep3(outputPath)
@@ -8206,6 +8411,18 @@ var STAGING_TABLE_BY_DATASET3 = {
8206
8411
  partners: "staging_partners",
8207
8412
  simples_options: "staging_simples_options"
8208
8413
  };
8414
+ var STEP_ORDER = [
8415
+ "setup",
8416
+ "load-domains",
8417
+ "load-companies",
8418
+ "load-establishments",
8419
+ "load-partners",
8420
+ "load-simples",
8421
+ "materialize",
8422
+ "materialize-secondary-cnaes",
8423
+ "indexes",
8424
+ "analyze"
8425
+ ];
8209
8426
  function quoteSqlLiteral(value) {
8210
8427
  return `'${value.replace(/'/g, "''")}'`;
8211
8428
  }
@@ -8223,6 +8440,9 @@ function receitaCopyCommand(tableName, columns, filePath) {
8223
8440
  const normalizedFilePath = normalizePathForPsql(filePath);
8224
8441
  return `\\copy ${tableName} (${columns.join(", ")}) from ${quoteSqlLiteral(normalizedFilePath)} with (format csv, header false, delimiter ';', quote '"', escape '"')`;
8225
8442
  }
8443
+ function echo(message) {
8444
+ return `\\echo ${quoteSqlLiteral(message)}`;
8445
+ }
8226
8446
  function datasetColumns(dataset) {
8227
8447
  return DATASET_LAYOUTS[dataset].fields.map((field) => field.columnName);
8228
8448
  }
@@ -8249,7 +8469,7 @@ function partnerDedupeExpression(alias) {
8249
8469
  function materializeCompaniesSql() {
8250
8470
  const columns = companiesLayout.fields.map((field) => field.columnName);
8251
8471
  return [
8252
- "\\echo 'Materializing companies...'",
8472
+ echo("[materialize] Materializing companies..."),
8253
8473
  "with source as (",
8254
8474
  " select",
8255
8475
  ` ${columns.map((column) => `source.${column}`).join(",\n ")},`,
@@ -8263,7 +8483,8 @@ function materializeCompaniesSql() {
8263
8483
  `select ${columns.join(", ")}`,
8264
8484
  "from deduped",
8265
8485
  "on conflict (cnpj_root) do update set",
8266
- ` ${updateAssignments(columns, ["cnpj_root"])};`
8486
+ ` ${updateAssignments(columns, ["cnpj_root"])};`,
8487
+ echo("[materialize] Companies materialization completed.")
8267
8488
  ].join("\n");
8268
8489
  }
8269
8490
  function materializeEstablishmentsSql() {
@@ -8272,7 +8493,7 @@ function materializeEstablishmentsSql() {
8272
8493
  );
8273
8494
  const insertColumns = [...baseColumns, "cnpj_full"];
8274
8495
  return [
8275
- "\\echo 'Materializing establishments and secondary CNAEs...'",
8496
+ echo("[materialize] Materializing establishments..."),
8276
8497
  "with source as (",
8277
8498
  " select",
8278
8499
  ` ${baseColumns.map((column) => `source.${column}`).join(",\n ")},`,
@@ -8282,14 +8503,29 @@ function materializeEstablishmentsSql() {
8282
8503
  "),",
8283
8504
  "deduped as (",
8284
8505
  " select * from source where dedupe_rank = 1",
8506
+ ")",
8507
+ `insert into establishments (${insertColumns.join(", ")})`,
8508
+ `select ${insertColumns.join(", ")}`,
8509
+ "from deduped",
8510
+ "on conflict (cnpj_full) do update set",
8511
+ ` ${updateAssignments(insertColumns, ["cnpj_root", "cnpj_order", "cnpj_check_digits", "cnpj_full"])};`,
8512
+ echo("[materialize] Establishments materialization completed.")
8513
+ ].join("\n");
8514
+ }
8515
+ function materializeSecondaryCnaesSql() {
8516
+ return [
8517
+ echo(
8518
+ "[materialize-secondary-cnaes] Materializing establishment secondary CNAEs..."
8519
+ ),
8520
+ "with source as (",
8521
+ " select",
8522
+ " staging.cnpj_root || staging.cnpj_order || staging.cnpj_check_digits as cnpj_full,",
8523
+ " staging.secondary_cnaes_raw,",
8524
+ " row_number() over (partition by staging.cnpj_root || staging.cnpj_order || staging.cnpj_check_digits order by staging.staging_id desc) as dedupe_rank",
8525
+ " from staging_establishments staging",
8285
8526
  "),",
8286
- "upserted as (",
8287
- ` insert into establishments (${insertColumns.join(", ")})`,
8288
- ` select ${insertColumns.join(", ")}`,
8289
- " from deduped",
8290
- " on conflict (cnpj_full) do update set",
8291
- ` ${updateAssignments(insertColumns, ["cnpj_root", "cnpj_order", "cnpj_check_digits", "cnpj_full"])}`,
8292
- " returning cnpj_full",
8527
+ "deduped as (",
8528
+ " select * from source where dedupe_rank = 1",
8293
8529
  "),",
8294
8530
  "deleted_secondary_cnaes as (",
8295
8531
  " delete from establishment_secondary_cnaes target",
@@ -8310,14 +8546,17 @@ function materializeEstablishmentsSql() {
8310
8546
  "insert into establishment_secondary_cnaes (cnpj_full, cnae_code)",
8311
8547
  "select cnpj_full, cnae_code",
8312
8548
  "from secondary_cnaes_source",
8313
- "on conflict (cnpj_full, cnae_code) do nothing;"
8549
+ "on conflict (cnpj_full, cnae_code) do nothing;",
8550
+ echo(
8551
+ "[materialize-secondary-cnaes] Secondary CNAEs materialization completed."
8552
+ )
8314
8553
  ].join("\n");
8315
8554
  }
8316
8555
  function materializePartnersSql() {
8317
8556
  const baseColumns = partnersLayout.fields.map((field) => field.columnName);
8318
8557
  const insertColumns = [...baseColumns, "partner_dedupe_key"];
8319
8558
  return [
8320
- "\\echo 'Materializing partners...'",
8559
+ echo("[materialize] Materializing partners..."),
8321
8560
  "with source as (",
8322
8561
  " select",
8323
8562
  ` ${baseColumns.map((column) => `source.${column}`).join(",\n ")},`,
@@ -8337,13 +8576,14 @@ function materializePartnersSql() {
8337
8576
  `select ${insertColumns.join(", ")}`,
8338
8577
  "from deduped",
8339
8578
  "on conflict (partner_dedupe_key) do update set",
8340
- ` ${updateAssignments(insertColumns, ["partner_dedupe_key"])};`
8579
+ ` ${updateAssignments(insertColumns, ["partner_dedupe_key"])};`,
8580
+ echo("[materialize] Partners materialization completed.")
8341
8581
  ].join("\n");
8342
8582
  }
8343
8583
  function materializeSimplesSql() {
8344
8584
  const columns = simplesLayout.fields.map((field) => field.columnName);
8345
8585
  return [
8346
- "\\echo 'Materializing simples options...'",
8586
+ echo("[materialize] Materializing simples options..."),
8347
8587
  "with source as (",
8348
8588
  " select",
8349
8589
  ` ${columns.map((column) => `source.${column}`).join(",\n ")},`,
@@ -8357,7 +8597,8 @@ function materializeSimplesSql() {
8357
8597
  `select ${columns.join(", ")}`,
8358
8598
  "from deduped",
8359
8599
  "on conflict (cnpj_root) do update set",
8360
- ` ${updateAssignments(columns, ["cnpj_root"])};`
8600
+ ` ${updateAssignments(columns, ["cnpj_root"])};`,
8601
+ echo("[materialize] Simples options materialization completed.")
8361
8602
  ].join("\n");
8362
8603
  }
8363
8604
  function copyDomainSql(dataset, files) {
@@ -8367,12 +8608,20 @@ function copyDomainSql(dataset, files) {
8367
8608
  const columns = datasetColumns(dataset);
8368
8609
  const tempTable = `tmp_hybrid_${dataset}`;
8369
8610
  const lines = [
8370
- `\\echo 'Loading ${dataset} lookup data...'`,
8611
+ echo(`[load-domains] Loading ${dataset} lookup data...`),
8371
8612
  `drop table if exists ${tempTable};`,
8372
8613
  `create temporary table ${tempTable} (code text, description text);`
8373
8614
  ];
8374
- for (const file of files) {
8375
- lines.push(csvCopyCommand(tempTable, columns, file.absolutePath));
8615
+ for (const [index, file] of files.entries()) {
8616
+ lines.push(
8617
+ echo(
8618
+ `[load-domains] Loading ${dataset} file ${index + 1} of ${files.length}: ${file.relativePath}`
8619
+ ),
8620
+ csvCopyCommand(tempTable, columns, file.absolutePath),
8621
+ echo(
8622
+ `[load-domains] Loaded ${dataset} file ${index + 1} of ${files.length}.`
8623
+ )
8624
+ );
8376
8625
  }
8377
8626
  lines.push(
8378
8627
  `insert into ${dataset} (${columns.join(", ")})`,
@@ -8393,12 +8642,17 @@ function copyStagingSql(dataset, files) {
8393
8642
  return [];
8394
8643
  }
8395
8644
  const columns = datasetColumns(dataset);
8396
- return [
8397
- `\\echo 'Loading ${dataset} staging data...'`,
8398
- ...files.map(
8399
- (file) => csvCopyCommand(tableName, columns, file.absolutePath)
8400
- )
8401
- ];
8645
+ const lines = [echo(`[load-${dataset}] Loading ${dataset} staging data...`)];
8646
+ for (const [index, file] of files.entries()) {
8647
+ lines.push(
8648
+ echo(
8649
+ `[load-${dataset}] Loading file ${index + 1} of ${files.length}: ${file.relativePath}`
8650
+ ),
8651
+ csvCopyCommand(tableName, columns, file.absolutePath),
8652
+ echo(`[load-${dataset}] Loaded file ${index + 1} of ${files.length}.`)
8653
+ );
8654
+ }
8655
+ return lines;
8402
8656
  }
8403
8657
  function csvFilesByDataset(files) {
8404
8658
  const grouped = {};
@@ -8424,7 +8678,9 @@ function rawTableName(dataset) {
8424
8678
  function createRawTempTableSql(dataset) {
8425
8679
  const columns = DATASET_LAYOUTS[dataset].fields.map((field) => ` ${quoteIdentifier(field.columnName)} text`).join(",\n");
8426
8680
  return [
8681
+ "set client_min_messages to warning;",
8427
8682
  `drop table if exists ${rawTableName(dataset)};`,
8683
+ "reset client_min_messages;",
8428
8684
  `create temporary table ${rawTableName(dataset)} (`,
8429
8685
  columns,
8430
8686
  ");"
@@ -8506,11 +8762,21 @@ function rawDomainSql(dataset, files) {
8506
8762
  const columns = layout.fields.map((field) => field.columnName);
8507
8763
  const tableName = rawTableName(dataset);
8508
8764
  const lines = [
8509
- `\\echo 'Loading ${dataset} lookup data directly from sanitized Receita files...'`,
8765
+ echo(
8766
+ `[load-domains] Loading ${dataset} lookup data directly from sanitized Receita files...`
8767
+ ),
8510
8768
  createRawTempTableSql(dataset)
8511
8769
  ];
8512
- for (const file of files) {
8513
- lines.push(receitaCopyCommand(tableName, columns, file.absolutePath));
8770
+ for (const [index, file] of files.entries()) {
8771
+ lines.push(
8772
+ echo(
8773
+ `[load-domains] Loading ${dataset} file ${index + 1} of ${files.length}: ${file.relativePath}`
8774
+ ),
8775
+ receitaCopyCommand(tableName, columns, file.absolutePath),
8776
+ echo(
8777
+ `[load-domains] Loaded ${dataset} file ${index + 1} of ${files.length}.`
8778
+ )
8779
+ );
8514
8780
  }
8515
8781
  lines.push(
8516
8782
  `insert into ${dataset} (${columns.join(", ")})`,
@@ -8520,7 +8786,8 @@ function rawDomainSql(dataset, files) {
8520
8786
  `from ${tableName}`,
8521
8787
  "where nullif(btrim(code), '') is not null",
8522
8788
  "order by code",
8523
- "on conflict (code) do update set description = excluded.description;"
8789
+ "on conflict (code) do update set description = excluded.description;",
8790
+ echo(`[load-domains] ${dataset} lookup data completed.`)
8524
8791
  );
8525
8792
  return lines;
8526
8793
  }
@@ -8539,70 +8806,363 @@ function rawStagingSql(dataset, files) {
8539
8806
  const expressions = layout.fields.map(
8540
8807
  (field) => ` ${fieldExpression(dataset, field, alias)} as ${field.columnName}`
8541
8808
  );
8809
+ const stepName = loadStepName(dataset);
8542
8810
  const lines = [
8543
- `\\echo 'Loading ${dataset} staging data directly from sanitized Receita files...'`,
8811
+ echo(
8812
+ `[${stepName}] Loading ${dataset} staging data directly from sanitized Receita files...`
8813
+ ),
8814
+ `truncate table ${targetTable} restart identity;`,
8544
8815
  createRawTempTableSql(dataset)
8545
8816
  ];
8546
- for (const file of files) {
8547
- lines.push(receitaCopyCommand(tableName, columns, file.absolutePath));
8817
+ for (const [index, file] of files.entries()) {
8818
+ lines.push(
8819
+ echo(
8820
+ `[${stepName}] Loading file ${index + 1} of ${files.length}: ${file.relativePath}`
8821
+ ),
8822
+ receitaCopyCommand(tableName, columns, file.absolutePath),
8823
+ echo(`[${stepName}] Loaded file ${index + 1} of ${files.length}.`)
8824
+ );
8548
8825
  }
8549
8826
  lines.push(
8827
+ echo(
8828
+ `[${stepName}] Transforming ${dataset} raw rows into ${targetTable}...`
8829
+ ),
8550
8830
  `insert into ${targetTable} (${columns.join(", ")})`,
8551
8831
  "select",
8552
8832
  expressions.join(",\n"),
8553
- `from ${tableName} ${alias};`
8833
+ `from ${tableName} ${alias};`,
8834
+ echo(`[${stepName}] ${dataset} staging load completed.`)
8554
8835
  );
8555
8836
  return lines;
8556
8837
  }
8557
- function generatePostgresDirectImportScript(input) {
8558
- const grouped = csvFilesByDataset(input.files);
8559
- const lines = [
8560
- "-- CNPJ DB Loader hybrid PostgreSQL import script",
8561
- "-- Generated from PostgreSQL-ready CSV files exported by cnpj-db-loader postgres export-csv.",
8562
- "-- Execute with psql, for example:",
8563
- '-- psql "postgres://postgres:postgres@localhost:5432/cnpj" -f import-postgres-direct.sql',
8564
- "",
8838
+ function loadStepName(dataset) {
8839
+ switch (dataset) {
8840
+ case "companies":
8841
+ return "load-companies";
8842
+ case "establishments":
8843
+ return "load-establishments";
8844
+ case "partners":
8845
+ return "load-partners";
8846
+ case "simples_options":
8847
+ return "load-simples";
8848
+ default:
8849
+ return `load-${dataset}`;
8850
+ }
8851
+ }
8852
+ function scriptHeader(title, sourceEncoding) {
8853
+ return [
8854
+ `-- ${title}`,
8855
+ "-- Generated by cnpj-db-loader postgres generate-script.",
8565
8856
  "\\set ON_ERROR_STOP on",
8566
- "\\echo 'Starting CNPJ DB Loader hybrid PostgreSQL import...'",
8567
- "",
8568
- "begin;",
8569
- "",
8570
- "-- Keep the final schema and seed data managed by sql/schema.sql.",
8571
- "-- This script only resets staging tables and then upserts final data.",
8572
- "truncate table staging_companies restart identity;",
8573
- "truncate table staging_establishments restart identity;",
8574
- "truncate table staging_partners restart identity;",
8575
- "truncate table staging_simples_options restart identity;",
8857
+ ...sourceEncoding ? [
8858
+ echo(
8859
+ `Using source file encoding ${sourceEncoding} for psql copy operations...`
8860
+ ),
8861
+ `set client_encoding to ${quoteSqlLiteral(sourceEncoding)};`
8862
+ ] : [],
8576
8863
  ""
8577
8864
  ];
8578
- for (const dataset of DOMAIN_DATASETS) {
8579
- lines.push(...copyDomainSql(dataset, grouped[dataset] ?? []), "");
8865
+ }
8866
+ function wrapTransaction(lines, mode, shouldWrap) {
8867
+ if (!shouldWrap || mode !== "phase") {
8868
+ return [...lines];
8580
8869
  }
8581
- for (const dataset of STAGING_DATASETS) {
8582
- lines.push(...copyStagingSql(dataset, grouped[dataset] ?? []), "");
8870
+ return ["begin;", "", ...lines, "", "commit;"];
8871
+ }
8872
+ function buildStepScript(title, body, input, wrapInPhaseTransaction) {
8873
+ return [
8874
+ ...scriptHeader(title, input.sourceEncoding),
8875
+ ...wrapTransaction(body, input.transactionMode, wrapInPhaseTransaction),
8876
+ ""
8877
+ ].join("\n");
8878
+ }
8879
+ function includeSet(input) {
8880
+ const selected = new Set(input.include);
8881
+ if (input.skipIndexes) {
8882
+ selected.delete("indexes");
8583
8883
  }
8584
- lines.push(...materializationAndAnalyzeSql());
8585
- return lines.join("\n");
8884
+ if (input.skipAnalyze) {
8885
+ selected.delete("analyze");
8886
+ }
8887
+ return selected;
8888
+ }
8889
+ function hasAnyFinalMaterialization(selected) {
8890
+ return selected.has("companies") || selected.has("establishments") || selected.has("partners") || selected.has("simples");
8891
+ }
8892
+ function materializeSql(selected) {
8893
+ const lines = [echo("[materialize] Starting final table materialization...")];
8894
+ if (selected.has("companies")) {
8895
+ lines.push(materializeCompaniesSql(), "");
8896
+ }
8897
+ if (selected.has("establishments")) {
8898
+ lines.push(materializeEstablishmentsSql(), "");
8899
+ }
8900
+ if (selected.has("partners")) {
8901
+ lines.push(materializePartnersSql(), "");
8902
+ }
8903
+ if (selected.has("simples")) {
8904
+ lines.push(materializeSimplesSql(), "");
8905
+ }
8906
+ lines.push(echo("[materialize] Final table materialization completed."));
8907
+ return lines;
8908
+ }
8909
+ function indexesSql() {
8910
+ return [
8911
+ echo(
8912
+ "[indexes] No additional index operations are generated in this beta."
8913
+ ),
8914
+ "-- Indexes are expected to be managed by the schema generated by cnpj-db-loader schema generate.",
8915
+ "-- A future fast-rebuild mode may generate DROP/CREATE INDEX operations here."
8916
+ ];
8917
+ }
8918
+ function analyzeSql(selected) {
8919
+ const tables = /* @__PURE__ */ new Set();
8920
+ if (selected.has("companies")) {
8921
+ tables.add("companies");
8922
+ }
8923
+ if (selected.has("establishments")) {
8924
+ tables.add("establishments");
8925
+ }
8926
+ if (selected.has("secondary-cnaes")) {
8927
+ tables.add("establishment_secondary_cnaes");
8928
+ }
8929
+ if (selected.has("partners")) {
8930
+ tables.add("partners");
8931
+ }
8932
+ if (selected.has("simples")) {
8933
+ tables.add("simples_options");
8934
+ }
8935
+ if (selected.has("domains")) {
8936
+ for (const dataset of DOMAIN_DATASETS) {
8937
+ tables.add(dataset);
8938
+ }
8939
+ }
8940
+ return [
8941
+ echo("[analyze] Refreshing planner statistics..."),
8942
+ ...[...tables].map((table) => `analyze ${table};`),
8943
+ echo("[analyze] Planner statistics refreshed.")
8944
+ ];
8945
+ }
8946
+ function step(name, file, dependsOn, included) {
8947
+ return { name, file, dependsOn, included };
8586
8948
  }
8587
- function generatePostgresSanitizedDirectImportScript(input) {
8949
+ function generatePostgresDirectScriptFiles(input) {
8588
8950
  const grouped = directFilesByDataset(input.files);
8589
- const lines = [
8590
- "-- CNPJ DB Loader direct PostgreSQL import script",
8951
+ const selected = includeSet(input);
8952
+ if (!DOMAIN_DATASETS.some((dataset) => (grouped[dataset] ?? []).length > 0)) {
8953
+ selected.delete("domains");
8954
+ }
8955
+ if ((grouped.companies ?? []).length === 0) {
8956
+ selected.delete("companies");
8957
+ }
8958
+ if ((grouped.establishments ?? []).length === 0) {
8959
+ selected.delete("establishments");
8960
+ selected.delete("secondary-cnaes");
8961
+ }
8962
+ if ((grouped.partners ?? []).length === 0) {
8963
+ selected.delete("partners");
8964
+ }
8965
+ if ((grouped.simples_options ?? []).length === 0) {
8966
+ selected.delete("simples");
8967
+ }
8968
+ const scripts = {};
8969
+ const steps = [];
8970
+ const setupIncluded = true;
8971
+ steps.push(step("setup", "setup.sql", [], setupIncluded));
8972
+ scripts["setup.sql"] = [
8973
+ ...scriptHeader(
8974
+ "CNPJ DB Loader PostgreSQL direct import setup",
8975
+ input.sourceEncoding
8976
+ ),
8977
+ echo("[setup] Preparing PostgreSQL direct import session..."),
8978
+ "-- The database schema must be applied before running these scripts.",
8979
+ "-- This setup script configures the psql session used by the generated orchestrator.",
8980
+ echo("[setup] Setup completed."),
8981
+ ""
8982
+ ].join("\n");
8983
+ const domainsIncluded = selected.has("domains") && DOMAIN_DATASETS.some((dataset) => (grouped[dataset] ?? []).length > 0);
8984
+ steps.push(
8985
+ step("load-domains", "load-domains.sql", ["setup"], domainsIncluded)
8986
+ );
8987
+ if (domainsIncluded) {
8988
+ const lines = [echo("[load-domains] Starting domain tables load...")];
8989
+ for (const dataset of DOMAIN_DATASETS) {
8990
+ lines.push(...rawDomainSql(dataset, grouped[dataset] ?? []), "");
8991
+ }
8992
+ lines.push(echo("[load-domains] Domain tables load completed."));
8993
+ scripts["load-domains.sql"] = buildStepScript(
8994
+ "CNPJ DB Loader PostgreSQL direct import domains step",
8995
+ lines,
8996
+ input,
8997
+ true
8998
+ );
8999
+ }
9000
+ const datasetSteps = [
9001
+ {
9002
+ dataset: "companies",
9003
+ name: "load-companies",
9004
+ file: "load-companies.sql",
9005
+ include: "companies"
9006
+ },
9007
+ {
9008
+ dataset: "establishments",
9009
+ name: "load-establishments",
9010
+ file: "load-establishments.sql",
9011
+ include: "establishments"
9012
+ },
9013
+ {
9014
+ dataset: "partners",
9015
+ name: "load-partners",
9016
+ file: "load-partners.sql",
9017
+ include: "partners"
9018
+ },
9019
+ {
9020
+ dataset: "simples_options",
9021
+ name: "load-simples",
9022
+ file: "load-simples.sql",
9023
+ include: "simples"
9024
+ }
9025
+ ];
9026
+ for (const item of datasetSteps) {
9027
+ const files = grouped[item.dataset] ?? [];
9028
+ const included = selected.has(item.include) && files.length > 0;
9029
+ steps.push(step(item.name, item.file, ["setup"], included));
9030
+ if (included) {
9031
+ scripts[item.file] = buildStepScript(
9032
+ `CNPJ DB Loader PostgreSQL direct import ${item.name} step`,
9033
+ rawStagingSql(item.dataset, files),
9034
+ input,
9035
+ true
9036
+ );
9037
+ }
9038
+ }
9039
+ const materializeIncluded = hasAnyFinalMaterialization(selected);
9040
+ steps.push(
9041
+ step(
9042
+ "materialize",
9043
+ "materialize.sql",
9044
+ datasetSteps.filter((item) => selected.has(item.include)).map((item) => item.name),
9045
+ materializeIncluded
9046
+ )
9047
+ );
9048
+ if (materializeIncluded) {
9049
+ scripts["materialize.sql"] = buildStepScript(
9050
+ "CNPJ DB Loader PostgreSQL direct import materialization step",
9051
+ materializeSql(selected),
9052
+ input,
9053
+ true
9054
+ );
9055
+ }
9056
+ const secondaryIncluded = selected.has("secondary-cnaes") && selected.has("establishments");
9057
+ steps.push(
9058
+ step(
9059
+ "materialize-secondary-cnaes",
9060
+ "materialize-secondary-cnaes.sql",
9061
+ ["load-establishments"],
9062
+ secondaryIncluded
9063
+ )
9064
+ );
9065
+ if (secondaryIncluded) {
9066
+ scripts["materialize-secondary-cnaes.sql"] = buildStepScript(
9067
+ "CNPJ DB Loader PostgreSQL direct import secondary CNAEs step",
9068
+ [materializeSecondaryCnaesSql()],
9069
+ input,
9070
+ true
9071
+ );
9072
+ }
9073
+ const indexesIncluded = selected.has("indexes");
9074
+ steps.push(
9075
+ step(
9076
+ "indexes",
9077
+ "indexes.sql",
9078
+ materializeIncluded ? ["materialize"] : ["setup"],
9079
+ indexesIncluded
9080
+ )
9081
+ );
9082
+ if (indexesIncluded) {
9083
+ scripts["indexes.sql"] = buildStepScript(
9084
+ "CNPJ DB Loader PostgreSQL direct import indexes step",
9085
+ indexesSql(),
9086
+ input,
9087
+ true
9088
+ );
9089
+ }
9090
+ const analyzeIncluded = selected.has("analyze");
9091
+ const analyzeDependencies = [
9092
+ ...domainsIncluded ? ["load-domains"] : [],
9093
+ ...materializeIncluded ? ["materialize"] : [],
9094
+ ...secondaryIncluded ? ["materialize-secondary-cnaes"] : []
9095
+ ];
9096
+ steps.push(
9097
+ step(
9098
+ "analyze",
9099
+ "analyze.sql",
9100
+ analyzeDependencies.length > 0 ? analyzeDependencies : ["setup"],
9101
+ analyzeIncluded
9102
+ )
9103
+ );
9104
+ if (analyzeIncluded) {
9105
+ scripts["analyze.sql"] = buildStepScript(
9106
+ "CNPJ DB Loader PostgreSQL direct import analyze step",
9107
+ analyzeSql(selected),
9108
+ input,
9109
+ true
9110
+ );
9111
+ }
9112
+ const orchestratorLines = [
9113
+ "-- CNPJ DB Loader direct PostgreSQL import orchestrator",
8591
9114
  "-- Generated from sanitized Receita files by cnpj-db-loader postgres generate-script.",
8592
- "-- This path avoids rewriting the dataset into a second CSV tree.",
8593
9115
  "-- Execute with psql, for example:",
8594
- '-- psql "postgres://postgres:postgres@localhost:5432/cnpj" -f import-postgres-direct.sql',
9116
+ '-- psql -d "postgres://postgres:postgres@localhost:5432/cnpj" -f import-postgres-direct.sql',
8595
9117
  "",
8596
9118
  "\\set ON_ERROR_STOP on",
8597
- `\\echo 'Using source file encoding ${input.sourceEncoding} for psql copy operations...'`,
9119
+ echo(
9120
+ `Using source file encoding ${input.sourceEncoding} for psql copy operations...`
9121
+ ),
8598
9122
  `set client_encoding to ${quoteSqlLiteral(input.sourceEncoding)};`,
8599
- "\\echo 'Starting CNPJ DB Loader direct PostgreSQL import from sanitized files...'",
9123
+ echo(
9124
+ `Starting CNPJ DB Loader direct PostgreSQL import using transaction mode ${input.transactionMode}...`
9125
+ ),
9126
+ "",
9127
+ ...input.transactionMode === "single" ? ["begin;", ""] : []
9128
+ ];
9129
+ for (const name of STEP_ORDER) {
9130
+ const currentStep = steps.find((item) => item.name === name);
9131
+ if (!currentStep?.included) {
9132
+ continue;
9133
+ }
9134
+ orchestratorLines.push(
9135
+ echo(
9136
+ `[orchestrator] Running ${currentStep.name} (${currentStep.file})...`
9137
+ ),
9138
+ `\\ir ${currentStep.file}`,
9139
+ echo(`[orchestrator] Completed ${currentStep.name}.`),
9140
+ ""
9141
+ );
9142
+ }
9143
+ orchestratorLines.push(
9144
+ ...input.transactionMode === "single" ? ["commit;", ""] : [],
9145
+ echo("CNPJ DB Loader hybrid PostgreSQL import completed."),
9146
+ ""
9147
+ );
9148
+ scripts["import-postgres-direct.sql"] = orchestratorLines.join("\n");
9149
+ return { scripts, steps };
9150
+ }
9151
+ function generatePostgresDirectImportScript(input) {
9152
+ const grouped = csvFilesByDataset(input.files);
9153
+ const lines = [
9154
+ "-- CNPJ DB Loader hybrid PostgreSQL import script",
9155
+ "-- Generated from PostgreSQL-ready CSV files exported by cnpj-db-loader postgres export-csv.",
9156
+ "-- Execute with psql, for example:",
9157
+ '-- psql -d "postgres://postgres:postgres@localhost:5432/cnpj" -f import-postgres-direct.sql',
9158
+ "",
9159
+ "\\set ON_ERROR_STOP on",
9160
+ echo("Starting CNPJ DB Loader hybrid PostgreSQL import..."),
8600
9161
  "",
8601
9162
  "begin;",
8602
9163
  "",
8603
9164
  "-- Keep the final schema and seed data managed by sql/schema.sql.",
8604
- "-- This script copies sanitized Receita files into temporary raw tables,",
8605
- "-- transforms values inside PostgreSQL, resets staging tables and upserts final data.",
9165
+ "-- This script only resets staging tables and then upserts final data.",
8606
9166
  "truncate table staging_companies restart identity;",
8607
9167
  "truncate table staging_establishments restart identity;",
8608
9168
  "truncate table staging_partners restart identity;",
@@ -8610,10 +9170,10 @@ function generatePostgresSanitizedDirectImportScript(input) {
8610
9170
  ""
8611
9171
  ];
8612
9172
  for (const dataset of DOMAIN_DATASETS) {
8613
- lines.push(...rawDomainSql(dataset, grouped[dataset] ?? []), "");
9173
+ lines.push(...copyDomainSql(dataset, grouped[dataset] ?? []), "");
8614
9174
  }
8615
9175
  for (const dataset of STAGING_DATASETS) {
8616
- lines.push(...rawStagingSql(dataset, grouped[dataset] ?? []), "");
9176
+ lines.push(...copyStagingSql(dataset, grouped[dataset] ?? []), "");
8617
9177
  }
8618
9178
  lines.push(...materializationAndAnalyzeSql());
8619
9179
  return lines.join("\n");
@@ -8624,11 +9184,13 @@ function materializationAndAnalyzeSql() {
8624
9184
  "",
8625
9185
  materializeEstablishmentsSql(),
8626
9186
  "",
9187
+ materializeSecondaryCnaesSql(),
9188
+ "",
8627
9189
  materializePartnersSql(),
8628
9190
  "",
8629
9191
  materializeSimplesSql(),
8630
9192
  "",
8631
- "\\echo 'Refreshing planner statistics...'",
9193
+ echo("Refreshing planner statistics..."),
8632
9194
  "analyze companies;",
8633
9195
  "analyze establishments;",
8634
9196
  "analyze establishment_secondary_cnaes;",
@@ -8643,7 +9205,7 @@ function materializationAndAnalyzeSql() {
8643
9205
  "",
8644
9206
  "commit;",
8645
9207
  "",
8646
- "\\echo 'CNPJ DB Loader hybrid PostgreSQL import completed.'",
9208
+ echo("CNPJ DB Loader hybrid PostgreSQL import completed."),
8647
9209
  ""
8648
9210
  ];
8649
9211
  }
@@ -8849,7 +9411,30 @@ async function exportPostgresCsvDataset(inputPath, options = {}) {
8849
9411
  // src/services/postgres-direct/generator.ts
8850
9412
  import { mkdir as mkdir9, stat as stat7, writeFile as writeFile6 } from "fs/promises";
8851
9413
  import path17 from "path";
8852
- var DEFAULT_SOURCE_ENCODING = "WIN1252";
9414
+ var DEFAULT_SOURCE_ENCODING = "UTF8";
9415
+ var DEFAULT_TRANSACTION_MODE = "single";
9416
+ var ALL_INCLUDE_TARGETS = [
9417
+ "domains",
9418
+ "companies",
9419
+ "establishments",
9420
+ "partners",
9421
+ "simples",
9422
+ "secondary-cnaes",
9423
+ "indexes",
9424
+ "analyze"
9425
+ ];
9426
+ var INCLUDE_TARGETS_BY_DATASET = {
9427
+ companies: "companies",
9428
+ establishments: "establishments",
9429
+ partners: "partners",
9430
+ simples_options: "simples",
9431
+ countries: "domains",
9432
+ cities: "domains",
9433
+ partner_qualifications: "domains",
9434
+ legal_natures: "domains",
9435
+ reasons: "domains",
9436
+ cnaes: "domains"
9437
+ };
8853
9438
  function defaultPostgresDirectOutputPath(inputPath) {
8854
9439
  const baseName = path17.basename(inputPath);
8855
9440
  if (baseName.toLowerCase() === "sanitized") {
@@ -8858,17 +9443,52 @@ function defaultPostgresDirectOutputPath(inputPath) {
8858
9443
  return path17.join(path17.dirname(inputPath), `${baseName}-postgres-direct`);
8859
9444
  }
8860
9445
  function inferNextStep5(scriptPath) {
8861
- return `psql "postgres://postgres:postgres@localhost:5432/cnpj" -f ${scriptPath.replace(/\\/g, "/")}`;
9446
+ return `psql -d "postgres://postgres:postgres@localhost:5432/cnpj" -f ${scriptPath.replace(/\\/g, "/")}`;
8862
9447
  }
8863
9448
  function normalizeSourceEncoding(value) {
8864
9449
  const encoding = (value ?? DEFAULT_SOURCE_ENCODING).trim();
8865
9450
  if (!/^[A-Za-z0-9_-]+$/.test(encoding)) {
8866
9451
  throw new ValidationError(
8867
- `Invalid source encoding: ${value}. Use a PostgreSQL client encoding name such as WIN1252 or UTF8.`
9452
+ `Invalid source encoding: ${value}. Use a PostgreSQL client encoding name such as UTF8, WIN1252 or LATIN1.`
8868
9453
  );
8869
9454
  }
8870
9455
  return encoding.toUpperCase();
8871
9456
  }
9457
+ function normalizeTransactionMode(value) {
9458
+ const mode = value ?? DEFAULT_TRANSACTION_MODE;
9459
+ if (!["single", "phase", "none"].includes(mode)) {
9460
+ throw new ValidationError(
9461
+ `Invalid transaction mode: ${String(value)}. Use single, phase or none.`
9462
+ );
9463
+ }
9464
+ return mode;
9465
+ }
9466
+ function isIncludeTarget(value) {
9467
+ return ALL_INCLUDE_TARGETS.includes(value);
9468
+ }
9469
+ function normalizeIncludeTargets(include, dataset) {
9470
+ if (include && include.length > 0) {
9471
+ const unique = [...new Set(include)];
9472
+ const invalid = unique.filter((item) => !isIncludeTarget(item));
9473
+ if (invalid.length > 0) {
9474
+ throw new ValidationError(
9475
+ `Invalid include target(s): ${invalid.join(", ")}. Use ${ALL_INCLUDE_TARGETS.join(", ")}.`
9476
+ );
9477
+ }
9478
+ return unique;
9479
+ }
9480
+ if (dataset) {
9481
+ const target = INCLUDE_TARGETS_BY_DATASET[dataset];
9482
+ if (!target) {
9483
+ return [];
9484
+ }
9485
+ if (target === "establishments") {
9486
+ return ["establishments", "secondary-cnaes", "analyze"];
9487
+ }
9488
+ return [target, "analyze"];
9489
+ }
9490
+ return [...ALL_INCLUDE_TARGETS];
9491
+ }
8872
9492
  async function generatePostgresDirectScript(inputPath, options = {}) {
8873
9493
  if (options.dataset && !isImportDatasetType(options.dataset)) {
8874
9494
  throw new ValidationError(`Unsupported dataset type: ${options.dataset}.`);
@@ -8884,6 +9504,10 @@ async function generatePostgresDirectScript(inputPath, options = {}) {
8884
9504
  options.outputPath ?? defaultPostgresDirectOutputPath(validatedPath)
8885
9505
  );
8886
9506
  const sourceEncoding = normalizeSourceEncoding(options.sourceEncoding);
9507
+ const transactionMode = normalizeTransactionMode(options.transactionMode);
9508
+ const include = normalizeIncludeTargets(options.include, options.dataset);
9509
+ const skipIndexes = options.skipIndexes ?? false;
9510
+ const skipAnalyze = options.skipAnalyze ?? false;
8887
9511
  const inspected = await inspectFiles(validatedPath);
8888
9512
  const recognizedFiles = inspected.entries.filter((entry) => entry.entryKind === "file").flatMap((entry) => {
8889
9513
  if (!isImportDatasetType(entry.inferredType)) {
@@ -8911,7 +9535,11 @@ async function generatePostgresDirectScript(inputPath, options = {}) {
8911
9535
  outputPath,
8912
9536
  totalFiles: recognizedFiles.length,
8913
9537
  datasets,
8914
- sourceEncoding
9538
+ sourceEncoding,
9539
+ transactionMode,
9540
+ include,
9541
+ skipIndexes,
9542
+ skipAnalyze
8915
9543
  });
8916
9544
  await mkdir9(outputPath, { recursive: true });
8917
9545
  const sourceFiles = [];
@@ -8947,11 +9575,21 @@ async function generatePostgresDirectScript(inputPath, options = {}) {
8947
9575
  }
8948
9576
  const scriptName = options.scriptName ?? "import-postgres-direct.sql";
8949
9577
  const scriptPath = path17.join(outputPath, scriptName);
8950
- const script = generatePostgresSanitizedDirectImportScript({
9578
+ const generated = generatePostgresDirectScriptFiles({
8951
9579
  files: sourceFiles,
8952
- sourceEncoding
9580
+ sourceEncoding,
9581
+ transactionMode,
9582
+ include,
9583
+ skipIndexes,
9584
+ skipAnalyze
8953
9585
  });
8954
- await writeFile6(scriptPath, script, "utf8");
9586
+ const scriptFiles = [];
9587
+ for (const [fileName, script] of Object.entries(generated.scripts)) {
9588
+ const outputFileName = fileName === "import-postgres-direct.sql" ? scriptName : fileName;
9589
+ const outputFilePath = path17.join(outputPath, outputFileName);
9590
+ await writeFile6(outputFilePath, script, "utf8");
9591
+ scriptFiles.push(outputFilePath);
9592
+ }
8955
9593
  const manifestPath = path17.join(outputPath, "manifest.json");
8956
9594
  const summaryDatasets = [...summariesByDataset.values()].sort(
8957
9595
  (left, right) => IMPORT_ORDER.indexOf(left.dataset) - IMPORT_ORDER.indexOf(right.dataset)
@@ -8963,13 +9601,19 @@ async function generatePostgresDirectScript(inputPath, options = {}) {
8963
9601
  const manifest = {
8964
9602
  generatedAt: (/* @__PURE__ */ new Date()).toISOString(),
8965
9603
  mode: "direct-sanitized-script",
9604
+ transactionMode,
9605
+ include,
9606
+ skipIndexes,
9607
+ skipAnalyze,
8966
9608
  inputPath: path17.resolve(inputPath),
8967
9609
  validatedPath,
8968
9610
  outputPath,
8969
9611
  scriptPath,
9612
+ scriptFiles,
8970
9613
  sourceEncoding,
8971
9614
  totalFiles: sourceFiles.length,
8972
9615
  totalBytes,
9616
+ steps: generated.steps,
8973
9617
  datasets: summaryDatasets
8974
9618
  };
8975
9619
  await writeFile6(
@@ -8992,14 +9636,19 @@ async function generatePostgresDirectScript(inputPath, options = {}) {
8992
9636
  scriptPath,
8993
9637
  manifestPath,
8994
9638
  sourceEncoding,
9639
+ transactionMode,
8995
9640
  totalFiles: sourceFiles.length,
8996
9641
  totalBytes,
8997
9642
  datasets: summaryDatasets,
9643
+ scriptFiles,
9644
+ steps: generated.steps,
8998
9645
  warnings: [
8999
9646
  ...validation.ok ? [] : validation.errors,
9000
9647
  "This script imports sanitized Receita files directly with psql \\copy. It avoids rewriting the full dataset into a second CSV tree.",
9001
- "The generated script expects the database schema generated by cnpj-db-loader to be applied before execution.",
9002
- "Use --source-encoding UTF8 only if your sanitized files are already UTF-8. The default WIN1252 matches the usual Receita file encoding."
9648
+ "The generated scripts expect the database schema generated by cnpj-db-loader to be applied before execution.",
9649
+ "The direct PostgreSQL script now defaults to UTF8 because the sanitize command writes clean UTF-8 files.",
9650
+ "Use --source-encoding WIN1252 or LATIN1 only when generating scripts for legacy sanitized files produced by older loader versions.",
9651
+ "The generated import is now modular. Use import-postgres-direct.sql as the orchestrator or run individual phase scripts manually."
9003
9652
  ],
9004
9653
  nextStep: inferNextStep5(scriptPath)
9005
9654
  };