@danielarndt0/cnpj-db-loader 2.4.0-beta.1 → 2.4.0-beta.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -10,7 +10,7 @@ This version focuses on the real loading workflow:
10
10
  - check, download, retry, clean, and inspect the latest Federal Revenue CNPJ monthly ZIP archives from the public share
11
11
  - extract Receita Federal ZIP archives
12
12
  - validate an extracted tree
13
- - sanitize validated files before import to remove known low-level byte issues
13
+ - sanitize validated files into clean UTF-8 before import, removing NUL bytes, invalid bytes and problematic control characters
14
14
  - print or generate final, staging, or combined SQL schemas
15
15
  - configure and test the default PostgreSQL URL
16
16
  - import validated dataset files into PostgreSQL with:
@@ -51,7 +51,7 @@ cnpj-db-loader schema generate --profile full
51
51
  cnpj-db-loader import ./downloads/<reference>/sanitized --load-batch-size 500 --materialize-batch-size 50000 --verbose-progress
52
52
 
53
53
  # Optional hybrid path for PostgreSQL direct loading
54
- cnpj-db-loader postgres generate-script ./downloads/<reference>/sanitized --output ./downloads/<reference>/postgres-direct --force
54
+ cnpj-db-loader postgres generate-script ./downloads/<reference>/sanitized --output ./downloads/<reference>/postgres-direct --source-encoding UTF8 --force
55
55
  psql "postgres://postgres:postgres@localhost:5432/cnpj" -f ./downloads/<reference>/postgres-direct/import-postgres-direct.sql
56
56
  ```
57
57
 
@@ -67,7 +67,7 @@ cnpj-db-loader federal-revenue sync [reference] [--reference <yyyy-mm>] [--curre
67
67
  cnpj-db-loader inspect <input>
68
68
  cnpj-db-loader extract <input> [--output <path>]
69
69
  cnpj-db-loader validate <input>
70
- cnpj-db-loader sanitize <input> [--output <path>] [--dataset <name>] [-f]
70
+ cnpj-db-loader sanitize <input> [--output <path>] [--dataset <name>] [--source-encoding <encoding>] [-f]
71
71
  cnpj-db-loader schema print [--profile <profile>]
72
72
  cnpj-db-loader schema generate [--name <name>] [--output <path>] [--profile <profile>]
73
73
  cnpj-db-loader database config set <url>
@@ -95,11 +95,11 @@ For local benchmarks or controlled full loads, the CLI can now generate a direct
95
95
 
96
96
  ```bash
97
97
  cnpj-db-loader sanitize ./downloads/<reference>/extracted
98
- cnpj-db-loader postgres generate-script ./downloads/<reference>/sanitized --output ./downloads/<reference>/postgres-direct --force
98
+ cnpj-db-loader postgres generate-script ./downloads/<reference>/sanitized --output ./downloads/<reference>/postgres-direct --source-encoding UTF8 --force
99
99
  psql "postgres://postgres:postgres@localhost:5432/cnpj" -f ./downloads/<reference>/postgres-direct/import-postgres-direct.sql
100
100
  ```
101
101
 
102
- This path keeps download, extraction, validation and sanitization inside the loader, then lets PostgreSQL load the sanitized Receita files directly through `\copy`, convert values into staging tables and materialize the final tables with set-based SQL. The standard `import` command remains the safest path when checkpoint resume and quarantine recovery are required.
102
+ This path keeps download, extraction, validation and robust UTF-8 sanitization inside the loader, then lets PostgreSQL load the sanitized Receita files directly through `\copy`, convert values into staging tables and materialize the final tables with set-based SQL. The standard `import` command remains the safest path when checkpoint resume and quarantine recovery are required.
103
103
 
104
104
  ## Logs
105
105
 
package/dist/cli.js CHANGED
@@ -7821,81 +7821,264 @@ function isRecognizedSanitizeEntry(entry) {
7821
7821
  return entry.entryKind === "file" && entry.inferredType !== "zip-archive" && entry.inferredType !== "unknown";
7822
7822
  }
7823
7823
 
7824
+ // src/services/sanitize/encoding.ts
7825
+ import { StringDecoder } from "string_decoder";
7826
/**
 * Windows-1252 assignments for the byte range 0x80-0x9F, keyed by byte value.
 *
 * Bytes 0x81, 0x8D, 0x8F, 0x90 and 0x9D are intentionally absent: a lookup
 * for them yields `undefined`, which callers treat as "no character for this
 * byte". The table is frozen because it is shared by every normalizer
 * instance and must never be altered at runtime.
 */
var WINDOWS_1252_C1_MAP = Object.freeze({
  0x80: "\u20AC",
  0x82: "\u201A",
  0x83: "\u0192",
  0x84: "\u201E",
  0x85: "\u2026",
  0x86: "\u2020",
  0x87: "\u2021",
  0x88: "\u02C6",
  0x89: "\u2030",
  0x8a: "\u0160",
  0x8b: "\u2039",
  0x8c: "\u0152",
  0x8e: "\u017D",
  0x91: "\u2018",
  0x92: "\u2019",
  0x93: "\u201C",
  0x94: "\u201D",
  0x95: "\u2022",
  0x96: "\u2013",
  0x97: "\u2014",
  0x98: "\u02DC",
  0x99: "\u2122",
  0x9a: "\u0161",
  0x9b: "\u203A",
  0x9c: "\u0153",
  0x9e: "\u017E",
  0x9f: "\u0178"
});
7855
/**
 * Resolve a user-supplied source encoding name to the canonical label used
 * by the sanitizer.
 *
 * Matching ignores case, surrounding whitespace and `-` / `_` separators, so
 * `windows-1252`, `CP_1252` and `iso88591` are all accepted.
 *
 * @param {string | null | undefined} value - Requested encoding name; a
 *   nullish value selects the default, `WIN1252`.
 * @returns {"WIN1252" | "LATIN1" | "UTF8"} The canonical encoding label.
 * @throws {ValidationError} When the value is not a string or names an
 *   encoding outside the supported set.
 */
function normalizeSanitizeSourceEncoding(value) {
  // Keys are written without separators; the lookup key is normalized the same way.
  const canonicalByAlias = new Map([
    ["WIN1252", "WIN1252"],
    ["WINDOWS1252", "WIN1252"],
    ["CP1252", "WIN1252"],
    ["LATIN1", "LATIN1"],
    ["ISO88591", "LATIN1"],
    ["UTF8", "UTF8"]
  ]);
  const requested = value ?? "WIN1252";
  // A non-string (for example a number from a config file) must surface as a
  // validation failure rather than a TypeError from calling `.trim()`.
  const lookupKey = typeof requested === "string" ? requested.trim().toUpperCase().replace(/[-_]/g, "") : "";
  const canonical = canonicalByAlias.get(lookupKey);
  if (canonical === void 0) {
    throw new ValidationError(
      `Unsupported sanitize source encoding: ${value}. Supported values: WIN1252, LATIN1, UTF8.`
    );
  }
  return canonical;
}
7876
/**
 * Report whether a control code point is kept by the sanitizer.
 *
 * @param {number} codePoint - Code point (or byte value) to test.
 * @returns {boolean} `true` only for tab (9), line feed (10) and carriage return (13).
 */
function isAllowedControlCodePoint(codePoint) {
  switch (codePoint) {
    case 0x09:
    case 0x0a:
    case 0x0d:
      return true;
    default:
      return false;
  }
}
7879
/**
 * Report whether a code point is one the sanitizer drops as a control character.
 *
 * Code points accepted by `isAllowedControlCodePoint` are never problematic.
 * Otherwise the result is `true` for 0-31, 127, 128-159 and 65279 (U+FEFF).
 *
 * @param {number} codePoint - Code point to classify.
 * @returns {boolean} `true` when the code point should be removed.
 */
function isProblematicControlCodePoint(codePoint) {
  if (isAllowedControlCodePoint(codePoint)) {
    return false;
  }
  if (codePoint === 0x7f || codePoint === 0xfeff) {
    return true;
  }
  const inLowControlRange = codePoint >= 0 && codePoint <= 0x1f;
  const inHighControlRange = codePoint >= 0x80 && codePoint <= 0x9f;
  return inLowControlRange || inHighControlRange;
}
7885
/**
 * Remove replacement characters and problematic control characters from
 * already-decoded text.
 *
 * U+FFFD is counted as an invalid byte. Every code point rejected by
 * `isProblematicControlCodePoint` (NUL included) is counted as a control
 * character.
 *
 * @param {string} text - Decoded text to clean.
 * @returns {{ text: string, invalidBytesRemoved: number, controlCharsRemoved: number }}
 *   The cleaned text and the number of characters dropped in each category.
 */
function sanitizeDecodedText(text) {
  const REPLACEMENT_CHARACTER = 0xfffd;
  let cleaned = "";
  let keptRunStart = 0;
  let invalidBytesRemoved = 0;
  let controlCharsRemoved = 0;
  // Every removable code point is a single non-surrogate UTF-16 unit, so
  // scanning unit by unit classifies exactly the same characters as a
  // code-point walk would.
  for (let position = 0; position < text.length; position += 1) {
    const unit = text.charCodeAt(position);
    const isReplacement = unit === REPLACEMENT_CHARACTER;
    if (!isReplacement && !isProblematicControlCodePoint(unit)) {
      continue;
    }
    // Copy the untouched run that precedes the dropped character.
    cleaned += text.slice(keptRunStart, position);
    keptRunStart = position + 1;
    if (isReplacement) {
      invalidBytesRemoved += 1;
    } else {
      controlCharsRemoved += 1;
    }
  }
  cleaned += text.slice(keptRunStart);
  return {
    text: cleaned,
    invalidBytesRemoved,
    controlCharsRemoved
  };
}
7907
/**
 * Converts raw file chunks into clean text and reports what was removed.
 *
 * For `UTF8` input the bytes are decoded through a `StringDecoder`, so a
 * multi-byte sequence split across two chunks is completed by the next
 * chunk. Any other source encoding is handled one byte at a time.
 *
 * Every result has the shape
 * `{ text, nulBytesRemoved, invalidBytesRemoved, controlCharsRemoved }`.
 * A NUL is reported only as `nulBytesRemoved` in both paths.
 */
var SanitizeEncodingNormalizer = class {
  /**
   * @param {string} sourceEncoding - Canonical source encoding label
   *   (`"UTF8"`, `"WIN1252"` or any other single-byte label).
   */
  constructor(sourceEncoding) {
    this.sourceEncoding = sourceEncoding;
    this.utf8Decoder = sourceEncoding === "UTF8" ? new StringDecoder("utf8") : void 0;
  }
  sourceEncoding;
  utf8Decoder;
  /**
   * Normalize one chunk of input bytes.
   *
   * @param {Buffer} chunk - Raw bytes read from the source file.
   * @returns {{ text: string, nulBytesRemoved: number, invalidBytesRemoved: number, controlCharsRemoved: number }}
   */
  normalizeChunk(chunk) {
    if (this.sourceEncoding === "UTF8") {
      return this.sanitizeUtf8Text(this.utf8Decoder.write(chunk));
    }
    return this.normalizeSingleByteChunk(chunk);
  }
  /**
   * Emit whatever the UTF-8 decoder still holds after the last chunk.
   * Single-byte encodings never buffer, so they yield an empty result.
   *
   * @returns {{ text: string, nulBytesRemoved: number, invalidBytesRemoved: number, controlCharsRemoved: number }}
   */
  flush() {
    if (!this.utf8Decoder) {
      return {
        text: "",
        nulBytesRemoved: 0,
        invalidBytesRemoved: 0,
        controlCharsRemoved: 0
      };
    }
    return this.sanitizeUtf8Text(this.utf8Decoder.end());
  }
  /**
   * Clean decoded UTF-8 text and split the removal counts by category.
   *
   * @param {string} decoded - Text produced by the UTF-8 decoder.
   * @returns {{ text: string, nulBytesRemoved: number, invalidBytesRemoved: number, controlCharsRemoved: number }}
   */
  sanitizeUtf8Text(decoded) {
    let nulBytesRemoved = 0;
    for (let position = decoded.indexOf("\0"); position !== -1; position = decoded.indexOf("\0", position + 1)) {
      nulBytesRemoved += 1;
    }
    const sanitized = sanitizeDecodedText(decoded);
    return {
      text: sanitized.text,
      nulBytesRemoved,
      invalidBytesRemoved: sanitized.invalidBytesRemoved,
      // sanitizeDecodedText counts NUL among the control characters it drops;
      // subtract those so each NUL is reported once, as in the single-byte path.
      controlCharsRemoved: sanitized.controlCharsRemoved - nulBytesRemoved
    };
  }
  /**
   * Normalize a chunk from a single-byte source encoding.
   *
   * @param {Buffer} chunk - Raw bytes read from the source file.
   * @returns {{ text: string, nulBytesRemoved: number, invalidBytesRemoved: number, controlCharsRemoved: number }}
   */
  normalizeSingleByteChunk(chunk) {
    const pieces = [];
    let nulBytesRemoved = 0;
    let invalidBytesRemoved = 0;
    let controlCharsRemoved = 0;
    for (const byte of chunk) {
      if (byte === 0) {
        nulBytesRemoved += 1;
        continue;
      }
      if (byte < 32 || byte === 127) {
        if (isAllowedControlCodePoint(byte)) {
          pieces.push(String.fromCharCode(byte));
        } else {
          controlCharsRemoved += 1;
        }
        continue;
      }
      if (byte >= 128 && byte <= 159) {
        if (this.sourceEncoding === "WIN1252") {
          // Bytes missing from the table have no Windows-1252 character.
          const mapped = WINDOWS_1252_C1_MAP[byte];
          if (mapped === void 0) {
            invalidBytesRemoved += 1;
          } else {
            pieces.push(mapped);
          }
        } else {
          // For other single-byte encodings this range is dropped as control characters.
          controlCharsRemoved += 1;
        }
        continue;
      }
      // Remaining bytes are emitted as the code point with the same numeric value.
      pieces.push(String.fromCharCode(byte));
    }
    return {
      text: pieces.join(""),
      nulBytesRemoved,
      invalidBytesRemoved,
      controlCharsRemoved
    };
  }
};
7986
+
7824
7987
  // src/services/sanitize/runner.ts
7825
7988
  import { createReadStream as createReadStream2, createWriteStream as createWriteStream2 } from "fs";
7826
7989
  import { mkdir as mkdir7 } from "fs/promises";
7827
7990
  import path13 from "path";
7828
- function stripNulBytes(chunk) {
7829
- let removed = 0;
7830
- for (let index = 0; index < chunk.length; index += 1) {
7831
- if (chunk[index] === 0) {
7832
- removed += 1;
7833
- }
7991
/**
 * Write text to a writable stream as UTF-8, waiting for `drain` when the
 * stream reports backpressure.
 *
 * @param {import("stream").Writable} output2 - Destination stream.
 * @param {string} value - Text to write; an empty string is a no-op.
 * @returns {Promise<void>} Resolves once the stream can accept more data.
 *   Rejects with the stream's error if it fails while waiting.
 */
async function writeUtf8(output2, value) {
  if (value.length === 0) {
    return;
  }
  if (output2.write(value, "utf8")) {
    return;
  }
  await new Promise((resolve2, reject) => {
    // Each handler removes its counterpart, so no listener outlives this wait.
    // Without this, every backpressure cycle leaves one stale listener on the stream.
    const handleDrain = () => {
      output2.off("error", handleError);
      resolve2();
    };
    const handleError = (error) => {
      output2.off("drain", handleDrain);
      reject(error);
    };
    output2.once("drain", handleDrain);
    output2.once("error", handleError);
  });
}
8002
/**
 * Count the line-feed characters in a string.
 *
 * @param {string} value - Text to scan.
 * @returns {number} Number of `"\n"` occurrences.
 */
function countNewlines(value) {
  let total = 0;
  let position = value.indexOf("\n");
  while (position !== -1) {
    total += 1;
    position = value.indexOf("\n", position + 1);
  }
  return total;
}
7849
- async function sanitizeDatasetFile(plan, onChunk) {
8011
/**
 * Stream one dataset file through the encoding normalizer and write the
 * cleaned text to `plan.outputPath` as UTF-8.
 *
 * @param {{ absolutePath: string, outputPath: string, fileSize: number }} plan
 *   File to read, destination to write (its directory is created if needed)
 *   and the size reported back in progress events.
 * @param {(progress: object) => void} [onChunk] - Called after each input
 *   chunk with the running totals for this file.
 * @param {{ sourceEncoding?: string }} [options] - `sourceEncoding` is
 *   resolved through `normalizeSanitizeSourceEncoding`.
 * @returns {Promise<object>} Totals for the file: bytes read and written,
 *   resolved source encoding, removal counters, line count and whether the
 *   output differs from the input.
 * @throws Rejects with the read or write stream's error, or with the
 *   validation error for an unsupported source encoding.
 */
async function sanitizeDatasetFile(plan, onChunk, options = {}) {
  await mkdir7(path13.dirname(plan.outputPath), { recursive: true });
  const sourceEncoding = normalizeSanitizeSourceEncoding(
    options.sourceEncoding
  );
  const normalizer = new SanitizeEncodingNormalizer(sourceEncoding);
  const input2 = createReadStream2(plan.absolutePath);
  const output2 = createWriteStream2(plan.outputPath, { encoding: "utf8" });
  // Subscribe to the output stream's outcome right away. A stream that fails
  // while chunks are still being processed would otherwise raise an unhandled
  // "error" event, and listeners attached after the failure would never fire.
  let outputFailure;
  const outputSettled = new Promise((resolve2, reject) => {
    output2.once("finish", () => resolve2());
    output2.on("error", (error) => {
      if (outputFailure === void 0) {
        outputFailure = error;
      }
      reject(error);
    });
  });
  // The promise is awaited in `finally`; this keeps an early rejection from
  // being reported as unhandled before that point.
  outputSettled.catch(() => {
  });
  let totalBytesRead = 0;
  let totalBytesWritten = 0;
  let nulBytesRemoved = 0;
  let invalidBytesRemoved = 0;
  let controlCharsRemoved = 0;
  let lineCount = 0;
  let sawAnyCharacter = false;
  let lastCharacterWasNewline = false;
  const processText = async (text) => {
    if (text.length === 0) {
      return;
    }
    // A failed stream accepts no more data and never drains, so stop here.
    if (outputFailure !== void 0) {
      throw outputFailure;
    }
    sawAnyCharacter = true;
    lineCount += countNewlines(text);
    lastCharacterWasNewline = text.endsWith("\n");
    totalBytesWritten += Buffer.byteLength(text, "utf8");
    await writeUtf8(output2, text);
  };
  const processNormalized = async (normalized) => {
    nulBytesRemoved += normalized.nulBytesRemoved;
    invalidBytesRemoved += normalized.invalidBytesRemoved;
    controlCharsRemoved += normalized.controlCharsRemoved;
    await processText(normalized.text);
  };
  try {
    for await (const chunk of input2) {
      const chunkBuffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
      totalBytesRead += chunkBuffer.length;
      await processNormalized(normalizer.normalizeChunk(chunkBuffer));
      onChunk?.({
        bytesProcessed: chunkBuffer.length,
        fileBytesProcessed: totalBytesRead,
        currentFileSize: plan.fileSize,
        processedRows: lineCount,
        nulBytesRemoved,
        invalidBytesRemoved,
        controlCharsRemoved
      });
    }
    await processNormalized(normalizer.flush());
    // A final line without a trailing newline still counts as a row.
    if (sawAnyCharacter && !lastCharacterWasNewline) {
      lineCount += 1;
    }
  } finally {
    input2.close();
    if (outputFailure === void 0) {
      output2.end();
    }
    await outputSettled;
  }
  return {
    plan,
    totalBytesRead,
    totalBytesWritten,
    sourceEncoding,
    nulBytesRemoved,
    invalidBytesRemoved,
    controlCharsRemoved,
    lineCount,
    changed: nulBytesRemoved > 0 || invalidBytesRemoved > 0 || controlCharsRemoved > 0 || totalBytesRead !== totalBytesWritten
  };
}
7901
8084
 
@@ -7958,40 +8141,54 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
7958
8141
  "No recognized validated dataset files were found for sanitization."
7959
8142
  );
7960
8143
  }
8144
+ const sourceEncoding = normalizeSanitizeSourceEncoding(
8145
+ options.sourceEncoding
8146
+ );
7961
8147
  options.onProgress?.({
7962
8148
  kind: "start",
7963
8149
  validatedPath,
7964
8150
  outputPath,
7965
8151
  totalFiles: plan.totalFiles,
7966
8152
  totalBytes: plan.totalBytes,
7967
- datasets: plan.datasets
8153
+ datasets: plan.datasets,
8154
+ sourceEncoding
7968
8155
  });
7969
8156
  let processedFiles = 0;
7970
8157
  let processedRows = 0;
7971
8158
  let processedBytes = 0;
7972
8159
  let nulBytesRemoved = 0;
8160
+ let invalidBytesRemoved = 0;
8161
+ let controlCharsRemoved = 0;
7973
8162
  let changedFiles = 0;
7974
8163
  const fileSummaries = [];
7975
8164
  for (const [index, filePlan] of plan.files.entries()) {
7976
- const fileResult = await sanitizeDatasetFile(filePlan, (chunk) => {
7977
- options.onProgress?.({
7978
- kind: "progress",
7979
- currentFileDisplayPath: filePlan.displayPath,
7980
- fileIndex: index + 1,
7981
- totalFiles: plan.totalFiles,
7982
- bytesProcessed: processedBytes + chunk.fileBytesProcessed,
7983
- totalBytes: plan.totalBytes,
7984
- fileBytesProcessed: chunk.fileBytesProcessed,
7985
- currentFileSize: chunk.currentFileSize,
7986
- processedRows: processedRows + chunk.processedRows,
7987
- nulBytesRemoved: nulBytesRemoved + chunk.nulBytesRemoved,
7988
- changedFiles
7989
- });
7990
- });
8165
+ const fileResult = await sanitizeDatasetFile(
8166
+ filePlan,
8167
+ (chunk) => {
8168
+ options.onProgress?.({
8169
+ kind: "progress",
8170
+ currentFileDisplayPath: filePlan.displayPath,
8171
+ fileIndex: index + 1,
8172
+ totalFiles: plan.totalFiles,
8173
+ bytesProcessed: processedBytes + chunk.fileBytesProcessed,
8174
+ totalBytes: plan.totalBytes,
8175
+ fileBytesProcessed: chunk.fileBytesProcessed,
8176
+ currentFileSize: chunk.currentFileSize,
8177
+ processedRows: processedRows + chunk.processedRows,
8178
+ nulBytesRemoved: nulBytesRemoved + chunk.nulBytesRemoved,
8179
+ invalidBytesRemoved: invalidBytesRemoved + chunk.invalidBytesRemoved,
8180
+ controlCharsRemoved: controlCharsRemoved + chunk.controlCharsRemoved,
8181
+ changedFiles
8182
+ });
8183
+ },
8184
+ { sourceEncoding }
8185
+ );
7991
8186
  processedFiles += 1;
7992
8187
  processedRows += fileResult.lineCount;
7993
8188
  processedBytes += fileResult.totalBytesRead;
7994
8189
  nulBytesRemoved += fileResult.nulBytesRemoved;
8190
+ invalidBytesRemoved += fileResult.invalidBytesRemoved;
8191
+ controlCharsRemoved += fileResult.controlCharsRemoved;
7995
8192
  changedFiles += fileResult.changed ? 1 : 0;
7996
8193
  fileSummaries.push({
7997
8194
  dataset: filePlan.dataset,
@@ -7999,7 +8196,9 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
7999
8196
  outputPath: filePlan.outputPath,
8000
8197
  lineCount: fileResult.lineCount,
8001
8198
  changed: fileResult.changed,
8002
- nulBytesRemoved: fileResult.nulBytesRemoved
8199
+ nulBytesRemoved: fileResult.nulBytesRemoved,
8200
+ invalidBytesRemoved: fileResult.invalidBytesRemoved,
8201
+ controlCharsRemoved: fileResult.controlCharsRemoved
8003
8202
  });
8004
8203
  }
8005
8204
  options.onProgress?.({
@@ -8007,6 +8206,8 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
8007
8206
  totalFiles: plan.totalFiles,
8008
8207
  processedRows,
8009
8208
  nulBytesRemoved,
8209
+ invalidBytesRemoved,
8210
+ controlCharsRemoved,
8010
8211
  changedFiles,
8011
8212
  totalBytes: plan.totalBytes
8012
8213
  });
@@ -8018,13 +8219,17 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
8018
8219
  totalBytes: plan.totalBytes,
8019
8220
  processedFiles,
8020
8221
  processedRows,
8222
+ sourceEncoding,
8021
8223
  nulBytesRemoved,
8224
+ invalidBytesRemoved,
8225
+ controlCharsRemoved,
8022
8226
  changedFiles,
8023
8227
  unchangedFiles: plan.totalFiles - changedFiles,
8024
8228
  datasets: plan.datasets,
8025
8229
  files: fileSummaries,
8026
8230
  warnings: [
8027
- "Sanitization prepares a clean dataset tree for import by removing known low-level byte issues such as NUL bytes before PostgreSQL loading begins.",
8231
+ "Sanitization now writes UTF-8 output and removes invalid bytes plus problematic control characters before PostgreSQL loading begins.",
8232
+ "The PostgreSQL direct import path can use --source-encoding UTF8 when reading files generated by this sanitization command.",
8028
8233
  "The import command still keeps quarantine and row-level recovery for unexpected issues, but sanitizing first reduces the amount of slow fallback work during import."
8029
8234
  ],
8030
8235
  nextStep: inferNextStep3(outputPath)
@@ -8817,7 +9022,7 @@ async function exportPostgresCsvDataset(inputPath, options = {}) {
8817
9022
  // src/services/postgres-direct/generator.ts
8818
9023
  import { mkdir as mkdir9, stat as stat7, writeFile as writeFile6 } from "fs/promises";
8819
9024
  import path17 from "path";
8820
- var DEFAULT_SOURCE_ENCODING = "WIN1252";
9025
+ var DEFAULT_SOURCE_ENCODING = "UTF8";
8821
9026
  function defaultPostgresDirectOutputPath(inputPath) {
8822
9027
  const baseName = path17.basename(inputPath);
8823
9028
  if (baseName.toLowerCase() === "sanitized") {
@@ -8832,7 +9037,7 @@ function normalizeSourceEncoding(value) {
8832
9037
  const encoding = (value ?? DEFAULT_SOURCE_ENCODING).trim();
8833
9038
  if (!/^[A-Za-z0-9_-]+$/.test(encoding)) {
8834
9039
  throw new ValidationError(
8835
- `Invalid source encoding: ${value}. Use a PostgreSQL client encoding name such as WIN1252 or UTF8.`
9040
+ `Invalid source encoding: ${value}. Use a PostgreSQL client encoding name such as UTF8, WIN1252 or LATIN1.`
8836
9041
  );
8837
9042
  }
8838
9043
  return encoding.toUpperCase();
@@ -8967,7 +9172,8 @@ async function generatePostgresDirectScript(inputPath, options = {}) {
8967
9172
  ...validation.ok ? [] : validation.errors,
8968
9173
  "This script imports sanitized Receita files directly with psql \\copy. It avoids rewriting the full dataset into a second CSV tree.",
8969
9174
  "The generated script expects the database schema generated by cnpj-db-loader to be applied before execution.",
8970
- "Use --source-encoding UTF8 only if your sanitized files are already UTF-8. The default WIN1252 matches the usual Receita file encoding."
9175
+ "The direct PostgreSQL script now defaults to UTF8 because the sanitize command writes clean UTF-8 files.",
9176
+ "Use --source-encoding WIN1252 or LATIN1 only when generating scripts for legacy sanitized files produced by older loader versions."
8971
9177
  ],
8972
9178
  nextStep: inferNextStep5(scriptPath)
8973
9179
  };
@@ -9271,9 +9477,23 @@ function printSanitizeSummary(summary, logFilePath) {
9271
9477
  console.log(
9272
9478
  formatKeyValue("Processed bytes", formatBytes(summary.totalBytes))
9273
9479
  );
9480
+ console.log(formatKeyValue("Source encoding", summary.sourceEncoding));
9481
+ console.log(formatKeyValue("Output encoding", "UTF8"));
9274
9482
  console.log(
9275
9483
  formatKeyValue("Removed NUL bytes", formatCount(summary.nulBytesRemoved))
9276
9484
  );
9485
+ console.log(
9486
+ formatKeyValue(
9487
+ "Removed invalid bytes",
9488
+ formatCount(summary.invalidBytesRemoved)
9489
+ )
9490
+ );
9491
+ console.log(
9492
+ formatKeyValue(
9493
+ "Removed control chars",
9494
+ formatCount(summary.controlCharsRemoved)
9495
+ )
9496
+ );
9277
9497
  console.log(formatKeyValue("Changed files", summary.changedFiles));
9278
9498
  console.log(formatKeyValue("Unchanged files", summary.unchangedFiles));
9279
9499
  if (summary.datasets.length > 0) {
@@ -10053,8 +10273,9 @@ function createSanitizeProgressReporter() {
10053
10273
  `Validated: ${shortPath(event.validatedPath)}`,
10054
10274
  `Output: ${shortPath(event.outputPath)}`,
10055
10275
  `Datasets: ${event.datasets.join(" > ")}`,
10276
+ `Source encoding: ${event.sourceEncoding} > UTF8`,
10056
10277
  `Files: 0/${formatCount(event.totalFiles)} | Bytes: ${formatBytes(0)} / ${formatBytes(event.totalBytes)}`,
10057
- `Rows counted: ${formatCount(0)} | NUL removed: ${formatCount(0)}`,
10278
+ `Rows: ${formatCount(0)} | NUL: ${formatCount(0)} | Invalid bytes: ${formatCount(0)} | Controls: ${formatCount(0)}`,
10058
10279
  `Current: waiting...`
10059
10280
  ];
10060
10281
  renderBlock([
@@ -10070,8 +10291,9 @@ function createSanitizeProgressReporter() {
10070
10291
  currentLines[1] ?? "",
10071
10292
  currentLines[2] ?? "",
10072
10293
  currentLines[3] ?? "",
10294
+ currentLines[4] ?? "",
10073
10295
  `Files: ${formatCount(event.fileIndex)}/${formatCount(event.totalFiles)} | Bytes: ${formatBytes(event.bytesProcessed)} / ${formatBytes(event.totalBytes)}`,
10074
- `Rows counted: ${formatCount(event.processedRows)} | NUL removed: ${formatCount(event.nulBytesRemoved)} | Changed files: ${formatCount(event.changedFiles)}`,
10296
+ `Rows: ${formatCount(event.processedRows)} | NUL: ${formatCount(event.nulBytesRemoved)} | Invalid bytes: ${formatCount(event.invalidBytesRemoved)} | Controls: ${formatCount(event.controlCharsRemoved)} | Changed: ${formatCount(event.changedFiles)}`,
10075
10297
  `Current: ${shortPath(event.currentFileDisplayPath)}`
10076
10298
  ];
10077
10299
  renderBlock([
@@ -10091,6 +10313,18 @@ function createSanitizeProgressReporter() {
10091
10313
  console.log(
10092
10314
  formatKeyValue("Removed NUL bytes", formatCount(event.nulBytesRemoved))
10093
10315
  );
10316
+ console.log(
10317
+ formatKeyValue(
10318
+ "Removed invalid bytes",
10319
+ formatCount(event.invalidBytesRemoved)
10320
+ )
10321
+ );
10322
+ console.log(
10323
+ formatKeyValue(
10324
+ "Removed control chars",
10325
+ formatCount(event.controlCharsRemoved)
10326
+ )
10327
+ );
10094
10328
  console.log(
10095
10329
  formatKeyValue("Changed files", formatCount(event.changedFiles))
10096
10330
  );
@@ -11203,7 +11437,7 @@ function registerPostgresCommands(program) {
11203
11437
  "Generated psql script file name. Defaults to import-postgres-direct.sql."
11204
11438
  ).option(
11205
11439
  "--source-encoding <encoding>",
11206
- "PostgreSQL client encoding used while reading sanitized Receita files. Defaults to WIN1252."
11440
+ "PostgreSQL client encoding used while reading sanitized Receita files. Defaults to UTF8."
11207
11441
  ).option("-f, --force", "Skip the confirmation prompt.").description(
11208
11442
  "Generate a direct psql import script that loads sanitized Receita files without rewriting them into new CSV files."
11209
11443
  ).action(
@@ -11353,6 +11587,9 @@ function registerSanitizeCommands(program) {
11353
11587
  ).option(
11354
11588
  "--dataset <dataset>",
11355
11589
  "Sanitize only one validated dataset block (for example: establishments or companies)."
11590
+ ).option(
11591
+ "--source-encoding <encoding>",
11592
+ "Source file encoding used while reading Receita files. Defaults to WIN1252 and writes clean UTF-8 output."
11356
11593
  ).option("-f, --force", "Skip the confirmation prompt.").description(
11357
11594
  "Prepare a sanitized dataset tree before import by removing known low-level byte issues such as NUL bytes."
11358
11595
  ).action(
@@ -11376,6 +11613,9 @@ function registerSanitizeCommands(program) {
11376
11613
  if (options.dataset) {
11377
11614
  sanitizeOptions.dataset = options.dataset;
11378
11615
  }
11616
+ if (options.sourceEncoding) {
11617
+ sanitizeOptions.sourceEncoding = options.sourceEncoding;
11618
+ }
11379
11619
  const summary = await sanitizeInputDirectory(input2, sanitizeOptions);
11380
11620
  const logFilePath = await writeCommandLog("sanitize", summary);
11381
11621
  printSanitizeSummary(summary, logFilePath);