npm - @danielarndt0/cnpj-db-loader - Versions diffs - 2.4.0-beta.1 → 2.4.0-beta.2 - Mend

@danielarndt0/cnpj-db-loader 2.4.0-beta.1 → 2.4.0-beta.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13)

package/README.md CHANGED

@@ -10,7 +10,7 @@ This version focuses on the real loading workflow:
 - check, download, retry, clean, and inspect the latest Federal Revenue CNPJ monthly ZIP archives from the public share
 - extract Receita Federal ZIP archives
 - validate an extracted tree
-- sanitize validated files before import to remove known low-level byte issues
+- sanitize validated files into clean UTF-8 before import, removing NUL bytes, invalid bytes and problematic control characters
 - print or generate final, staging, or combined SQL schemas
 - configure and test the default PostgreSQL URL
 - import validated dataset files into PostgreSQL with:
@@ -51,7 +51,7 @@ cnpj-db-loader schema generate --profile full
 cnpj-db-loader import ./downloads/<reference>/sanitized --load-batch-size 500 --materialize-batch-size 50000 --verbose-progress
 # Optional hybrid path for PostgreSQL direct loading
-cnpj-db-loader postgres generate-script ./downloads/<reference>/sanitized --output ./downloads/<reference>/postgres-direct --force
+cnpj-db-loader postgres generate-script ./downloads/<reference>/sanitized --output ./downloads/<reference>/postgres-direct --source-encoding UTF8 --force
 psql "postgres://postgres:postgres@localhost:5432/cnpj" -f ./downloads/<reference>/postgres-direct/import-postgres-direct.sql
 ```
@@ -67,7 +67,7 @@ cnpj-db-loader federal-revenue sync [reference] [--reference <yyyy-mm>] [--curre
 cnpj-db-loader inspect <input>
 cnpj-db-loader extract <input> [--output <path>]
 cnpj-db-loader validate <input>
-cnpj-db-loader sanitize <input> [--output <path>] [--dataset <name>] [-f]
+cnpj-db-loader sanitize <input> [--output <path>] [--dataset <name>] [--source-encoding <encoding>] [-f]
 cnpj-db-loader schema print [--profile <profile>]
 cnpj-db-loader schema generate [--name <name>] [--output <path>] [--profile <profile>]
 cnpj-db-loader database config set <url>
@@ -95,11 +95,11 @@ For local benchmarks or controlled full loads, the CLI can now generate a direct
 ```bash
 cnpj-db-loader sanitize ./downloads/<reference>/extracted
-cnpj-db-loader postgres generate-script ./downloads/<reference>/sanitized --output ./downloads/<reference>/postgres-direct --force
+cnpj-db-loader postgres generate-script ./downloads/<reference>/sanitized --output ./downloads/<reference>/postgres-direct --source-encoding UTF8 --force
 psql "postgres://postgres:postgres@localhost:5432/cnpj" -f ./downloads/<reference>/postgres-direct/import-postgres-direct.sql
 ```
-This path keeps download, extraction, validation and sanitization inside the loader, then lets PostgreSQL load the sanitized Receita files directly through `\copy`, convert values into staging tables and materialize the final tables with set-based SQL. The standard `import` command remains the safest path when checkpoint resume and quarantine recovery are required.
+This path keeps download, extraction, validation and robust UTF-8 sanitization inside the loader, then lets PostgreSQL load the sanitized Receita files directly through `\copy`, convert values into staging tables and materialize the final tables with set-based SQL. The standard `import` command remains the safest path when checkpoint resume and quarantine recovery are required.
 ## Logs

package/dist/cli.js CHANGED

@@ -7821,81 +7821,264 @@ function isRecognizedSanitizeEntry(entry) {
   return entry.entryKind === "file" && entry.inferredType !== "zip-archive" && entry.inferredType !== "unknown";
 }
+// src/services/sanitize/encoding.ts
+import { StringDecoder } from "string_decoder";
+var WINDOWS_1252_C1_MAP = {
+  128: "\u20AC",
+  130: "\u201A",
+  131: "\u0192",
+  132: "\u201E",
+  133: "\u2026",
+  134: "\u2020",
+  135: "\u2021",
+  136: "\u02C6",
+  137: "\u2030",
+  138: "\u0160",
+  139: "\u2039",
+  140: "\u0152",
+  142: "\u017D",
+  145: "\u2018",
+  146: "\u2019",
+  147: "\u201C",
+  148: "\u201D",
+  149: "\u2022",
+  150: "\u2013",
+  151: "\u2014",
+  152: "\u02DC",
+  153: "\u2122",
+  154: "\u0161",
+  155: "\u203A",
+  156: "\u0153",
+  158: "\u017E",
+  159: "\u0178"
+};
+function normalizeSanitizeSourceEncoding(value) {
+  const normalized = (value ?? "WIN1252").trim().toUpperCase().replace(/_/g, "-");
+  switch (normalized) {
+    case "WIN1252":
+    case "WINDOWS-1252":
+    case "CP1252":
+      return "WIN1252";
+    case "LATIN1":
+    case "LATIN-1":
+    case "ISO-8859-1":
+    case "ISO8859-1":
+      return "LATIN1";
+    case "UTF8":
+    case "UTF-8":
+      return "UTF8";
+    default:
+      throw new ValidationError(
+        `Unsupported sanitize source encoding: ${value}. Supported values: WIN1252, LATIN1, UTF8.`
+      );
+  }
+}
+function isAllowedControlCodePoint(codePoint) {
+  return codePoint === 9 || codePoint === 10 || codePoint === 13;
+}
+function isProblematicControlCodePoint(codePoint) {
+  if (isAllowedControlCodePoint(codePoint)) {
+    return false;
+  }
+  return codePoint >= 0 && codePoint <= 31 || codePoint === 127 || codePoint >= 128 && codePoint <= 159 || codePoint === 65279;
+}
+function sanitizeDecodedText(text) {
+  const output2 = [];
+  let invalidBytesRemoved = 0;
+  let controlCharsRemoved = 0;
+  for (const char of text) {
+    const codePoint = char.codePointAt(0);
+    if (codePoint === 65533) {
+      invalidBytesRemoved += 1;
+      continue;
+    }
+    if (isProblematicControlCodePoint(codePoint)) {
+      controlCharsRemoved += 1;
+      continue;
+    }
+    output2.push(char);
+  }
+  return {
+    text: output2.join(""),
+    invalidBytesRemoved,
+    controlCharsRemoved
+  };
+}
+var SanitizeEncodingNormalizer = class {
+  constructor(sourceEncoding) {
+    this.sourceEncoding = sourceEncoding;
+    this.utf8Decoder = sourceEncoding === "UTF8" ? new StringDecoder("utf8") : void 0;
+  }
+  sourceEncoding;
+  utf8Decoder;
+  normalizeChunk(chunk) {
+    if (this.sourceEncoding === "UTF8") {
+      const decoded = this.utf8Decoder.write(chunk);
+      const sanitized = sanitizeDecodedText(decoded);
+      const nulBytesRemoved = [...decoded].filter(
+        (char) => char === "\0"
+      ).length;
+      return {
+        ...sanitized,
+        nulBytesRemoved
+      };
+    }
+    return this.normalizeSingleByteChunk(chunk);
+  }
+  flush() {
+    if (!this.utf8Decoder) {
+      return {
+        text: "",
+        nulBytesRemoved: 0,
+        invalidBytesRemoved: 0,
+        controlCharsRemoved: 0
+      };
+    }
+    const decoded = this.utf8Decoder.end();
+    const sanitized = sanitizeDecodedText(decoded);
+    const nulBytesRemoved = [...decoded].filter((char) => char === "\0").length;
+    return {
+      ...sanitized,
+      nulBytesRemoved
+    };
+  }
+  normalizeSingleByteChunk(chunk) {
+    const output2 = [];
+    let nulBytesRemoved = 0;
+    let invalidBytesRemoved = 0;
+    let controlCharsRemoved = 0;
+    for (const byte of chunk) {
+      if (byte === 0) {
+        nulBytesRemoved += 1;
+        continue;
+      }
+      if (byte < 32 || byte === 127) {
+        if (isAllowedControlCodePoint(byte)) {
+          output2.push(String.fromCharCode(byte));
+        } else {
+          controlCharsRemoved += 1;
+        }
+        continue;
+      }
+      if (byte >= 128 && byte <= 159) {
+        if (this.sourceEncoding === "WIN1252") {
+          const mapped = WINDOWS_1252_C1_MAP[byte];
+          if (mapped === void 0) {
+            invalidBytesRemoved += 1;
+          } else {
+            output2.push(mapped);
+          }
+        } else {
+          controlCharsRemoved += 1;
+        }
+        continue;
+      }
+      output2.push(String.fromCharCode(byte));
+    }
+    return {
+      text: output2.join(""),
+      nulBytesRemoved,
+      invalidBytesRemoved,
+      controlCharsRemoved
+    };
+  }
+};
 // src/services/sanitize/runner.ts
 import { createReadStream as createReadStream2, createWriteStream as createWriteStream2 } from "fs";
 import { mkdir as mkdir7 } from "fs/promises";
 import path13 from "path";
-function stripNulBytes(chunk) {
-  let removed = 0;
-  for (let index = 0; index < chunk.length; index += 1) {
-    if (chunk[index] === 0) {
-      removed += 1;
-    }
+async function writeUtf8(output2, value) {
+  if (value.length === 0) {
+    return;
   }
-  if (removed === 0) {
-    return { buffer: chunk, removed: 0 };
+  if (!output2.write(value, "utf8")) {
+    await new Promise((resolve2, reject) => {
+      output2.once("drain", resolve2);
+      output2.once("error", reject);
+    });
   }
-  const sanitized = Buffer.allocUnsafe(chunk.length - removed);
-  let outputIndex = 0;
-  for (let index = 0; index < chunk.length; index += 1) {
-    const value = chunk[index];
-    if (value !== 0) {
-      sanitized[outputIndex] = value;
-      outputIndex += 1;
+}
+function countNewlines(value) {
+  let count = 0;
+  for (let index = 0; index < value.length; index += 1) {
+    if (value[index] === "\n") {
+      count += 1;
     }
   }
-  return { buffer: sanitized, removed };
+  return count;
 }
-async function sanitizeDatasetFile(plan, onChunk) {
+async function sanitizeDatasetFile(plan, onChunk, options = {}) {
   await mkdir7(path13.dirname(plan.outputPath), { recursive: true });
+  const sourceEncoding = normalizeSanitizeSourceEncoding(
+    options.sourceEncoding
+  );
+  const normalizer = new SanitizeEncodingNormalizer(sourceEncoding);
   const input2 = createReadStream2(plan.absolutePath);
-  const output2 = createWriteStream2(plan.outputPath);
+  const output2 = createWriteStream2(plan.outputPath, { encoding: "utf8" });
   let totalBytesRead = 0;
   let totalBytesWritten = 0;
   let nulBytesRemoved = 0;
+  let invalidBytesRemoved = 0;
+  let controlCharsRemoved = 0;
   let lineCount = 0;
-  let sawAnyByte = false;
-  let lastByteWasNewline = false;
+  let sawAnyCharacter = false;
+  let lastCharacterWasNewline = false;
+  const processText = async (text) => {
+    if (text.length === 0) {
+      return;
+    }
+    sawAnyCharacter = true;
+    lineCount += countNewlines(text);
+    lastCharacterWasNewline = text.endsWith("\n");
+    totalBytesWritten += Buffer.byteLength(text, "utf8");
+    await writeUtf8(output2, text);
+  };
   try {
     for await (const chunk of input2) {
       const chunkBuffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
       totalBytesRead += chunkBuffer.length;
-      const { buffer, removed } = stripNulBytes(chunkBuffer);
-      nulBytesRemoved += removed;
-      sawAnyByte = sawAnyByte || buffer.length > 0;
-      for (let index = 0; index < buffer.length; index += 1) {
-        if (buffer[index] === 10) {
-          lineCount += 1;
-        }
-      }
-      if (buffer.length > 0) {
-        lastByteWasNewline = buffer[buffer.length - 1] === 10;
-      }
-      totalBytesWritten += buffer.length;
-      output2.write(buffer);
+      const normalized = normalizer.normalizeChunk(chunkBuffer);
+      nulBytesRemoved += normalized.nulBytesRemoved;
+      invalidBytesRemoved += normalized.invalidBytesRemoved;
+      controlCharsRemoved += normalized.controlCharsRemoved;
+      await processText(normalized.text);
       onChunk?.({
         bytesProcessed: chunkBuffer.length,
         fileBytesProcessed: totalBytesRead,
         currentFileSize: plan.fileSize,
         processedRows: lineCount,
-        nulBytesRemoved
+        nulBytesRemoved,
+        invalidBytesRemoved,
+        controlCharsRemoved
       });
     }
-    if (sawAnyByte && !lastByteWasNewline) {
+    const flushed = normalizer.flush();
+    nulBytesRemoved += flushed.nulBytesRemoved;
+    invalidBytesRemoved += flushed.invalidBytesRemoved;
+    controlCharsRemoved += flushed.controlCharsRemoved;
+    await processText(flushed.text);
+    if (sawAnyCharacter && !lastCharacterWasNewline) {
       lineCount += 1;
     }
   } finally {
     input2.close();
     output2.end();
-    await new Promise((resolve2) => output2.on("finish", () => resolve2()));
+    await new Promise((resolve2, reject) => {
+      output2.on("finish", () => resolve2());
+      output2.on("error", (error) => reject(error));
+    });
   }
   return {
     plan,
     totalBytesRead,
     totalBytesWritten,
+    sourceEncoding,
     nulBytesRemoved,
+    invalidBytesRemoved,
+    controlCharsRemoved,
     lineCount,
-    changed: nulBytesRemoved > 0 || totalBytesRead !== totalBytesWritten
+    changed: nulBytesRemoved > 0 || invalidBytesRemoved > 0 || controlCharsRemoved > 0 || totalBytesRead !== totalBytesWritten
   };
 }
@@ -7958,40 +8141,54 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
       "No recognized validated dataset files were found for sanitization."
     );
   }
+  const sourceEncoding = normalizeSanitizeSourceEncoding(
+    options.sourceEncoding
+  );
   options.onProgress?.({
     kind: "start",
     validatedPath,
     outputPath,
     totalFiles: plan.totalFiles,
     totalBytes: plan.totalBytes,
-    datasets: plan.datasets
+    datasets: plan.datasets,
+    sourceEncoding
   });
   let processedFiles = 0;
   let processedRows = 0;
   let processedBytes = 0;
   let nulBytesRemoved = 0;
+  let invalidBytesRemoved = 0;
+  let controlCharsRemoved = 0;
   let changedFiles = 0;
   const fileSummaries = [];
   for (const [index, filePlan] of plan.files.entries()) {
-    const fileResult = await sanitizeDatasetFile(filePlan, (chunk) => {
-      options.onProgress?.({
-        kind: "progress",
-        currentFileDisplayPath: filePlan.displayPath,
-        fileIndex: index + 1,
-        totalFiles: plan.totalFiles,
-        bytesProcessed: processedBytes + chunk.fileBytesProcessed,
-        totalBytes: plan.totalBytes,
-        fileBytesProcessed: chunk.fileBytesProcessed,
-        currentFileSize: chunk.currentFileSize,
-        processedRows: processedRows + chunk.processedRows,
-        nulBytesRemoved: nulBytesRemoved + chunk.nulBytesRemoved,
-        changedFiles
-      });
-    });
+    const fileResult = await sanitizeDatasetFile(
+      filePlan,
+      (chunk) => {
+        options.onProgress?.({
+          kind: "progress",
+          currentFileDisplayPath: filePlan.displayPath,
+          fileIndex: index + 1,
+          totalFiles: plan.totalFiles,
+          bytesProcessed: processedBytes + chunk.fileBytesProcessed,
+          totalBytes: plan.totalBytes,
+          fileBytesProcessed: chunk.fileBytesProcessed,
+          currentFileSize: chunk.currentFileSize,
+          processedRows: processedRows + chunk.processedRows,
+          nulBytesRemoved: nulBytesRemoved + chunk.nulBytesRemoved,
+          invalidBytesRemoved: invalidBytesRemoved + chunk.invalidBytesRemoved,
+          controlCharsRemoved: controlCharsRemoved + chunk.controlCharsRemoved,
+          changedFiles
+        });
+      },
+      { sourceEncoding }
+    );
     processedFiles += 1;
     processedRows += fileResult.lineCount;
     processedBytes += fileResult.totalBytesRead;
     nulBytesRemoved += fileResult.nulBytesRemoved;
+    invalidBytesRemoved += fileResult.invalidBytesRemoved;
+    controlCharsRemoved += fileResult.controlCharsRemoved;
     changedFiles += fileResult.changed ? 1 : 0;
     fileSummaries.push({
       dataset: filePlan.dataset,
@@ -7999,7 +8196,9 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
       outputPath: filePlan.outputPath,
       lineCount: fileResult.lineCount,
       changed: fileResult.changed,
-      nulBytesRemoved: fileResult.nulBytesRemoved
+      nulBytesRemoved: fileResult.nulBytesRemoved,
+      invalidBytesRemoved: fileResult.invalidBytesRemoved,
+      controlCharsRemoved: fileResult.controlCharsRemoved
     });
   }
   options.onProgress?.({
@@ -8007,6 +8206,8 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
     totalFiles: plan.totalFiles,
     processedRows,
     nulBytesRemoved,
+    invalidBytesRemoved,
+    controlCharsRemoved,
     changedFiles,
     totalBytes: plan.totalBytes
   });
@@ -8018,13 +8219,17 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
     totalBytes: plan.totalBytes,
     processedFiles,
     processedRows,
+    sourceEncoding,
     nulBytesRemoved,
+    invalidBytesRemoved,
+    controlCharsRemoved,
     changedFiles,
     unchangedFiles: plan.totalFiles - changedFiles,
     datasets: plan.datasets,
     files: fileSummaries,
     warnings: [
-      "Sanitization prepares a clean dataset tree for import by removing known low-level byte issues such as NUL bytes before PostgreSQL loading begins.",
+      "Sanitization now writes UTF-8 output and removes invalid bytes plus problematic control characters before PostgreSQL loading begins.",
+      "The PostgreSQL direct import path can use --source-encoding UTF8 when reading files generated by this sanitization command.",
       "The import command still keeps quarantine and row-level recovery for unexpected issues, but sanitizing first reduces the amount of slow fallback work during import."
     ],
     nextStep: inferNextStep3(outputPath)
@@ -8817,7 +9022,7 @@ async function exportPostgresCsvDataset(inputPath, options = {}) {
 // src/services/postgres-direct/generator.ts
 import { mkdir as mkdir9, stat as stat7, writeFile as writeFile6 } from "fs/promises";
 import path17 from "path";
-var DEFAULT_SOURCE_ENCODING = "WIN1252";
+var DEFAULT_SOURCE_ENCODING = "UTF8";
 function defaultPostgresDirectOutputPath(inputPath) {
   const baseName = path17.basename(inputPath);
   if (baseName.toLowerCase() === "sanitized") {
@@ -8832,7 +9037,7 @@ function normalizeSourceEncoding(value) {
   const encoding = (value ?? DEFAULT_SOURCE_ENCODING).trim();
   if (!/^[A-Za-z0-9_-]+$/.test(encoding)) {
     throw new ValidationError(
-      `Invalid source encoding: ${value}. Use a PostgreSQL client encoding name such as WIN1252 or UTF8.`
+      `Invalid source encoding: ${value}. Use a PostgreSQL client encoding name such as UTF8, WIN1252 or LATIN1.`
     );
   }
   return encoding.toUpperCase();
@@ -8967,7 +9172,8 @@ async function generatePostgresDirectScript(inputPath, options = {}) {
       ...validation.ok ? [] : validation.errors,
       "This script imports sanitized Receita files directly with psql \\copy. It avoids rewriting the full dataset into a second CSV tree.",
       "The generated script expects the database schema generated by cnpj-db-loader to be applied before execution.",
-      "Use --source-encoding UTF8 only if your sanitized files are already UTF-8. The default WIN1252 matches the usual Receita file encoding."
+      "The direct PostgreSQL script now defaults to UTF8 because the sanitize command writes clean UTF-8 files.",
+      "Use --source-encoding WIN1252 or LATIN1 only when generating scripts for legacy sanitized files produced by older loader versions."
     ],
     nextStep: inferNextStep5(scriptPath)
   };
@@ -9271,9 +9477,23 @@ function printSanitizeSummary(summary, logFilePath) {
   console.log(
     formatKeyValue("Processed bytes", formatBytes(summary.totalBytes))
   );
+  console.log(formatKeyValue("Source encoding", summary.sourceEncoding));
+  console.log(formatKeyValue("Output encoding", "UTF8"));
   console.log(
     formatKeyValue("Removed NUL bytes", formatCount(summary.nulBytesRemoved))
   );
+  console.log(
+    formatKeyValue(
+      "Removed invalid bytes",
+      formatCount(summary.invalidBytesRemoved)
+    )
+  );
+  console.log(
+    formatKeyValue(
+      "Removed control chars",
+      formatCount(summary.controlCharsRemoved)
+    )
+  );
   console.log(formatKeyValue("Changed files", summary.changedFiles));
   console.log(formatKeyValue("Unchanged files", summary.unchangedFiles));
   if (summary.datasets.length > 0) {
@@ -10053,8 +10273,9 @@ function createSanitizeProgressReporter() {
         `Validated: ${shortPath(event.validatedPath)}`,
         `Output: ${shortPath(event.outputPath)}`,
         `Datasets: ${event.datasets.join(" > ")}`,
+        `Source encoding: ${event.sourceEncoding} > UTF8`,
         `Files: 0/${formatCount(event.totalFiles)} | Bytes: ${formatBytes(0)} / ${formatBytes(event.totalBytes)}`,
-        `Rows counted: ${formatCount(0)} | NUL removed: ${formatCount(0)}`,
+        `Rows: ${formatCount(0)} | NUL: ${formatCount(0)} | Invalid bytes: ${formatCount(0)} | Controls: ${formatCount(0)}`,
         `Current: waiting...`
       ];
       renderBlock([
@@ -10070,8 +10291,9 @@ function createSanitizeProgressReporter() {
         currentLines[1] ?? "",
         currentLines[2] ?? "",
         currentLines[3] ?? "",
+        currentLines[4] ?? "",
         `Files: ${formatCount(event.fileIndex)}/${formatCount(event.totalFiles)} | Bytes: ${formatBytes(event.bytesProcessed)} / ${formatBytes(event.totalBytes)}`,
-        `Rows counted: ${formatCount(event.processedRows)} | NUL removed: ${formatCount(event.nulBytesRemoved)} | Changed files: ${formatCount(event.changedFiles)}`,
+        `Rows: ${formatCount(event.processedRows)} | NUL: ${formatCount(event.nulBytesRemoved)} | Invalid bytes: ${formatCount(event.invalidBytesRemoved)} | Controls: ${formatCount(event.controlCharsRemoved)} | Changed: ${formatCount(event.changedFiles)}`,
         `Current: ${shortPath(event.currentFileDisplayPath)}`
       ];
       renderBlock([
@@ -10091,6 +10313,18 @@ function createSanitizeProgressReporter() {
     console.log(
       formatKeyValue("Removed NUL bytes", formatCount(event.nulBytesRemoved))
     );
+    console.log(
+      formatKeyValue(
+        "Removed invalid bytes",
+        formatCount(event.invalidBytesRemoved)
+      )
+    );
+    console.log(
+      formatKeyValue(
+        "Removed control chars",
+        formatCount(event.controlCharsRemoved)
+      )
+    );
     console.log(
       formatKeyValue("Changed files", formatCount(event.changedFiles))
     );
@@ -11203,7 +11437,7 @@ function registerPostgresCommands(program) {
     "Generated psql script file name. Defaults to import-postgres-direct.sql."
   ).option(
     "--source-encoding <encoding>",
-    "PostgreSQL client encoding used while reading sanitized Receita files. Defaults to WIN1252."
+    "PostgreSQL client encoding used while reading sanitized Receita files. Defaults to UTF8."
   ).option("-f, --force", "Skip the confirmation prompt.").description(
     "Generate a direct psql import script that loads sanitized Receita files without rewriting them into new CSV files."
   ).action(
@@ -11353,6 +11587,9 @@ function registerSanitizeCommands(program) {
   ).option(
     "--dataset <dataset>",
     "Sanitize only one validated dataset block (for example: establishments or companies)."
+  ).option(
+    "--source-encoding <encoding>",
+    "Source file encoding used while reading Receita files. Defaults to WIN1252 and writes clean UTF-8 output."
   ).option("-f, --force", "Skip the confirmation prompt.").description(
     "Prepare a sanitized dataset tree before import by removing known low-level byte issues such as NUL bytes."
   ).action(
@@ -11376,6 +11613,9 @@ function registerSanitizeCommands(program) {
       if (options.dataset) {
         sanitizeOptions.dataset = options.dataset;
       }
+      if (options.sourceEncoding) {
+        sanitizeOptions.sourceEncoding = options.sourceEncoding;
+      }
       const summary = await sanitizeInputDirectory(input2, sanitizeOptions);
       const logFilePath = await writeCommandLog("sanitize", summary);
       printSanitizeSummary(summary, logFilePath);