npm - @danielarndt0/cnpj-db-loader - Version diff - 2.4.0-beta.1 → 2.4.0-beta.2 - Mend

@danielarndt0/cnpj-db-loader 2.4.0-beta.1 → 2.4.0-beta.2

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

package/dist/index.d.ts CHANGED Viewed

@@ -582,6 +582,8 @@ declare function showQuarantineRow(id: number, options?: {
     dbUrl?: string;
 }): Promise<QuarantineRecord>;
+type SanitizeSourceEncoding = "WIN1252" | "LATIN1" | "UTF8";
 type SanitizeDatasetType = Exclude<DatasetType, "zip-archive" | "unknown">;
 type SanitizeFilePlan = {
     dataset: SanitizeDatasetType;
@@ -607,7 +609,10 @@ type SanitizeSummary = {
     totalBytes: number;
     processedFiles: number;
     processedRows: number;
+    sourceEncoding: SanitizeSourceEncoding;
     nulBytesRemoved: number;
+    invalidBytesRemoved: number;
+    controlCharsRemoved: number;
     changedFiles: number;
     unchangedFiles: number;
     datasets: SanitizeDatasetType[];
@@ -618,6 +623,8 @@ type SanitizeSummary = {
         lineCount: number;
         changed: boolean;
         nulBytesRemoved: number;
+        invalidBytesRemoved: number;
+        controlCharsRemoved: number;
     }>;
     warnings: string[];
     nextStep?: string | undefined;
@@ -629,6 +636,7 @@ type SanitizeProgressEvent = {
     totalFiles: number;
     totalBytes: number;
     datasets: SanitizeDatasetType[];
+    sourceEncoding: SanitizeSourceEncoding;
 } | {
     kind: "progress";
     currentFileDisplayPath: string;
@@ -640,12 +648,16 @@ type SanitizeProgressEvent = {
     currentFileSize: number;
     processedRows: number;
     nulBytesRemoved: number;
+    invalidBytesRemoved: number;
+    controlCharsRemoved: number;
     changedFiles: number;
 } | {
     kind: "finish";
     totalFiles: number;
     processedRows: number;
     nulBytesRemoved: number;
+    invalidBytesRemoved: number;
+    controlCharsRemoved: number;
     changedFiles: number;
     totalBytes: number;
 };
@@ -653,6 +665,7 @@ type SanitizeProgressListener = (event: SanitizeProgressEvent) => void;
 type SanitizeOptions = {
     outputPath?: string | undefined;
     dataset?: SanitizeDatasetType | undefined;
+    sourceEncoding?: string | undefined;
     onProgress?: SanitizeProgressListener | undefined;
 };
@@ -1087,4 +1100,4 @@ declare function exportPostgresCsvDataset(inputPath: string, options?: PostgresC
 declare function generatePostgresDirectScript(inputPath: string, options?: PostgresDirectScriptOptions): Promise<PostgresDirectScriptSummary>;
-export { type AppConfig, type AppEnvironment, AppError, type CheckpointCleanupPhase, DEFAULT_FEDERAL_REVENUE_DOWNLOAD_ROOT, DEFAULT_FEDERAL_REVENUE_SHARE_TOKEN, DEFAULT_FEDERAL_REVENUE_USER_AGENT, DEFAULT_FEDERAL_REVENUE_WEBDAV_URL, type DatabaseCleanupSummary, type DatabaseConfig, type DatasetBlock, type DatasetType, type ExtractionEntry, type ExtractionProgressEvent, type ExtractionProgressListener, type ExtractionSummary, FEDERAL_REVENUE_CONTROL_DIR, FEDERAL_REVENUE_CONTROL_SCOPE, FEDERAL_REVENUE_MANIFEST_VERSION, type Failure, type FederalRevenueCheckOptions, type FederalRevenueCheckSummary, type FederalRevenueCleanMode, type FederalRevenueCleanOptions, type FederalRevenueCleanSummary, type FederalRevenueClientOptions, type FederalRevenueDownloadEntry, type FederalRevenueDownloadOptions, type FederalRevenueDownloadProgressEvent, type FederalRevenueDownloadProgressListener, type FederalRevenueDownloadStatus, type FederalRevenueDownloadSummary, type FederalRevenueFile, type FederalRevenueLocalFileStatus, type FederalRevenueLocalStatusEntry, type FederalRevenueLockFile, type FederalRevenueManifest, type FederalRevenueManifestFile, type FederalRevenueManifestLastCommand, type FederalRevenueManifestLastStatus, type FederalRevenueReference, type FederalRevenueReferenceMode, type FederalRevenueReferenceSelection, type FederalRevenueRetryOptions, type FederalRevenueStatusOptions, type FederalRevenueStatusSummary, type FederalRevenueSyncLockOptions, type FederalRevenueSyncOptions, type FederalRevenueSyncSummary, type FileInspection, type ImportCheckpointRecord, type ImportCheckpointStatus, type ImportDatasetPlan, type ImportDatasetType, type ImportFilePlan, type ImportOptions, type ImportPerformanceSummary, type ImportPhaseStatus, type ImportPlanRecord, type ImportProgressEvent, type ImportProgressListener, type ImportSchemaCapabilities, type ImportSummary, type InputDetectionMode, type InputMode, type InspectSummary, type LogLevel, type LogStatus, type 
PostgresCsvDatasetSummary, type PostgresCsvExportOptions, type PostgresCsvExportProgressEvent, type PostgresCsvExportProgressListener, type PostgresCsvExportSummary, type PostgresCsvFile, type PostgresDirectScriptDatasetSummary, type PostgresDirectScriptOptions, type PostgresDirectScriptProgressEvent, type PostgresDirectScriptProgressListener, type PostgresDirectScriptSummary, type PostgresDirectSourceFile, type QuarantineListFilters, type QuarantineListSummary, type QuarantineRecord, type QuarantineStatsFilters, type QuarantineStatsSummary, type Result, type SanitizeDatasetType, type SanitizeOptions, type SanitizePlan, type SanitizeProgressEvent, type SanitizeProgressListener, type SanitizeSummary, type SchemaGenerationOptions, type SchemaProfile, ServiceError, type StructuredLogEntry, type SupportedOs, ValidationError, type ValidationSummary, appendJsonLinesLog, assertPostgresUrl, buildFederalRevenueDownloadHeaders, buildFederalRevenueReferenceOutputPath, checkFederalRevenueDataset, cleanFederalRevenueDataset, cleanupDatabaseCheckpointsData, cleanupDatabaseMaterializedData, cleanupDatabasePlansData, cleanupDatabaseStagingData, createFederalRevenueManifest, createJsonLinesLog, defaultExtractedOutputPath, detectOs, downloadFederalRevenueDataset, ensureDirectory, evaluateFederalRevenueManifestFile, evaluateFederalRevenueManifestFiles, exportPostgresCsvDataset, extractArchives, finalizeFederalRevenueManifest, generatePostgresDirectScript, generateSchemaSql, getAllLayouts, getCurrentFederalRevenueReference, getFederalRevenueControlDirectory, getFederalRevenueManifestPath, getFederalRevenueStatus, getFederalRevenueSyncLockPath, getLayoutSummary, getLogsDirectoryPath, getQuarantineStats, getUserAppDirectoryPath, importDataToDatabase, inspectFiles, listFederalRevenueFiles, listFederalRevenueReferences, listQuarantineRows, loadImportDataToStaging, materializeImportedData, prettyJson, readDatabaseConfig, readFederalRevenueManifest, resetDefaultDbUrl, resolveDatabaseUrl, 
resolveDbUrl, resolveFederalRevenueReference, resolveInputMode, resolveSchemaProfile, retryFederalRevenueDataset, runDoctor, safeReadText, safeWriteText, sanitizeInputDirectory, setDefaultDbUrl, showQuarantineRow, syncFederalRevenueDataset, testDatabaseConnection, toTitleCase, updateFederalRevenueManifestFile, validateFederalRevenueReference, validateInputDirectory, withFederalRevenueSyncLock, writeCommandFailureLog, writeCommandLog, writeDatabaseConfig, writeFederalRevenueManifest, writeSchemaFile };
+export { type AppConfig, type AppEnvironment, AppError, type CheckpointCleanupPhase, DEFAULT_FEDERAL_REVENUE_DOWNLOAD_ROOT, DEFAULT_FEDERAL_REVENUE_SHARE_TOKEN, DEFAULT_FEDERAL_REVENUE_USER_AGENT, DEFAULT_FEDERAL_REVENUE_WEBDAV_URL, type DatabaseCleanupSummary, type DatabaseConfig, type DatasetBlock, type DatasetType, type ExtractionEntry, type ExtractionProgressEvent, type ExtractionProgressListener, type ExtractionSummary, FEDERAL_REVENUE_CONTROL_DIR, FEDERAL_REVENUE_CONTROL_SCOPE, FEDERAL_REVENUE_MANIFEST_VERSION, type Failure, type FederalRevenueCheckOptions, type FederalRevenueCheckSummary, type FederalRevenueCleanMode, type FederalRevenueCleanOptions, type FederalRevenueCleanSummary, type FederalRevenueClientOptions, type FederalRevenueDownloadEntry, type FederalRevenueDownloadOptions, type FederalRevenueDownloadProgressEvent, type FederalRevenueDownloadProgressListener, type FederalRevenueDownloadStatus, type FederalRevenueDownloadSummary, type FederalRevenueFile, type FederalRevenueLocalFileStatus, type FederalRevenueLocalStatusEntry, type FederalRevenueLockFile, type FederalRevenueManifest, type FederalRevenueManifestFile, type FederalRevenueManifestLastCommand, type FederalRevenueManifestLastStatus, type FederalRevenueReference, type FederalRevenueReferenceMode, type FederalRevenueReferenceSelection, type FederalRevenueRetryOptions, type FederalRevenueStatusOptions, type FederalRevenueStatusSummary, type FederalRevenueSyncLockOptions, type FederalRevenueSyncOptions, type FederalRevenueSyncSummary, type FileInspection, type ImportCheckpointRecord, type ImportCheckpointStatus, type ImportDatasetPlan, type ImportDatasetType, type ImportFilePlan, type ImportOptions, type ImportPerformanceSummary, type ImportPhaseStatus, type ImportPlanRecord, type ImportProgressEvent, type ImportProgressListener, type ImportSchemaCapabilities, type ImportSummary, type InputDetectionMode, type InputMode, type InspectSummary, type LogLevel, type LogStatus, type 
PostgresCsvDatasetSummary, type PostgresCsvExportOptions, type PostgresCsvExportProgressEvent, type PostgresCsvExportProgressListener, type PostgresCsvExportSummary, type PostgresCsvFile, type PostgresDirectScriptDatasetSummary, type PostgresDirectScriptOptions, type PostgresDirectScriptProgressEvent, type PostgresDirectScriptProgressListener, type PostgresDirectScriptSummary, type PostgresDirectSourceFile, type QuarantineListFilters, type QuarantineListSummary, type QuarantineRecord, type QuarantineStatsFilters, type QuarantineStatsSummary, type Result, type SanitizeDatasetType, type SanitizeOptions, type SanitizePlan, type SanitizeProgressEvent, type SanitizeProgressListener, type SanitizeSourceEncoding, type SanitizeSummary, type SchemaGenerationOptions, type SchemaProfile, ServiceError, type StructuredLogEntry, type SupportedOs, ValidationError, type ValidationSummary, appendJsonLinesLog, assertPostgresUrl, buildFederalRevenueDownloadHeaders, buildFederalRevenueReferenceOutputPath, checkFederalRevenueDataset, cleanFederalRevenueDataset, cleanupDatabaseCheckpointsData, cleanupDatabaseMaterializedData, cleanupDatabasePlansData, cleanupDatabaseStagingData, createFederalRevenueManifest, createJsonLinesLog, defaultExtractedOutputPath, detectOs, downloadFederalRevenueDataset, ensureDirectory, evaluateFederalRevenueManifestFile, evaluateFederalRevenueManifestFiles, exportPostgresCsvDataset, extractArchives, finalizeFederalRevenueManifest, generatePostgresDirectScript, generateSchemaSql, getAllLayouts, getCurrentFederalRevenueReference, getFederalRevenueControlDirectory, getFederalRevenueManifestPath, getFederalRevenueStatus, getFederalRevenueSyncLockPath, getLayoutSummary, getLogsDirectoryPath, getQuarantineStats, getUserAppDirectoryPath, importDataToDatabase, inspectFiles, listFederalRevenueFiles, listFederalRevenueReferences, listQuarantineRows, loadImportDataToStaging, materializeImportedData, prettyJson, readDatabaseConfig, readFederalRevenueManifest, 
resetDefaultDbUrl, resolveDatabaseUrl, resolveDbUrl, resolveFederalRevenueReference, resolveInputMode, resolveSchemaProfile, retryFederalRevenueDataset, runDoctor, safeReadText, safeWriteText, sanitizeInputDirectory, setDefaultDbUrl, showQuarantineRow, syncFederalRevenueDataset, testDatabaseConnection, toTitleCase, updateFederalRevenueManifestFile, validateFederalRevenueReference, validateInputDirectory, withFederalRevenueSyncLock, writeCommandFailureLog, writeCommandLog, writeDatabaseConfig, writeFederalRevenueManifest, writeSchemaFile };

package/dist/index.js CHANGED Viewed

@@ -7853,81 +7853,264 @@ function isRecognizedSanitizeEntry(entry) {
   return entry.entryKind === "file" && entry.inferredType !== "zip-archive" && entry.inferredType !== "unknown";
 }
+// src/services/sanitize/encoding.ts
+import { StringDecoder } from "string_decoder";
+var WINDOWS_1252_C1_MAP = {
+  128: "\u20AC",
+  130: "\u201A",
+  131: "\u0192",
+  132: "\u201E",
+  133: "\u2026",
+  134: "\u2020",
+  135: "\u2021",
+  136: "\u02C6",
+  137: "\u2030",
+  138: "\u0160",
+  139: "\u2039",
+  140: "\u0152",
+  142: "\u017D",
+  145: "\u2018",
+  146: "\u2019",
+  147: "\u201C",
+  148: "\u201D",
+  149: "\u2022",
+  150: "\u2013",
+  151: "\u2014",
+  152: "\u02DC",
+  153: "\u2122",
+  154: "\u0161",
+  155: "\u203A",
+  156: "\u0153",
+  158: "\u017E",
+  159: "\u0178"
+};
+function normalizeSanitizeSourceEncoding(value) {
+  const normalized = (value ?? "WIN1252").trim().toUpperCase().replace(/_/g, "-");
+  switch (normalized) {
+    case "WIN1252":
+    case "WINDOWS-1252":
+    case "CP1252":
+      return "WIN1252";
+    case "LATIN1":
+    case "LATIN-1":
+    case "ISO-8859-1":
+    case "ISO8859-1":
+      return "LATIN1";
+    case "UTF8":
+    case "UTF-8":
+      return "UTF8";
+    default:
+      throw new ValidationError(
+        `Unsupported sanitize source encoding: ${value}. Supported values: WIN1252, LATIN1, UTF8.`
+      );
+  }
+}
+function isAllowedControlCodePoint(codePoint) {
+  return codePoint === 9 || codePoint === 10 || codePoint === 13;
+}
+function isProblematicControlCodePoint(codePoint) {
+  if (isAllowedControlCodePoint(codePoint)) {
+    return false;
+  }
+  return codePoint >= 0 && codePoint <= 31 || codePoint === 127 || codePoint >= 128 && codePoint <= 159 || codePoint === 65279;
+}
+function sanitizeDecodedText(text) {
+  const output = [];
+  let invalidBytesRemoved = 0;
+  let controlCharsRemoved = 0;
+  for (const char of text) {
+    const codePoint = char.codePointAt(0);
+    if (codePoint === 65533) {
+      invalidBytesRemoved += 1;
+      continue;
+    }
+    if (isProblematicControlCodePoint(codePoint)) {
+      controlCharsRemoved += 1;
+      continue;
+    }
+    output.push(char);
+  }
+  return {
+    text: output.join(""),
+    invalidBytesRemoved,
+    controlCharsRemoved
+  };
+}
+var SanitizeEncodingNormalizer = class {
+  constructor(sourceEncoding) {
+    this.sourceEncoding = sourceEncoding;
+    this.utf8Decoder = sourceEncoding === "UTF8" ? new StringDecoder("utf8") : void 0;
+  }
+  sourceEncoding;
+  utf8Decoder;
+  normalizeChunk(chunk) {
+    if (this.sourceEncoding === "UTF8") {
+      const decoded = this.utf8Decoder.write(chunk);
+      const sanitized = sanitizeDecodedText(decoded);
+      const nulBytesRemoved = [...decoded].filter(
+        (char) => char === "\0"
+      ).length;
+      return {
+        ...sanitized,
+        nulBytesRemoved
+      };
+    }
+    return this.normalizeSingleByteChunk(chunk);
+  }
+  flush() {
+    if (!this.utf8Decoder) {
+      return {
+        text: "",
+        nulBytesRemoved: 0,
+        invalidBytesRemoved: 0,
+        controlCharsRemoved: 0
+      };
+    }
+    const decoded = this.utf8Decoder.end();
+    const sanitized = sanitizeDecodedText(decoded);
+    const nulBytesRemoved = [...decoded].filter((char) => char === "\0").length;
+    return {
+      ...sanitized,
+      nulBytesRemoved
+    };
+  }
+  normalizeSingleByteChunk(chunk) {
+    const output = [];
+    let nulBytesRemoved = 0;
+    let invalidBytesRemoved = 0;
+    let controlCharsRemoved = 0;
+    for (const byte of chunk) {
+      if (byte === 0) {
+        nulBytesRemoved += 1;
+        continue;
+      }
+      if (byte < 32 || byte === 127) {
+        if (isAllowedControlCodePoint(byte)) {
+          output.push(String.fromCharCode(byte));
+        } else {
+          controlCharsRemoved += 1;
+        }
+        continue;
+      }
+      if (byte >= 128 && byte <= 159) {
+        if (this.sourceEncoding === "WIN1252") {
+          const mapped = WINDOWS_1252_C1_MAP[byte];
+          if (mapped === void 0) {
+            invalidBytesRemoved += 1;
+          } else {
+            output.push(mapped);
+          }
+        } else {
+          controlCharsRemoved += 1;
+        }
+        continue;
+      }
+      output.push(String.fromCharCode(byte));
+    }
+    return {
+      text: output.join(""),
+      nulBytesRemoved,
+      invalidBytesRemoved,
+      controlCharsRemoved
+    };
+  }
+};
 // src/services/sanitize/runner.ts
 import { createReadStream as createReadStream2, createWriteStream as createWriteStream2 } from "fs";
 import { mkdir as mkdir7 } from "fs/promises";
 import path13 from "path";
-function stripNulBytes(chunk) {
-  let removed = 0;
-  for (let index = 0; index < chunk.length; index += 1) {
-    if (chunk[index] === 0) {
-      removed += 1;
-    }
+async function writeUtf8(output, value) {
+  if (value.length === 0) {
+    return;
   }
-  if (removed === 0) {
-    return { buffer: chunk, removed: 0 };
+  if (!output.write(value, "utf8")) {
+    await new Promise((resolve, reject) => {
+      output.once("drain", resolve);
+      output.once("error", reject);
+    });
   }
-  const sanitized = Buffer.allocUnsafe(chunk.length - removed);
-  let outputIndex = 0;
-  for (let index = 0; index < chunk.length; index += 1) {
-    const value = chunk[index];
-    if (value !== 0) {
-      sanitized[outputIndex] = value;
-      outputIndex += 1;
+}
+function countNewlines(value) {
+  let count = 0;
+  for (let index = 0; index < value.length; index += 1) {
+    if (value[index] === "\n") {
+      count += 1;
     }
   }
-  return { buffer: sanitized, removed };
+  return count;
 }
-async function sanitizeDatasetFile(plan, onChunk) {
+async function sanitizeDatasetFile(plan, onChunk, options = {}) {
   await mkdir7(path13.dirname(plan.outputPath), { recursive: true });
+  const sourceEncoding = normalizeSanitizeSourceEncoding(
+    options.sourceEncoding
+  );
+  const normalizer = new SanitizeEncodingNormalizer(sourceEncoding);
   const input = createReadStream2(plan.absolutePath);
-  const output = createWriteStream2(plan.outputPath);
+  const output = createWriteStream2(plan.outputPath, { encoding: "utf8" });
   let totalBytesRead = 0;
   let totalBytesWritten = 0;
   let nulBytesRemoved = 0;
+  let invalidBytesRemoved = 0;
+  let controlCharsRemoved = 0;
   let lineCount = 0;
-  let sawAnyByte = false;
-  let lastByteWasNewline = false;
+  let sawAnyCharacter = false;
+  let lastCharacterWasNewline = false;
+  const processText = async (text) => {
+    if (text.length === 0) {
+      return;
+    }
+    sawAnyCharacter = true;
+    lineCount += countNewlines(text);
+    lastCharacterWasNewline = text.endsWith("\n");
+    totalBytesWritten += Buffer.byteLength(text, "utf8");
+    await writeUtf8(output, text);
+  };
   try {
     for await (const chunk of input) {
       const chunkBuffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
       totalBytesRead += chunkBuffer.length;
-      const { buffer, removed } = stripNulBytes(chunkBuffer);
-      nulBytesRemoved += removed;
-      sawAnyByte = sawAnyByte || buffer.length > 0;
-      for (let index = 0; index < buffer.length; index += 1) {
-        if (buffer[index] === 10) {
-          lineCount += 1;
-        }
-      }
-      if (buffer.length > 0) {
-        lastByteWasNewline = buffer[buffer.length - 1] === 10;
-      }
-      totalBytesWritten += buffer.length;
-      output.write(buffer);
+      const normalized = normalizer.normalizeChunk(chunkBuffer);
+      nulBytesRemoved += normalized.nulBytesRemoved;
+      invalidBytesRemoved += normalized.invalidBytesRemoved;
+      controlCharsRemoved += normalized.controlCharsRemoved;
+      await processText(normalized.text);
       onChunk?.({
         bytesProcessed: chunkBuffer.length,
         fileBytesProcessed: totalBytesRead,
         currentFileSize: plan.fileSize,
         processedRows: lineCount,
-        nulBytesRemoved
+        nulBytesRemoved,
+        invalidBytesRemoved,
+        controlCharsRemoved
       });
     }
-    if (sawAnyByte && !lastByteWasNewline) {
+    const flushed = normalizer.flush();
+    nulBytesRemoved += flushed.nulBytesRemoved;
+    invalidBytesRemoved += flushed.invalidBytesRemoved;
+    controlCharsRemoved += flushed.controlCharsRemoved;
+    await processText(flushed.text);
+    if (sawAnyCharacter && !lastCharacterWasNewline) {
       lineCount += 1;
     }
   } finally {
     input.close();
     output.end();
-    await new Promise((resolve) => output.on("finish", () => resolve()));
+    await new Promise((resolve, reject) => {
+      output.on("finish", () => resolve());
+      output.on("error", (error) => reject(error));
+    });
   }
   return {
     plan,
     totalBytesRead,
     totalBytesWritten,
+    sourceEncoding,
     nulBytesRemoved,
+    invalidBytesRemoved,
+    controlCharsRemoved,
     lineCount,
-    changed: nulBytesRemoved > 0 || totalBytesRead !== totalBytesWritten
+    changed: nulBytesRemoved > 0 || invalidBytesRemoved > 0 || controlCharsRemoved > 0 || totalBytesRead !== totalBytesWritten
   };
 }
@@ -7990,40 +8173,54 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
       "No recognized validated dataset files were found for sanitization."
     );
   }
+  const sourceEncoding = normalizeSanitizeSourceEncoding(
+    options.sourceEncoding
+  );
   options.onProgress?.({
     kind: "start",
     validatedPath,
     outputPath,
     totalFiles: plan.totalFiles,
     totalBytes: plan.totalBytes,
-    datasets: plan.datasets
+    datasets: plan.datasets,
+    sourceEncoding
   });
   let processedFiles = 0;
   let processedRows = 0;
   let processedBytes = 0;
   let nulBytesRemoved = 0;
+  let invalidBytesRemoved = 0;
+  let controlCharsRemoved = 0;
   let changedFiles = 0;
   const fileSummaries = [];
   for (const [index, filePlan] of plan.files.entries()) {
-    const fileResult = await sanitizeDatasetFile(filePlan, (chunk) => {
-      options.onProgress?.({
-        kind: "progress",
-        currentFileDisplayPath: filePlan.displayPath,
-        fileIndex: index + 1,
-        totalFiles: plan.totalFiles,
-        bytesProcessed: processedBytes + chunk.fileBytesProcessed,
-        totalBytes: plan.totalBytes,
-        fileBytesProcessed: chunk.fileBytesProcessed,
-        currentFileSize: chunk.currentFileSize,
-        processedRows: processedRows + chunk.processedRows,
-        nulBytesRemoved: nulBytesRemoved + chunk.nulBytesRemoved,
-        changedFiles
-      });
-    });
+    const fileResult = await sanitizeDatasetFile(
+      filePlan,
+      (chunk) => {
+        options.onProgress?.({
+          kind: "progress",
+          currentFileDisplayPath: filePlan.displayPath,
+          fileIndex: index + 1,
+          totalFiles: plan.totalFiles,
+          bytesProcessed: processedBytes + chunk.fileBytesProcessed,
+          totalBytes: plan.totalBytes,
+          fileBytesProcessed: chunk.fileBytesProcessed,
+          currentFileSize: chunk.currentFileSize,
+          processedRows: processedRows + chunk.processedRows,
+          nulBytesRemoved: nulBytesRemoved + chunk.nulBytesRemoved,
+          invalidBytesRemoved: invalidBytesRemoved + chunk.invalidBytesRemoved,
+          controlCharsRemoved: controlCharsRemoved + chunk.controlCharsRemoved,
+          changedFiles
+        });
+      },
+      { sourceEncoding }
+    );
     processedFiles += 1;
     processedRows += fileResult.lineCount;
     processedBytes += fileResult.totalBytesRead;
     nulBytesRemoved += fileResult.nulBytesRemoved;
+    invalidBytesRemoved += fileResult.invalidBytesRemoved;
+    controlCharsRemoved += fileResult.controlCharsRemoved;
     changedFiles += fileResult.changed ? 1 : 0;
     fileSummaries.push({
       dataset: filePlan.dataset,
@@ -8031,7 +8228,9 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
       outputPath: filePlan.outputPath,
       lineCount: fileResult.lineCount,
       changed: fileResult.changed,
-      nulBytesRemoved: fileResult.nulBytesRemoved
+      nulBytesRemoved: fileResult.nulBytesRemoved,
+      invalidBytesRemoved: fileResult.invalidBytesRemoved,
+      controlCharsRemoved: fileResult.controlCharsRemoved
     });
   }
   options.onProgress?.({
@@ -8039,6 +8238,8 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
     totalFiles: plan.totalFiles,
     processedRows,
     nulBytesRemoved,
+    invalidBytesRemoved,
+    controlCharsRemoved,
     changedFiles,
     totalBytes: plan.totalBytes
   });
@@ -8050,13 +8251,17 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
     totalBytes: plan.totalBytes,
     processedFiles,
     processedRows,
+    sourceEncoding,
     nulBytesRemoved,
+    invalidBytesRemoved,
+    controlCharsRemoved,
     changedFiles,
     unchangedFiles: plan.totalFiles - changedFiles,
     datasets: plan.datasets,
     files: fileSummaries,
     warnings: [
-      "Sanitization prepares a clean dataset tree for import by removing known low-level byte issues such as NUL bytes before PostgreSQL loading begins.",
+      "Sanitization now writes UTF-8 output and removes invalid bytes plus problematic control characters before PostgreSQL loading begins.",
+      "The PostgreSQL direct import path can use --source-encoding UTF8 when reading files generated by this sanitization command.",
       "The import command still keeps quarantine and row-level recovery for unexpected issues, but sanitizing first reduces the amount of slow fallback work during import."
     ],
     nextStep: inferNextStep3(outputPath)
@@ -8849,7 +9054,7 @@ async function exportPostgresCsvDataset(inputPath, options = {}) {
 // src/services/postgres-direct/generator.ts
 import { mkdir as mkdir9, stat as stat7, writeFile as writeFile6 } from "fs/promises";
 import path17 from "path";
-var DEFAULT_SOURCE_ENCODING = "WIN1252";
+var DEFAULT_SOURCE_ENCODING = "UTF8";
 function defaultPostgresDirectOutputPath(inputPath) {
   const baseName = path17.basename(inputPath);
   if (baseName.toLowerCase() === "sanitized") {
@@ -8864,7 +9069,7 @@ function normalizeSourceEncoding(value) {
   const encoding = (value ?? DEFAULT_SOURCE_ENCODING).trim();
   if (!/^[A-Za-z0-9_-]+$/.test(encoding)) {
     throw new ValidationError(
-      `Invalid source encoding: ${value}. Use a PostgreSQL client encoding name such as WIN1252 or UTF8.`
+      `Invalid source encoding: ${value}. Use a PostgreSQL client encoding name such as UTF8, WIN1252 or LATIN1.`
     );
   }
   return encoding.toUpperCase();
@@ -8999,7 +9204,8 @@ async function generatePostgresDirectScript(inputPath, options = {}) {
       ...validation.ok ? [] : validation.errors,
       "This script imports sanitized Receita files directly with psql \\copy. It avoids rewriting the full dataset into a second CSV tree.",
       "The generated script expects the database schema generated by cnpj-db-loader to be applied before execution.",
-      "Use --source-encoding UTF8 only if your sanitized files are already UTF-8. The default WIN1252 matches the usual Receita file encoding."
+      "The direct PostgreSQL script now defaults to UTF8 because the sanitize command writes clean UTF-8 files.",
+      "Use --source-encoding WIN1252 or LATIN1 only when generating scripts for legacy sanitized files produced by older loader versions."
     ],
     nextStep: inferNextStep5(scriptPath)
   };