@danielarndt0/cnpj-db-loader 2.4.0-beta.1 → 2.4.0-beta.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -582,6 +582,8 @@ declare function showQuarantineRow(id: number, options?: {
582
582
  dbUrl?: string;
583
583
  }): Promise<QuarantineRecord>;
584
584
 
585
/**
 * Canonical names of the source encodings the sanitize step reports in its
 * summary and progress events.
 */
type SanitizeSourceEncoding =
  | "WIN1252"
  | "LATIN1"
  | "UTF8";
586
+
585
587
  type SanitizeDatasetType = Exclude<DatasetType, "zip-archive" | "unknown">;
586
588
  type SanitizeFilePlan = {
587
589
  dataset: SanitizeDatasetType;
@@ -607,7 +609,10 @@ type SanitizeSummary = {
607
609
  totalBytes: number;
608
610
  processedFiles: number;
609
611
  processedRows: number;
612
+ sourceEncoding: SanitizeSourceEncoding;
610
613
  nulBytesRemoved: number;
614
+ invalidBytesRemoved: number;
615
+ controlCharsRemoved: number;
611
616
  changedFiles: number;
612
617
  unchangedFiles: number;
613
618
  datasets: SanitizeDatasetType[];
@@ -618,6 +623,8 @@ type SanitizeSummary = {
618
623
  lineCount: number;
619
624
  changed: boolean;
620
625
  nulBytesRemoved: number;
626
+ invalidBytesRemoved: number;
627
+ controlCharsRemoved: number;
621
628
  }>;
622
629
  warnings: string[];
623
630
  nextStep?: string | undefined;
@@ -629,6 +636,7 @@ type SanitizeProgressEvent = {
629
636
  totalFiles: number;
630
637
  totalBytes: number;
631
638
  datasets: SanitizeDatasetType[];
639
+ sourceEncoding: SanitizeSourceEncoding;
632
640
  } | {
633
641
  kind: "progress";
634
642
  currentFileDisplayPath: string;
@@ -640,12 +648,16 @@ type SanitizeProgressEvent = {
640
648
  currentFileSize: number;
641
649
  processedRows: number;
642
650
  nulBytesRemoved: number;
651
+ invalidBytesRemoved: number;
652
+ controlCharsRemoved: number;
643
653
  changedFiles: number;
644
654
  } | {
645
655
  kind: "finish";
646
656
  totalFiles: number;
647
657
  processedRows: number;
648
658
  nulBytesRemoved: number;
659
+ invalidBytesRemoved: number;
660
+ controlCharsRemoved: number;
649
661
  changedFiles: number;
650
662
  totalBytes: number;
651
663
  };
@@ -653,6 +665,7 @@ type SanitizeProgressListener = (event: SanitizeProgressEvent) => void;
653
665
  type SanitizeOptions = {
654
666
  outputPath?: string | undefined;
655
667
  dataset?: SanitizeDatasetType | undefined;
668
+ sourceEncoding?: string | undefined;
656
669
  onProgress?: SanitizeProgressListener | undefined;
657
670
  };
658
671
 
@@ -1087,4 +1100,4 @@ declare function exportPostgresCsvDataset(inputPath: string, options?: PostgresC
1087
1100
 
1088
1101
  declare function generatePostgresDirectScript(inputPath: string, options?: PostgresDirectScriptOptions): Promise<PostgresDirectScriptSummary>;
1089
1102
 
1090
- export { type AppConfig, type AppEnvironment, AppError, type CheckpointCleanupPhase, DEFAULT_FEDERAL_REVENUE_DOWNLOAD_ROOT, DEFAULT_FEDERAL_REVENUE_SHARE_TOKEN, DEFAULT_FEDERAL_REVENUE_USER_AGENT, DEFAULT_FEDERAL_REVENUE_WEBDAV_URL, type DatabaseCleanupSummary, type DatabaseConfig, type DatasetBlock, type DatasetType, type ExtractionEntry, type ExtractionProgressEvent, type ExtractionProgressListener, type ExtractionSummary, FEDERAL_REVENUE_CONTROL_DIR, FEDERAL_REVENUE_CONTROL_SCOPE, FEDERAL_REVENUE_MANIFEST_VERSION, type Failure, type FederalRevenueCheckOptions, type FederalRevenueCheckSummary, type FederalRevenueCleanMode, type FederalRevenueCleanOptions, type FederalRevenueCleanSummary, type FederalRevenueClientOptions, type FederalRevenueDownloadEntry, type FederalRevenueDownloadOptions, type FederalRevenueDownloadProgressEvent, type FederalRevenueDownloadProgressListener, type FederalRevenueDownloadStatus, type FederalRevenueDownloadSummary, type FederalRevenueFile, type FederalRevenueLocalFileStatus, type FederalRevenueLocalStatusEntry, type FederalRevenueLockFile, type FederalRevenueManifest, type FederalRevenueManifestFile, type FederalRevenueManifestLastCommand, type FederalRevenueManifestLastStatus, type FederalRevenueReference, type FederalRevenueReferenceMode, type FederalRevenueReferenceSelection, type FederalRevenueRetryOptions, type FederalRevenueStatusOptions, type FederalRevenueStatusSummary, type FederalRevenueSyncLockOptions, type FederalRevenueSyncOptions, type FederalRevenueSyncSummary, type FileInspection, type ImportCheckpointRecord, type ImportCheckpointStatus, type ImportDatasetPlan, type ImportDatasetType, type ImportFilePlan, type ImportOptions, type ImportPerformanceSummary, type ImportPhaseStatus, type ImportPlanRecord, type ImportProgressEvent, type ImportProgressListener, type ImportSchemaCapabilities, type ImportSummary, type InputDetectionMode, type InputMode, type InspectSummary, type LogLevel, type LogStatus, type 
PostgresCsvDatasetSummary, type PostgresCsvExportOptions, type PostgresCsvExportProgressEvent, type PostgresCsvExportProgressListener, type PostgresCsvExportSummary, type PostgresCsvFile, type PostgresDirectScriptDatasetSummary, type PostgresDirectScriptOptions, type PostgresDirectScriptProgressEvent, type PostgresDirectScriptProgressListener, type PostgresDirectScriptSummary, type PostgresDirectSourceFile, type QuarantineListFilters, type QuarantineListSummary, type QuarantineRecord, type QuarantineStatsFilters, type QuarantineStatsSummary, type Result, type SanitizeDatasetType, type SanitizeOptions, type SanitizePlan, type SanitizeProgressEvent, type SanitizeProgressListener, type SanitizeSummary, type SchemaGenerationOptions, type SchemaProfile, ServiceError, type StructuredLogEntry, type SupportedOs, ValidationError, type ValidationSummary, appendJsonLinesLog, assertPostgresUrl, buildFederalRevenueDownloadHeaders, buildFederalRevenueReferenceOutputPath, checkFederalRevenueDataset, cleanFederalRevenueDataset, cleanupDatabaseCheckpointsData, cleanupDatabaseMaterializedData, cleanupDatabasePlansData, cleanupDatabaseStagingData, createFederalRevenueManifest, createJsonLinesLog, defaultExtractedOutputPath, detectOs, downloadFederalRevenueDataset, ensureDirectory, evaluateFederalRevenueManifestFile, evaluateFederalRevenueManifestFiles, exportPostgresCsvDataset, extractArchives, finalizeFederalRevenueManifest, generatePostgresDirectScript, generateSchemaSql, getAllLayouts, getCurrentFederalRevenueReference, getFederalRevenueControlDirectory, getFederalRevenueManifestPath, getFederalRevenueStatus, getFederalRevenueSyncLockPath, getLayoutSummary, getLogsDirectoryPath, getQuarantineStats, getUserAppDirectoryPath, importDataToDatabase, inspectFiles, listFederalRevenueFiles, listFederalRevenueReferences, listQuarantineRows, loadImportDataToStaging, materializeImportedData, prettyJson, readDatabaseConfig, readFederalRevenueManifest, resetDefaultDbUrl, resolveDatabaseUrl, 
resolveDbUrl, resolveFederalRevenueReference, resolveInputMode, resolveSchemaProfile, retryFederalRevenueDataset, runDoctor, safeReadText, safeWriteText, sanitizeInputDirectory, setDefaultDbUrl, showQuarantineRow, syncFederalRevenueDataset, testDatabaseConnection, toTitleCase, updateFederalRevenueManifestFile, validateFederalRevenueReference, validateInputDirectory, withFederalRevenueSyncLock, writeCommandFailureLog, writeCommandLog, writeDatabaseConfig, writeFederalRevenueManifest, writeSchemaFile };
1103
+ export { type AppConfig, type AppEnvironment, AppError, type CheckpointCleanupPhase, DEFAULT_FEDERAL_REVENUE_DOWNLOAD_ROOT, DEFAULT_FEDERAL_REVENUE_SHARE_TOKEN, DEFAULT_FEDERAL_REVENUE_USER_AGENT, DEFAULT_FEDERAL_REVENUE_WEBDAV_URL, type DatabaseCleanupSummary, type DatabaseConfig, type DatasetBlock, type DatasetType, type ExtractionEntry, type ExtractionProgressEvent, type ExtractionProgressListener, type ExtractionSummary, FEDERAL_REVENUE_CONTROL_DIR, FEDERAL_REVENUE_CONTROL_SCOPE, FEDERAL_REVENUE_MANIFEST_VERSION, type Failure, type FederalRevenueCheckOptions, type FederalRevenueCheckSummary, type FederalRevenueCleanMode, type FederalRevenueCleanOptions, type FederalRevenueCleanSummary, type FederalRevenueClientOptions, type FederalRevenueDownloadEntry, type FederalRevenueDownloadOptions, type FederalRevenueDownloadProgressEvent, type FederalRevenueDownloadProgressListener, type FederalRevenueDownloadStatus, type FederalRevenueDownloadSummary, type FederalRevenueFile, type FederalRevenueLocalFileStatus, type FederalRevenueLocalStatusEntry, type FederalRevenueLockFile, type FederalRevenueManifest, type FederalRevenueManifestFile, type FederalRevenueManifestLastCommand, type FederalRevenueManifestLastStatus, type FederalRevenueReference, type FederalRevenueReferenceMode, type FederalRevenueReferenceSelection, type FederalRevenueRetryOptions, type FederalRevenueStatusOptions, type FederalRevenueStatusSummary, type FederalRevenueSyncLockOptions, type FederalRevenueSyncOptions, type FederalRevenueSyncSummary, type FileInspection, type ImportCheckpointRecord, type ImportCheckpointStatus, type ImportDatasetPlan, type ImportDatasetType, type ImportFilePlan, type ImportOptions, type ImportPerformanceSummary, type ImportPhaseStatus, type ImportPlanRecord, type ImportProgressEvent, type ImportProgressListener, type ImportSchemaCapabilities, type ImportSummary, type InputDetectionMode, type InputMode, type InspectSummary, type LogLevel, type LogStatus, type 
PostgresCsvDatasetSummary, type PostgresCsvExportOptions, type PostgresCsvExportProgressEvent, type PostgresCsvExportProgressListener, type PostgresCsvExportSummary, type PostgresCsvFile, type PostgresDirectScriptDatasetSummary, type PostgresDirectScriptOptions, type PostgresDirectScriptProgressEvent, type PostgresDirectScriptProgressListener, type PostgresDirectScriptSummary, type PostgresDirectSourceFile, type QuarantineListFilters, type QuarantineListSummary, type QuarantineRecord, type QuarantineStatsFilters, type QuarantineStatsSummary, type Result, type SanitizeDatasetType, type SanitizeOptions, type SanitizePlan, type SanitizeProgressEvent, type SanitizeProgressListener, type SanitizeSourceEncoding, type SanitizeSummary, type SchemaGenerationOptions, type SchemaProfile, ServiceError, type StructuredLogEntry, type SupportedOs, ValidationError, type ValidationSummary, appendJsonLinesLog, assertPostgresUrl, buildFederalRevenueDownloadHeaders, buildFederalRevenueReferenceOutputPath, checkFederalRevenueDataset, cleanFederalRevenueDataset, cleanupDatabaseCheckpointsData, cleanupDatabaseMaterializedData, cleanupDatabasePlansData, cleanupDatabaseStagingData, createFederalRevenueManifest, createJsonLinesLog, defaultExtractedOutputPath, detectOs, downloadFederalRevenueDataset, ensureDirectory, evaluateFederalRevenueManifestFile, evaluateFederalRevenueManifestFiles, exportPostgresCsvDataset, extractArchives, finalizeFederalRevenueManifest, generatePostgresDirectScript, generateSchemaSql, getAllLayouts, getCurrentFederalRevenueReference, getFederalRevenueControlDirectory, getFederalRevenueManifestPath, getFederalRevenueStatus, getFederalRevenueSyncLockPath, getLayoutSummary, getLogsDirectoryPath, getQuarantineStats, getUserAppDirectoryPath, importDataToDatabase, inspectFiles, listFederalRevenueFiles, listFederalRevenueReferences, listQuarantineRows, loadImportDataToStaging, materializeImportedData, prettyJson, readDatabaseConfig, readFederalRevenueManifest, 
resetDefaultDbUrl, resolveDatabaseUrl, resolveDbUrl, resolveFederalRevenueReference, resolveInputMode, resolveSchemaProfile, retryFederalRevenueDataset, runDoctor, safeReadText, safeWriteText, sanitizeInputDirectory, setDefaultDbUrl, showQuarantineRow, syncFederalRevenueDataset, testDatabaseConnection, toTitleCase, updateFederalRevenueManifestFile, validateFederalRevenueReference, validateInputDirectory, withFederalRevenueSyncLock, writeCommandFailureLog, writeCommandLog, writeDatabaseConfig, writeFederalRevenueManifest, writeSchemaFile };
package/dist/index.js CHANGED
@@ -7853,81 +7853,264 @@ function isRecognizedSanitizeEntry(entry) {
7853
7853
  return entry.entryKind === "file" && entry.inferredType !== "zip-archive" && entry.inferredType !== "unknown";
7854
7854
  }
7855
7855
 
7856
+ // src/services/sanitize/encoding.ts
7857
+ import { StringDecoder } from "string_decoder";
7858
+ var WINDOWS_1252_C1_MAP = {
7859
+ 128: "\u20AC",
7860
+ 130: "\u201A",
7861
+ 131: "\u0192",
7862
+ 132: "\u201E",
7863
+ 133: "\u2026",
7864
+ 134: "\u2020",
7865
+ 135: "\u2021",
7866
+ 136: "\u02C6",
7867
+ 137: "\u2030",
7868
+ 138: "\u0160",
7869
+ 139: "\u2039",
7870
+ 140: "\u0152",
7871
+ 142: "\u017D",
7872
+ 145: "\u2018",
7873
+ 146: "\u2019",
7874
+ 147: "\u201C",
7875
+ 148: "\u201D",
7876
+ 149: "\u2022",
7877
+ 150: "\u2013",
7878
+ 151: "\u2014",
7879
+ 152: "\u02DC",
7880
+ 153: "\u2122",
7881
+ 154: "\u0161",
7882
+ 155: "\u203A",
7883
+ 156: "\u0153",
7884
+ 158: "\u017E",
7885
+ 159: "\u0178"
7886
+ };
7887
/**
 * Maps a user-supplied encoding name onto one of the canonical values
 * "WIN1252", "LATIN1" or "UTF8".
 *
 * Matching ignores surrounding whitespace and letter case, and treats "_" as
 * "-". A null or undefined value selects "WIN1252".
 *
 * @param value Encoding name, or null/undefined for the default.
 * @returns The canonical encoding name.
 * @throws ValidationError when the name is not a recognized alias.
 */
function normalizeSanitizeSourceEncoding(value) {
  const aliasGroups = [
    ["WIN1252", ["WIN1252", "WINDOWS-1252", "CP1252"]],
    ["LATIN1", ["LATIN1", "LATIN-1", "ISO-8859-1", "ISO8859-1"]],
    ["UTF8", ["UTF8", "UTF-8"]]
  ];
  const requested = value ?? "WIN1252";
  const lookupKey = requested.trim().toUpperCase().replace(/_/g, "-");
  for (const [canonical, aliases] of aliasGroups) {
    if (aliases.includes(lookupKey)) {
      return canonical;
    }
  }
  throw new ValidationError(
    `Unsupported sanitize source encoding: ${value}. Supported values: WIN1252, LATIN1, UTF8.`
  );
}
7908
/**
 * Tells whether a control code point is kept in sanitized output: horizontal
 * tab (9), line feed (10) and carriage return (13).
 */
function isAllowedControlCodePoint(codePoint) {
  switch (codePoint) {
    case 9:
    case 10:
    case 13:
      return true;
    default:
      return false;
  }
}
7911
/**
 * Tells whether a code point must be dropped from sanitized text.
 *
 * Flagged: the C0 range 0-31 except tab/LF/CR, DEL (127), the C1 range
 * 128-159, and U+FEFF (65279).
 */
function isProblematicControlCodePoint(codePoint) {
  // Tab, LF and CR sit inside the C0 range but are preserved.
  if (isAllowedControlCodePoint(codePoint)) {
    return false;
  }
  const inC0Range = codePoint >= 0 && codePoint <= 31;
  if (inC0Range) {
    return true;
  }
  if (codePoint === 127) {
    return true;
  }
  const inC1Range = codePoint >= 128 && codePoint <= 159;
  if (inC1Range) {
    return true;
  }
  return codePoint === 65279;
}
7917
/**
 * Removes replacement characters and problematic control characters from
 * already-decoded text, iterating by code point.
 *
 * @param text Decoded text.
 * @returns `text` with offending characters removed, `invalidBytesRemoved`
 *   (number of U+FFFD characters dropped) and `controlCharsRemoved` (number of
 *   characters rejected by `isProblematicControlCodePoint`).
 */
function sanitizeDecodedText(text) {
  const REPLACEMENT_CHARACTER = 0xfffd;
  let kept = "";
  let droppedInvalid = 0;
  let droppedControl = 0;
  for (const symbol of text) {
    const value = symbol.codePointAt(0);
    if (value === REPLACEMENT_CHARACTER) {
      droppedInvalid += 1;
    } else if (isProblematicControlCodePoint(value)) {
      droppedControl += 1;
    } else {
      kept += symbol;
    }
  }
  return {
    text: kept,
    invalidBytesRemoved: droppedInvalid,
    controlCharsRemoved: droppedControl
  };
}
7939
/**
 * Converts raw file chunks in a given source encoding into sanitized text and
 * reports how many NUL bytes, invalid bytes and control characters were
 * dropped from each chunk.
 *
 * For "UTF8" a `StringDecoder` carries incomplete multi-byte sequences across
 * chunk boundaries, so `flush()` must be called after the last chunk. For the
 * single-byte encodings every chunk is independent and `flush()` returns an
 * empty result.
 */
var SanitizeEncodingNormalizer = class {
  /**
   * @param sourceEncoding Canonical encoding name: "WIN1252", "LATIN1" or "UTF8".
   */
  constructor(sourceEncoding) {
    this.sourceEncoding = sourceEncoding;
    this.utf8Decoder = sourceEncoding === "UTF8" ? new StringDecoder("utf8") : void 0;
  }
  sourceEncoding;
  utf8Decoder;
  /**
   * Decodes and sanitizes one chunk.
   *
   * @param chunk Buffer read from the source file.
   * @returns `{ text, nulBytesRemoved, invalidBytesRemoved, controlCharsRemoved }`.
   */
  normalizeChunk(chunk) {
    if (this.sourceEncoding === "UTF8") {
      return this.sanitizeUtf8Text(this.utf8Decoder.write(chunk));
    }
    return this.normalizeSingleByteChunk(chunk);
  }
  /**
   * Emits whatever the UTF-8 decoder still holds after the final chunk.
   * Returns an empty result for the single-byte encodings.
   */
  flush() {
    if (!this.utf8Decoder) {
      return {
        text: "",
        nulBytesRemoved: 0,
        invalidBytesRemoved: 0,
        controlCharsRemoved: 0
      };
    }
    return this.sanitizeUtf8Text(this.utf8Decoder.end());
  }
  /**
   * Sanitizes text produced by the UTF-8 decoder and splits the NUL count out
   * of the control-character count.
   *
   * `sanitizeDecodedText` treats U+0000 as a problematic control character, so
   * its `controlCharsRemoved` already includes every NUL. Subtracting the NUL
   * count keeps each removed character in exactly one counter, matching the
   * single-byte path.
   */
  sanitizeUtf8Text(decoded) {
    const sanitized = sanitizeDecodedText(decoded);
    let nulBytesRemoved = 0;
    for (let index = 0; index < decoded.length; index += 1) {
      if (decoded.charCodeAt(index) === 0) {
        nulBytesRemoved += 1;
      }
    }
    return {
      ...sanitized,
      nulBytesRemoved,
      controlCharsRemoved: sanitized.controlCharsRemoved - nulBytesRemoved
    };
  }
  /**
   * Converts a WIN1252 or LATIN1 chunk one byte at a time.
   *
   * - 0x00 is dropped and counted as a NUL byte.
   * - Other bytes below 0x20, and 0x7F, are dropped as control characters
   *   unless they are tab, LF or CR.
   * - 0x80-0x9F: for WIN1252 the byte is mapped through
   *   `WINDOWS_1252_C1_MAP`, and bytes without a mapping count as invalid.
   *   For LATIN1 the byte counts as a control character.
   * - Every other byte maps to the code point with the same value.
   */
  normalizeSingleByteChunk(chunk) {
    const output = [];
    let nulBytesRemoved = 0;
    let invalidBytesRemoved = 0;
    let controlCharsRemoved = 0;
    for (const byte of chunk) {
      if (byte === 0) {
        nulBytesRemoved += 1;
        continue;
      }
      if (byte < 32 || byte === 127) {
        if (isAllowedControlCodePoint(byte)) {
          output.push(String.fromCharCode(byte));
        } else {
          controlCharsRemoved += 1;
        }
        continue;
      }
      if (byte >= 128 && byte <= 159) {
        if (this.sourceEncoding === "WIN1252") {
          const mapped = WINDOWS_1252_C1_MAP[byte];
          if (mapped === void 0) {
            invalidBytesRemoved += 1;
          } else {
            output.push(mapped);
          }
        } else {
          controlCharsRemoved += 1;
        }
        continue;
      }
      output.push(String.fromCharCode(byte));
    }
    return {
      text: output.join(""),
      nulBytesRemoved,
      invalidBytesRemoved,
      controlCharsRemoved
    };
  }
};
8018
+
7856
8019
  // src/services/sanitize/runner.ts
7857
8020
  import { createReadStream as createReadStream2, createWriteStream as createWriteStream2 } from "fs";
7858
8021
  import { mkdir as mkdir7 } from "fs/promises";
7859
8022
  import path13 from "path";
7860
- function stripNulBytes(chunk) {
7861
- let removed = 0;
7862
- for (let index = 0; index < chunk.length; index += 1) {
7863
- if (chunk[index] === 0) {
7864
- removed += 1;
7865
- }
8023
+ async function writeUtf8(output, value) {
8024
+ if (value.length === 0) {
8025
+ return;
7866
8026
  }
7867
- if (removed === 0) {
7868
- return { buffer: chunk, removed: 0 };
8027
+ if (!output.write(value, "utf8")) {
8028
+ await new Promise((resolve, reject) => {
8029
+ output.once("drain", resolve);
8030
+ output.once("error", reject);
8031
+ });
7869
8032
  }
7870
- const sanitized = Buffer.allocUnsafe(chunk.length - removed);
7871
- let outputIndex = 0;
7872
- for (let index = 0; index < chunk.length; index += 1) {
7873
- const value = chunk[index];
7874
- if (value !== 0) {
7875
- sanitized[outputIndex] = value;
7876
- outputIndex += 1;
8033
+ }
8034
/**
 * Counts the line-feed ("\n") characters in a string.
 */
function countNewlines(value) {
  let total = 0;
  let position = value.indexOf("\n");
  while (position !== -1) {
    total += 1;
    position = value.indexOf("\n", position + 1);
  }
  return total;
}
7881
- async function sanitizeDatasetFile(plan, onChunk) {
8043
+ async function sanitizeDatasetFile(plan, onChunk, options = {}) {
7882
8044
  await mkdir7(path13.dirname(plan.outputPath), { recursive: true });
8045
+ const sourceEncoding = normalizeSanitizeSourceEncoding(
8046
+ options.sourceEncoding
8047
+ );
8048
+ const normalizer = new SanitizeEncodingNormalizer(sourceEncoding);
7883
8049
  const input = createReadStream2(plan.absolutePath);
7884
- const output = createWriteStream2(plan.outputPath);
8050
+ const output = createWriteStream2(plan.outputPath, { encoding: "utf8" });
7885
8051
  let totalBytesRead = 0;
7886
8052
  let totalBytesWritten = 0;
7887
8053
  let nulBytesRemoved = 0;
8054
+ let invalidBytesRemoved = 0;
8055
+ let controlCharsRemoved = 0;
7888
8056
  let lineCount = 0;
7889
- let sawAnyByte = false;
7890
- let lastByteWasNewline = false;
8057
+ let sawAnyCharacter = false;
8058
+ let lastCharacterWasNewline = false;
8059
+ const processText = async (text) => {
8060
+ if (text.length === 0) {
8061
+ return;
8062
+ }
8063
+ sawAnyCharacter = true;
8064
+ lineCount += countNewlines(text);
8065
+ lastCharacterWasNewline = text.endsWith("\n");
8066
+ totalBytesWritten += Buffer.byteLength(text, "utf8");
8067
+ await writeUtf8(output, text);
8068
+ };
7891
8069
  try {
7892
8070
  for await (const chunk of input) {
7893
8071
  const chunkBuffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
7894
8072
  totalBytesRead += chunkBuffer.length;
7895
- const { buffer, removed } = stripNulBytes(chunkBuffer);
7896
- nulBytesRemoved += removed;
7897
- sawAnyByte = sawAnyByte || buffer.length > 0;
7898
- for (let index = 0; index < buffer.length; index += 1) {
7899
- if (buffer[index] === 10) {
7900
- lineCount += 1;
7901
- }
7902
- }
7903
- if (buffer.length > 0) {
7904
- lastByteWasNewline = buffer[buffer.length - 1] === 10;
7905
- }
7906
- totalBytesWritten += buffer.length;
7907
- output.write(buffer);
8073
+ const normalized = normalizer.normalizeChunk(chunkBuffer);
8074
+ nulBytesRemoved += normalized.nulBytesRemoved;
8075
+ invalidBytesRemoved += normalized.invalidBytesRemoved;
8076
+ controlCharsRemoved += normalized.controlCharsRemoved;
8077
+ await processText(normalized.text);
7908
8078
  onChunk?.({
7909
8079
  bytesProcessed: chunkBuffer.length,
7910
8080
  fileBytesProcessed: totalBytesRead,
7911
8081
  currentFileSize: plan.fileSize,
7912
8082
  processedRows: lineCount,
7913
- nulBytesRemoved
8083
+ nulBytesRemoved,
8084
+ invalidBytesRemoved,
8085
+ controlCharsRemoved
7914
8086
  });
7915
8087
  }
7916
- if (sawAnyByte && !lastByteWasNewline) {
8088
+ const flushed = normalizer.flush();
8089
+ nulBytesRemoved += flushed.nulBytesRemoved;
8090
+ invalidBytesRemoved += flushed.invalidBytesRemoved;
8091
+ controlCharsRemoved += flushed.controlCharsRemoved;
8092
+ await processText(flushed.text);
8093
+ if (sawAnyCharacter && !lastCharacterWasNewline) {
7917
8094
  lineCount += 1;
7918
8095
  }
7919
8096
  } finally {
7920
8097
  input.close();
7921
8098
  output.end();
7922
- await new Promise((resolve) => output.on("finish", () => resolve()));
8099
+ await new Promise((resolve, reject) => {
8100
+ output.on("finish", () => resolve());
8101
+ output.on("error", (error) => reject(error));
8102
+ });
7923
8103
  }
7924
8104
  return {
7925
8105
  plan,
7926
8106
  totalBytesRead,
7927
8107
  totalBytesWritten,
8108
+ sourceEncoding,
7928
8109
  nulBytesRemoved,
8110
+ invalidBytesRemoved,
8111
+ controlCharsRemoved,
7929
8112
  lineCount,
7930
- changed: nulBytesRemoved > 0 || totalBytesRead !== totalBytesWritten
8113
+ changed: nulBytesRemoved > 0 || invalidBytesRemoved > 0 || controlCharsRemoved > 0 || totalBytesRead !== totalBytesWritten
7931
8114
  };
7932
8115
  }
7933
8116
 
@@ -7990,40 +8173,54 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
7990
8173
  "No recognized validated dataset files were found for sanitization."
7991
8174
  );
7992
8175
  }
8176
+ const sourceEncoding = normalizeSanitizeSourceEncoding(
8177
+ options.sourceEncoding
8178
+ );
7993
8179
  options.onProgress?.({
7994
8180
  kind: "start",
7995
8181
  validatedPath,
7996
8182
  outputPath,
7997
8183
  totalFiles: plan.totalFiles,
7998
8184
  totalBytes: plan.totalBytes,
7999
- datasets: plan.datasets
8185
+ datasets: plan.datasets,
8186
+ sourceEncoding
8000
8187
  });
8001
8188
  let processedFiles = 0;
8002
8189
  let processedRows = 0;
8003
8190
  let processedBytes = 0;
8004
8191
  let nulBytesRemoved = 0;
8192
+ let invalidBytesRemoved = 0;
8193
+ let controlCharsRemoved = 0;
8005
8194
  let changedFiles = 0;
8006
8195
  const fileSummaries = [];
8007
8196
  for (const [index, filePlan] of plan.files.entries()) {
8008
- const fileResult = await sanitizeDatasetFile(filePlan, (chunk) => {
8009
- options.onProgress?.({
8010
- kind: "progress",
8011
- currentFileDisplayPath: filePlan.displayPath,
8012
- fileIndex: index + 1,
8013
- totalFiles: plan.totalFiles,
8014
- bytesProcessed: processedBytes + chunk.fileBytesProcessed,
8015
- totalBytes: plan.totalBytes,
8016
- fileBytesProcessed: chunk.fileBytesProcessed,
8017
- currentFileSize: chunk.currentFileSize,
8018
- processedRows: processedRows + chunk.processedRows,
8019
- nulBytesRemoved: nulBytesRemoved + chunk.nulBytesRemoved,
8020
- changedFiles
8021
- });
8022
- });
8197
+ const fileResult = await sanitizeDatasetFile(
8198
+ filePlan,
8199
+ (chunk) => {
8200
+ options.onProgress?.({
8201
+ kind: "progress",
8202
+ currentFileDisplayPath: filePlan.displayPath,
8203
+ fileIndex: index + 1,
8204
+ totalFiles: plan.totalFiles,
8205
+ bytesProcessed: processedBytes + chunk.fileBytesProcessed,
8206
+ totalBytes: plan.totalBytes,
8207
+ fileBytesProcessed: chunk.fileBytesProcessed,
8208
+ currentFileSize: chunk.currentFileSize,
8209
+ processedRows: processedRows + chunk.processedRows,
8210
+ nulBytesRemoved: nulBytesRemoved + chunk.nulBytesRemoved,
8211
+ invalidBytesRemoved: invalidBytesRemoved + chunk.invalidBytesRemoved,
8212
+ controlCharsRemoved: controlCharsRemoved + chunk.controlCharsRemoved,
8213
+ changedFiles
8214
+ });
8215
+ },
8216
+ { sourceEncoding }
8217
+ );
8023
8218
  processedFiles += 1;
8024
8219
  processedRows += fileResult.lineCount;
8025
8220
  processedBytes += fileResult.totalBytesRead;
8026
8221
  nulBytesRemoved += fileResult.nulBytesRemoved;
8222
+ invalidBytesRemoved += fileResult.invalidBytesRemoved;
8223
+ controlCharsRemoved += fileResult.controlCharsRemoved;
8027
8224
  changedFiles += fileResult.changed ? 1 : 0;
8028
8225
  fileSummaries.push({
8029
8226
  dataset: filePlan.dataset,
@@ -8031,7 +8228,9 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
8031
8228
  outputPath: filePlan.outputPath,
8032
8229
  lineCount: fileResult.lineCount,
8033
8230
  changed: fileResult.changed,
8034
- nulBytesRemoved: fileResult.nulBytesRemoved
8231
+ nulBytesRemoved: fileResult.nulBytesRemoved,
8232
+ invalidBytesRemoved: fileResult.invalidBytesRemoved,
8233
+ controlCharsRemoved: fileResult.controlCharsRemoved
8035
8234
  });
8036
8235
  }
8037
8236
  options.onProgress?.({
@@ -8039,6 +8238,8 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
8039
8238
  totalFiles: plan.totalFiles,
8040
8239
  processedRows,
8041
8240
  nulBytesRemoved,
8241
+ invalidBytesRemoved,
8242
+ controlCharsRemoved,
8042
8243
  changedFiles,
8043
8244
  totalBytes: plan.totalBytes
8044
8245
  });
@@ -8050,13 +8251,17 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
8050
8251
  totalBytes: plan.totalBytes,
8051
8252
  processedFiles,
8052
8253
  processedRows,
8254
+ sourceEncoding,
8053
8255
  nulBytesRemoved,
8256
+ invalidBytesRemoved,
8257
+ controlCharsRemoved,
8054
8258
  changedFiles,
8055
8259
  unchangedFiles: plan.totalFiles - changedFiles,
8056
8260
  datasets: plan.datasets,
8057
8261
  files: fileSummaries,
8058
8262
  warnings: [
8059
- "Sanitization prepares a clean dataset tree for import by removing known low-level byte issues such as NUL bytes before PostgreSQL loading begins.",
8263
+ "Sanitization now writes UTF-8 output and removes invalid bytes plus problematic control characters before PostgreSQL loading begins.",
8264
+ "The PostgreSQL direct import path can use --source-encoding UTF8 when reading files generated by this sanitization command.",
8060
8265
  "The import command still keeps quarantine and row-level recovery for unexpected issues, but sanitizing first reduces the amount of slow fallback work during import."
8061
8266
  ],
8062
8267
  nextStep: inferNextStep3(outputPath)
@@ -8849,7 +9054,7 @@ async function exportPostgresCsvDataset(inputPath, options = {}) {
8849
9054
  // src/services/postgres-direct/generator.ts
8850
9055
  import { mkdir as mkdir9, stat as stat7, writeFile as writeFile6 } from "fs/promises";
8851
9056
  import path17 from "path";
8852
- var DEFAULT_SOURCE_ENCODING = "WIN1252";
9057
+ var DEFAULT_SOURCE_ENCODING = "UTF8";
8853
9058
  function defaultPostgresDirectOutputPath(inputPath) {
8854
9059
  const baseName = path17.basename(inputPath);
8855
9060
  if (baseName.toLowerCase() === "sanitized") {
@@ -8864,7 +9069,7 @@ function normalizeSourceEncoding(value) {
8864
9069
  const encoding = (value ?? DEFAULT_SOURCE_ENCODING).trim();
8865
9070
  if (!/^[A-Za-z0-9_-]+$/.test(encoding)) {
8866
9071
  throw new ValidationError(
8867
- `Invalid source encoding: ${value}. Use a PostgreSQL client encoding name such as WIN1252 or UTF8.`
9072
+ `Invalid source encoding: ${value}. Use a PostgreSQL client encoding name such as UTF8, WIN1252 or LATIN1.`
8868
9073
  );
8869
9074
  }
8870
9075
  return encoding.toUpperCase();
@@ -8999,7 +9204,8 @@ async function generatePostgresDirectScript(inputPath, options = {}) {
8999
9204
  ...validation.ok ? [] : validation.errors,
9000
9205
  "This script imports sanitized Receita files directly with psql \\copy. It avoids rewriting the full dataset into a second CSV tree.",
9001
9206
  "The generated script expects the database schema generated by cnpj-db-loader to be applied before execution.",
9002
- "Use --source-encoding UTF8 only if your sanitized files are already UTF-8. The default WIN1252 matches the usual Receita file encoding."
9207
+ "The direct PostgreSQL script now defaults to UTF8 because the sanitize command writes clean UTF-8 files.",
9208
+ "Use --source-encoding WIN1252 or LATIN1 only when generating scripts for legacy sanitized files produced by older loader versions."
9003
9209
  ],
9004
9210
  nextStep: inferNextStep5(scriptPath)
9005
9211
  };