@danielarndt0/cnpj-db-loader 2.4.0-beta.1 → 2.4.0-beta.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -5
- package/dist/cli.js +301 -61
- package/dist/cli.js.map +1 -1
- package/dist/index.d.ts +14 -1
- package/dist/index.js +264 -58
- package/dist/index.js.map +1 -1
- package/docs/architecture.md +1 -1
- package/docs/cli.md +1 -1
- package/docs/commands.md +1 -1
- package/docs/postgres-direct.md +10 -10
- package/docs/sanitize.md +52 -16
- package/package.json +3 -3
- package/docs/releases/v2.4.0.md +0 -40
package/dist/index.d.ts
CHANGED
|
@@ -582,6 +582,8 @@ declare function showQuarantineRow(id: number, options?: {
|
|
|
582
582
|
dbUrl?: string;
|
|
583
583
|
}): Promise<QuarantineRecord>;
|
|
584
584
|
|
|
585
|
+
/** Source-encoding labels carried by `SanitizeSummary.sourceEncoding` and the "start" progress event. */
type SanitizeSourceEncoding = "WIN1252" | "LATIN1" | "UTF8";
|
|
586
|
+
|
|
585
587
|
/** Every `DatasetType` member except "zip-archive" and "unknown". */
type SanitizeDatasetType = Exclude<DatasetType, "zip-archive" | "unknown">;
|
|
586
588
|
type SanitizeFilePlan = {
|
|
587
589
|
dataset: SanitizeDatasetType;
|
|
@@ -607,7 +609,10 @@ type SanitizeSummary = {
|
|
|
607
609
|
totalBytes: number;
|
|
608
610
|
processedFiles: number;
|
|
609
611
|
processedRows: number;
|
|
612
|
+
sourceEncoding: SanitizeSourceEncoding;
|
|
610
613
|
nulBytesRemoved: number;
|
|
614
|
+
invalidBytesRemoved: number;
|
|
615
|
+
controlCharsRemoved: number;
|
|
611
616
|
changedFiles: number;
|
|
612
617
|
unchangedFiles: number;
|
|
613
618
|
datasets: SanitizeDatasetType[];
|
|
@@ -618,6 +623,8 @@ type SanitizeSummary = {
|
|
|
618
623
|
lineCount: number;
|
|
619
624
|
changed: boolean;
|
|
620
625
|
nulBytesRemoved: number;
|
|
626
|
+
invalidBytesRemoved: number;
|
|
627
|
+
controlCharsRemoved: number;
|
|
621
628
|
}>;
|
|
622
629
|
warnings: string[];
|
|
623
630
|
nextStep?: string | undefined;
|
|
@@ -629,6 +636,7 @@ type SanitizeProgressEvent = {
|
|
|
629
636
|
totalFiles: number;
|
|
630
637
|
totalBytes: number;
|
|
631
638
|
datasets: SanitizeDatasetType[];
|
|
639
|
+
sourceEncoding: SanitizeSourceEncoding;
|
|
632
640
|
} | {
|
|
633
641
|
kind: "progress";
|
|
634
642
|
currentFileDisplayPath: string;
|
|
@@ -640,12 +648,16 @@ type SanitizeProgressEvent = {
|
|
|
640
648
|
currentFileSize: number;
|
|
641
649
|
processedRows: number;
|
|
642
650
|
nulBytesRemoved: number;
|
|
651
|
+
invalidBytesRemoved: number;
|
|
652
|
+
controlCharsRemoved: number;
|
|
643
653
|
changedFiles: number;
|
|
644
654
|
} | {
|
|
645
655
|
kind: "finish";
|
|
646
656
|
totalFiles: number;
|
|
647
657
|
processedRows: number;
|
|
648
658
|
nulBytesRemoved: number;
|
|
659
|
+
invalidBytesRemoved: number;
|
|
660
|
+
controlCharsRemoved: number;
|
|
649
661
|
changedFiles: number;
|
|
650
662
|
totalBytes: number;
|
|
651
663
|
};
|
|
@@ -653,6 +665,7 @@ type SanitizeProgressListener = (event: SanitizeProgressEvent) => void;
|
|
|
653
665
|
type SanitizeOptions = {
|
|
654
666
|
outputPath?: string | undefined;
|
|
655
667
|
dataset?: SanitizeDatasetType | undefined;
|
|
668
|
+
sourceEncoding?: string | undefined;
|
|
656
669
|
onProgress?: SanitizeProgressListener | undefined;
|
|
657
670
|
};
|
|
658
671
|
|
|
@@ -1087,4 +1100,4 @@ declare function exportPostgresCsvDataset(inputPath: string, options?: PostgresC
|
|
|
1087
1100
|
|
|
1088
1101
|
declare function generatePostgresDirectScript(inputPath: string, options?: PostgresDirectScriptOptions): Promise<PostgresDirectScriptSummary>;
|
|
1089
1102
|
|
|
1090
|
-
export { type AppConfig, type AppEnvironment, AppError, type CheckpointCleanupPhase, DEFAULT_FEDERAL_REVENUE_DOWNLOAD_ROOT, DEFAULT_FEDERAL_REVENUE_SHARE_TOKEN, DEFAULT_FEDERAL_REVENUE_USER_AGENT, DEFAULT_FEDERAL_REVENUE_WEBDAV_URL, type DatabaseCleanupSummary, type DatabaseConfig, type DatasetBlock, type DatasetType, type ExtractionEntry, type ExtractionProgressEvent, type ExtractionProgressListener, type ExtractionSummary, FEDERAL_REVENUE_CONTROL_DIR, FEDERAL_REVENUE_CONTROL_SCOPE, FEDERAL_REVENUE_MANIFEST_VERSION, type Failure, type FederalRevenueCheckOptions, type FederalRevenueCheckSummary, type FederalRevenueCleanMode, type FederalRevenueCleanOptions, type FederalRevenueCleanSummary, type FederalRevenueClientOptions, type FederalRevenueDownloadEntry, type FederalRevenueDownloadOptions, type FederalRevenueDownloadProgressEvent, type FederalRevenueDownloadProgressListener, type FederalRevenueDownloadStatus, type FederalRevenueDownloadSummary, type FederalRevenueFile, type FederalRevenueLocalFileStatus, type FederalRevenueLocalStatusEntry, type FederalRevenueLockFile, type FederalRevenueManifest, type FederalRevenueManifestFile, type FederalRevenueManifestLastCommand, type FederalRevenueManifestLastStatus, type FederalRevenueReference, type FederalRevenueReferenceMode, type FederalRevenueReferenceSelection, type FederalRevenueRetryOptions, type FederalRevenueStatusOptions, type FederalRevenueStatusSummary, type FederalRevenueSyncLockOptions, type FederalRevenueSyncOptions, type FederalRevenueSyncSummary, type FileInspection, type ImportCheckpointRecord, type ImportCheckpointStatus, type ImportDatasetPlan, type ImportDatasetType, type ImportFilePlan, type ImportOptions, type ImportPerformanceSummary, type ImportPhaseStatus, type ImportPlanRecord, type ImportProgressEvent, type ImportProgressListener, type ImportSchemaCapabilities, type ImportSummary, type InputDetectionMode, type InputMode, type InspectSummary, type LogLevel, type LogStatus, type 
PostgresCsvDatasetSummary, type PostgresCsvExportOptions, type PostgresCsvExportProgressEvent, type PostgresCsvExportProgressListener, type PostgresCsvExportSummary, type PostgresCsvFile, type PostgresDirectScriptDatasetSummary, type PostgresDirectScriptOptions, type PostgresDirectScriptProgressEvent, type PostgresDirectScriptProgressListener, type PostgresDirectScriptSummary, type PostgresDirectSourceFile, type QuarantineListFilters, type QuarantineListSummary, type QuarantineRecord, type QuarantineStatsFilters, type QuarantineStatsSummary, type Result, type SanitizeDatasetType, type SanitizeOptions, type SanitizePlan, type SanitizeProgressEvent, type SanitizeProgressListener, type SanitizeSummary, type SchemaGenerationOptions, type SchemaProfile, ServiceError, type StructuredLogEntry, type SupportedOs, ValidationError, type ValidationSummary, appendJsonLinesLog, assertPostgresUrl, buildFederalRevenueDownloadHeaders, buildFederalRevenueReferenceOutputPath, checkFederalRevenueDataset, cleanFederalRevenueDataset, cleanupDatabaseCheckpointsData, cleanupDatabaseMaterializedData, cleanupDatabasePlansData, cleanupDatabaseStagingData, createFederalRevenueManifest, createJsonLinesLog, defaultExtractedOutputPath, detectOs, downloadFederalRevenueDataset, ensureDirectory, evaluateFederalRevenueManifestFile, evaluateFederalRevenueManifestFiles, exportPostgresCsvDataset, extractArchives, finalizeFederalRevenueManifest, generatePostgresDirectScript, generateSchemaSql, getAllLayouts, getCurrentFederalRevenueReference, getFederalRevenueControlDirectory, getFederalRevenueManifestPath, getFederalRevenueStatus, getFederalRevenueSyncLockPath, getLayoutSummary, getLogsDirectoryPath, getQuarantineStats, getUserAppDirectoryPath, importDataToDatabase, inspectFiles, listFederalRevenueFiles, listFederalRevenueReferences, listQuarantineRows, loadImportDataToStaging, materializeImportedData, prettyJson, readDatabaseConfig, readFederalRevenueManifest, resetDefaultDbUrl, resolveDatabaseUrl, 
resolveDbUrl, resolveFederalRevenueReference, resolveInputMode, resolveSchemaProfile, retryFederalRevenueDataset, runDoctor, safeReadText, safeWriteText, sanitizeInputDirectory, setDefaultDbUrl, showQuarantineRow, syncFederalRevenueDataset, testDatabaseConnection, toTitleCase, updateFederalRevenueManifestFile, validateFederalRevenueReference, validateInputDirectory, withFederalRevenueSyncLock, writeCommandFailureLog, writeCommandLog, writeDatabaseConfig, writeFederalRevenueManifest, writeSchemaFile };
|
|
1103
|
+
export { type AppConfig, type AppEnvironment, AppError, type CheckpointCleanupPhase, DEFAULT_FEDERAL_REVENUE_DOWNLOAD_ROOT, DEFAULT_FEDERAL_REVENUE_SHARE_TOKEN, DEFAULT_FEDERAL_REVENUE_USER_AGENT, DEFAULT_FEDERAL_REVENUE_WEBDAV_URL, type DatabaseCleanupSummary, type DatabaseConfig, type DatasetBlock, type DatasetType, type ExtractionEntry, type ExtractionProgressEvent, type ExtractionProgressListener, type ExtractionSummary, FEDERAL_REVENUE_CONTROL_DIR, FEDERAL_REVENUE_CONTROL_SCOPE, FEDERAL_REVENUE_MANIFEST_VERSION, type Failure, type FederalRevenueCheckOptions, type FederalRevenueCheckSummary, type FederalRevenueCleanMode, type FederalRevenueCleanOptions, type FederalRevenueCleanSummary, type FederalRevenueClientOptions, type FederalRevenueDownloadEntry, type FederalRevenueDownloadOptions, type FederalRevenueDownloadProgressEvent, type FederalRevenueDownloadProgressListener, type FederalRevenueDownloadStatus, type FederalRevenueDownloadSummary, type FederalRevenueFile, type FederalRevenueLocalFileStatus, type FederalRevenueLocalStatusEntry, type FederalRevenueLockFile, type FederalRevenueManifest, type FederalRevenueManifestFile, type FederalRevenueManifestLastCommand, type FederalRevenueManifestLastStatus, type FederalRevenueReference, type FederalRevenueReferenceMode, type FederalRevenueReferenceSelection, type FederalRevenueRetryOptions, type FederalRevenueStatusOptions, type FederalRevenueStatusSummary, type FederalRevenueSyncLockOptions, type FederalRevenueSyncOptions, type FederalRevenueSyncSummary, type FileInspection, type ImportCheckpointRecord, type ImportCheckpointStatus, type ImportDatasetPlan, type ImportDatasetType, type ImportFilePlan, type ImportOptions, type ImportPerformanceSummary, type ImportPhaseStatus, type ImportPlanRecord, type ImportProgressEvent, type ImportProgressListener, type ImportSchemaCapabilities, type ImportSummary, type InputDetectionMode, type InputMode, type InspectSummary, type LogLevel, type LogStatus, type 
PostgresCsvDatasetSummary, type PostgresCsvExportOptions, type PostgresCsvExportProgressEvent, type PostgresCsvExportProgressListener, type PostgresCsvExportSummary, type PostgresCsvFile, type PostgresDirectScriptDatasetSummary, type PostgresDirectScriptOptions, type PostgresDirectScriptProgressEvent, type PostgresDirectScriptProgressListener, type PostgresDirectScriptSummary, type PostgresDirectSourceFile, type QuarantineListFilters, type QuarantineListSummary, type QuarantineRecord, type QuarantineStatsFilters, type QuarantineStatsSummary, type Result, type SanitizeDatasetType, type SanitizeOptions, type SanitizePlan, type SanitizeProgressEvent, type SanitizeProgressListener, type SanitizeSourceEncoding, type SanitizeSummary, type SchemaGenerationOptions, type SchemaProfile, ServiceError, type StructuredLogEntry, type SupportedOs, ValidationError, type ValidationSummary, appendJsonLinesLog, assertPostgresUrl, buildFederalRevenueDownloadHeaders, buildFederalRevenueReferenceOutputPath, checkFederalRevenueDataset, cleanFederalRevenueDataset, cleanupDatabaseCheckpointsData, cleanupDatabaseMaterializedData, cleanupDatabasePlansData, cleanupDatabaseStagingData, createFederalRevenueManifest, createJsonLinesLog, defaultExtractedOutputPath, detectOs, downloadFederalRevenueDataset, ensureDirectory, evaluateFederalRevenueManifestFile, evaluateFederalRevenueManifestFiles, exportPostgresCsvDataset, extractArchives, finalizeFederalRevenueManifest, generatePostgresDirectScript, generateSchemaSql, getAllLayouts, getCurrentFederalRevenueReference, getFederalRevenueControlDirectory, getFederalRevenueManifestPath, getFederalRevenueStatus, getFederalRevenueSyncLockPath, getLayoutSummary, getLogsDirectoryPath, getQuarantineStats, getUserAppDirectoryPath, importDataToDatabase, inspectFiles, listFederalRevenueFiles, listFederalRevenueReferences, listQuarantineRows, loadImportDataToStaging, materializeImportedData, prettyJson, readDatabaseConfig, readFederalRevenueManifest, 
resetDefaultDbUrl, resolveDatabaseUrl, resolveDbUrl, resolveFederalRevenueReference, resolveInputMode, resolveSchemaProfile, retryFederalRevenueDataset, runDoctor, safeReadText, safeWriteText, sanitizeInputDirectory, setDefaultDbUrl, showQuarantineRow, syncFederalRevenueDataset, testDatabaseConnection, toTitleCase, updateFederalRevenueManifestFile, validateFederalRevenueReference, validateInputDirectory, withFederalRevenueSyncLock, writeCommandFailureLog, writeCommandLog, writeDatabaseConfig, writeFederalRevenueManifest, writeSchemaFile };
|
package/dist/index.js
CHANGED
|
@@ -7853,81 +7853,264 @@ function isRecognizedSanitizeEntry(entry) {
|
|
|
7853
7853
|
return entry.entryKind === "file" && entry.inferredType !== "zip-archive" && entry.inferredType !== "unknown";
|
|
7854
7854
|
}
|
|
7855
7855
|
|
|
7856
|
+
// src/services/sanitize/encoding.ts
|
|
7857
|
+
import { StringDecoder } from "string_decoder";
|
|
7858
|
+
var WINDOWS_1252_C1_MAP = {
|
|
7859
|
+
128: "\u20AC",
|
|
7860
|
+
130: "\u201A",
|
|
7861
|
+
131: "\u0192",
|
|
7862
|
+
132: "\u201E",
|
|
7863
|
+
133: "\u2026",
|
|
7864
|
+
134: "\u2020",
|
|
7865
|
+
135: "\u2021",
|
|
7866
|
+
136: "\u02C6",
|
|
7867
|
+
137: "\u2030",
|
|
7868
|
+
138: "\u0160",
|
|
7869
|
+
139: "\u2039",
|
|
7870
|
+
140: "\u0152",
|
|
7871
|
+
142: "\u017D",
|
|
7872
|
+
145: "\u2018",
|
|
7873
|
+
146: "\u2019",
|
|
7874
|
+
147: "\u201C",
|
|
7875
|
+
148: "\u201D",
|
|
7876
|
+
149: "\u2022",
|
|
7877
|
+
150: "\u2013",
|
|
7878
|
+
151: "\u2014",
|
|
7879
|
+
152: "\u02DC",
|
|
7880
|
+
153: "\u2122",
|
|
7881
|
+
154: "\u0161",
|
|
7882
|
+
155: "\u203A",
|
|
7883
|
+
156: "\u0153",
|
|
7884
|
+
158: "\u017E",
|
|
7885
|
+
159: "\u0178"
|
|
7886
|
+
};
|
|
7887
|
+
/**
 * Resolves a user-supplied encoding label to one of the canonical sanitize
 * source encodings: "WIN1252", "LATIN1" or "UTF8".
 *
 * Matching ignores case, surrounding whitespace and `-` / `_` separators, so
 * `windows-1252`, `WIN_1252`, `cp1252`, `iso-8859-1`, `ISO88591`, `latin-1`
 * and `utf_8` all resolve. `undefined` / `null` fall back to "WIN1252".
 *
 * @param value Encoding label, or nothing to use the default.
 * @returns The canonical encoding identifier.
 * @throws ValidationError when the label is not a string or is not supported.
 */
function normalizeSanitizeSourceEncoding(value) {
  const requested = value ?? "WIN1252";
  // Guard the string methods below: a non-string option must surface as a
  // ValidationError, not as a TypeError from `.trim()`.
  if (typeof requested === "string") {
    const key = requested.trim().toUpperCase().replace(/[_-]/g, "");
    switch (key) {
      case "WIN1252":
      case "WINDOWS1252":
      case "CP1252":
        return "WIN1252";
      case "LATIN1":
      case "ISO88591":
        return "LATIN1";
      case "UTF8":
        return "UTF8";
      default:
        break;
    }
  }
  throw new ValidationError(
    `Unsupported sanitize source encoding: ${value}. Supported values: WIN1252, LATIN1, UTF8.`
  );
}
|
|
7908
|
+
/**
 * Tells whether a code point is one of the control characters that
 * sanitization keeps: tab, line feed or carriage return.
 *
 * @param codePoint Numeric code point (or byte value) to test.
 * @returns `true` for 0x09, 0x0A and 0x0D; `false` otherwise.
 */
function isAllowedControlCodePoint(codePoint) {
  switch (codePoint) {
    case 0x09: // horizontal tab
    case 0x0a: // line feed
    case 0x0d: // carriage return
      return true;
    default:
      return false;
  }
}
|
|
7911
|
+
/**
 * Tells whether a code point should be stripped during sanitization.
 *
 * Tab, line feed and carriage return are never problematic. Otherwise the
 * code point is problematic when it lies in 0x00-0x1F, equals 0x7F, lies in
 * 0x80-0x9F, or equals U+FEFF.
 *
 * @param codePoint Numeric code point to test.
 * @returns `true` when the character must be removed.
 */
function isProblematicControlCodePoint(codePoint) {
  if (isAllowedControlCodePoint(codePoint)) {
    return false;
  }
  const isC0Control = codePoint >= 0x00 && codePoint <= 0x1f;
  const isDelete = codePoint === 0x7f;
  const isC1Control = codePoint >= 0x80 && codePoint <= 0x9f;
  const isByteOrderMark = codePoint === 0xfeff;
  return isC0Control || isDelete || isC1Control || isByteOrderMark;
}
|
|
7917
|
+
/**
 * Strips unwanted characters from already-decoded text.
 *
 * U+FFFD replacement characters are dropped and tallied as
 * `invalidBytesRemoved`; characters flagged by isProblematicControlCodePoint
 * are dropped and tallied as `controlCharsRemoved`. Everything else is kept
 * in its original order.
 *
 * @param text Decoded text to clean.
 * @returns `{ text, invalidBytesRemoved, controlCharsRemoved }`.
 */
function sanitizeDecodedText(text) {
  const tally = { invalidBytesRemoved: 0, controlCharsRemoved: 0 };
  let kept = "";
  // for...of walks by code point, so surrogate pairs stay intact.
  for (const symbol of text) {
    const value = symbol.codePointAt(0);
    if (value === 0xfffd) {
      tally.invalidBytesRemoved += 1;
    } else if (isProblematicControlCodePoint(value)) {
      tally.controlCharsRemoved += 1;
    } else {
      kept += symbol;
    }
  }
  return { text: kept, ...tally };
}
|
|
7939
|
+
/**
 * Streaming normalizer that turns raw file chunks into clean text.
 *
 * For "UTF8" sources, chunks go through a StringDecoder (so multi-byte
 * sequences split across chunk boundaries are reassembled) and then through
 * sanitizeDecodedText. For "WIN1252" / "LATIN1" sources, every byte is mapped
 * independently, so no state is carried between chunks.
 *
 * Every result has the shape
 * `{ text, nulBytesRemoved, invalidBytesRemoved, controlCharsRemoved }`.
 * A removed NUL is reported only in `nulBytesRemoved`, in both code paths.
 */
var SanitizeEncodingNormalizer = class {
  /**
   * @param sourceEncoding Canonical encoding identifier: "WIN1252", "LATIN1" or "UTF8".
   */
  constructor(sourceEncoding) {
    this.sourceEncoding = sourceEncoding;
    // Only the UTF-8 path needs a stateful decoder.
    this.utf8Decoder = sourceEncoding === "UTF8" ? new StringDecoder("utf8") : void 0;
  }
  sourceEncoding;
  utf8Decoder;
  /**
   * Normalizes one chunk of input bytes.
   *
   * @param chunk Buffer of raw bytes read from the source file.
   * @returns Cleaned text plus removal counters for this chunk.
   */
  normalizeChunk(chunk) {
    if (this.sourceEncoding === "UTF8") {
      return this.sanitizeUtf8Text(this.utf8Decoder.write(chunk));
    }
    return this.normalizeSingleByteChunk(chunk);
  }
  /**
   * Emits whatever the UTF-8 decoder is still holding (an incomplete trailing
   * sequence). Single-byte encodings hold no state, so they yield an empty
   * result.
   *
   * @returns Cleaned trailing text plus removal counters.
   */
  flush() {
    if (!this.utf8Decoder) {
      return {
        text: "",
        nulBytesRemoved: 0,
        invalidBytesRemoved: 0,
        controlCharsRemoved: 0
      };
    }
    return this.sanitizeUtf8Text(this.utf8Decoder.end());
  }
  /**
   * Shared UTF-8 post-processing for normalizeChunk and flush.
   *
   * @param decoded Text produced by the UTF-8 decoder.
   * @returns Cleaned text plus removal counters.
   */
  sanitizeUtf8Text(decoded) {
    const sanitized = sanitizeDecodedText(decoded);
    // Count NULs with indexOf instead of materializing a per-character array.
    let nulBytesRemoved = 0;
    for (let at = decoded.indexOf("\0"); at !== -1; at = decoded.indexOf("\0", at + 1)) {
      nulBytesRemoved += 1;
    }
    return {
      text: sanitized.text,
      nulBytesRemoved,
      invalidBytesRemoved: sanitized.invalidBytesRemoved,
      // sanitizeDecodedText tallies NUL (U+0000) as a control character;
      // subtract it so a NUL is reported once, matching the single-byte path.
      controlCharsRemoved: Math.max(0, sanitized.controlCharsRemoved - nulBytesRemoved)
    };
  }
  /**
   * Maps a chunk of WIN1252 / LATIN1 bytes to text one byte at a time.
   *
   * @param chunk Buffer of raw bytes.
   * @returns Cleaned text plus removal counters for this chunk.
   */
  normalizeSingleByteChunk(chunk) {
    const output = [];
    let nulBytesRemoved = 0;
    let invalidBytesRemoved = 0;
    let controlCharsRemoved = 0;
    for (const byte of chunk) {
      if (byte === 0) {
        nulBytesRemoved += 1;
        continue;
      }
      if (byte < 32 || byte === 127) {
        // C0 controls and DEL: keep only tab / LF / CR.
        if (isAllowedControlCodePoint(byte)) {
          output.push(String.fromCharCode(byte));
        } else {
          controlCharsRemoved += 1;
        }
        continue;
      }
      if (byte >= 128 && byte <= 159) {
        if (this.sourceEncoding === "WIN1252") {
          // Windows-1252 puts printable characters here; bytes without a
          // table entry are counted as invalid.
          const mapped = WINDOWS_1252_C1_MAP[byte];
          if (mapped === void 0) {
            invalidBytesRemoved += 1;
          } else {
            output.push(mapped);
          }
        } else {
          // Other single-byte sources: this range is dropped as control characters.
          controlCharsRemoved += 1;
        }
        continue;
      }
      // Printable ASCII and 0xA0-0xFF: the byte value is used directly as the char code.
      output.push(String.fromCharCode(byte));
    }
    return {
      text: output.join(""),
      nulBytesRemoved,
      invalidBytesRemoved,
      controlCharsRemoved
    };
  }
};
|
|
8018
|
+
|
|
7856
8019
|
// src/services/sanitize/runner.ts
|
|
7857
8020
|
import { createReadStream as createReadStream2, createWriteStream as createWriteStream2 } from "fs";
|
|
7858
8021
|
import { mkdir as mkdir7 } from "fs/promises";
|
|
7859
8022
|
import path13 from "path";
|
|
7860
|
-
function
|
|
7861
|
-
|
|
7862
|
-
|
|
7863
|
-
if (chunk[index] === 0) {
|
|
7864
|
-
removed += 1;
|
|
7865
|
-
}
|
|
8023
|
+
async function writeUtf8(output, value) {
|
|
8024
|
+
if (value.length === 0) {
|
|
8025
|
+
return;
|
|
7866
8026
|
}
|
|
7867
|
-
if (
|
|
7868
|
-
|
|
8027
|
+
if (!output.write(value, "utf8")) {
|
|
8028
|
+
await new Promise((resolve, reject) => {
|
|
8029
|
+
output.once("drain", resolve);
|
|
8030
|
+
output.once("error", reject);
|
|
8031
|
+
});
|
|
7869
8032
|
}
|
|
7870
|
-
|
|
7871
|
-
|
|
7872
|
-
|
|
7873
|
-
|
|
7874
|
-
if (value
|
|
7875
|
-
|
|
7876
|
-
outputIndex += 1;
|
|
8033
|
+
}
|
|
8034
|
+
/**
 * Counts the line-feed ("\n") characters in a string.
 *
 * @param value Text to scan.
 * @returns Number of "\n" occurrences.
 */
function countNewlines(value) {
  let total = 0;
  let position = value.indexOf("\n");
  while (position !== -1) {
    total += 1;
    position = value.indexOf("\n", position + 1);
  }
  return total;
}
|
|
7881
|
-
async function sanitizeDatasetFile(plan, onChunk) {
|
|
8043
|
+
/**
 * Streams one dataset file through the encoding normalizer and writes the
 * cleaned result as UTF-8 to `plan.outputPath`.
 *
 * @param plan File plan; reads `absolutePath`, `outputPath` and `fileSize`.
 * @param onChunk Optional callback invoked after every input chunk with the
 *   running per-file counters.
 * @param options Optional settings; `sourceEncoding` is resolved through
 *   normalizeSanitizeSourceEncoding.
 * @returns Per-file totals: bytes read/written, removal counters, line count
 *   and whether the output differs from the input.
 * @throws ValidationError for an unsupported encoding; any read or write
 *   stream error.
 */
async function sanitizeDatasetFile(plan, onChunk, options = {}) {
  // Validate first so a bad option fails before anything is created on disk.
  const sourceEncoding = normalizeSanitizeSourceEncoding(
    options.sourceEncoding
  );
  await mkdir7(path13.dirname(plan.outputPath), { recursive: true });
  const normalizer = new SanitizeEncodingNormalizer(sourceEncoding);
  const input = createReadStream2(plan.absolutePath);
  const output = createWriteStream2(plan.outputPath, { encoding: "utf8" });
  // Keep one error listener attached for the stream's whole lifetime so an
  // output failure between writes is recorded and not raised as an unhandled
  // "error" event.
  let outputError;
  output.on("error", (error) => {
    if (outputError === void 0) {
      outputError = error;
    }
  });
  // "close" follows both a normal finish and a destroy-after-error, so
  // awaiting it cannot hang on a stream that has already failed.
  const outputClosed = new Promise((resolve) => {
    output.once("close", () => resolve());
  });
  let totalBytesRead = 0;
  let totalBytesWritten = 0;
  let nulBytesRemoved = 0;
  let invalidBytesRemoved = 0;
  let controlCharsRemoved = 0;
  let lineCount = 0;
  let sawAnyCharacter = false;
  let lastCharacterWasNewline = false;
  const processText = async (text) => {
    if (text.length === 0) {
      return;
    }
    if (outputError !== void 0) {
      // Writing to a failed stream would wait for a "drain" that never comes.
      throw outputError;
    }
    sawAnyCharacter = true;
    lineCount += countNewlines(text);
    lastCharacterWasNewline = text.endsWith("\n");
    totalBytesWritten += Buffer.byteLength(text, "utf8");
    await writeUtf8(output, text);
  };
  try {
    for await (const chunk of input) {
      const chunkBuffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
      totalBytesRead += chunkBuffer.length;
      const normalized = normalizer.normalizeChunk(chunkBuffer);
      nulBytesRemoved += normalized.nulBytesRemoved;
      invalidBytesRemoved += normalized.invalidBytesRemoved;
      controlCharsRemoved += normalized.controlCharsRemoved;
      await processText(normalized.text);
      onChunk?.({
        bytesProcessed: chunkBuffer.length,
        fileBytesProcessed: totalBytesRead,
        currentFileSize: plan.fileSize,
        processedRows: lineCount,
        nulBytesRemoved,
        invalidBytesRemoved,
        controlCharsRemoved
      });
    }
    // Drain any bytes the decoder is still holding at end of input.
    const flushed = normalizer.flush();
    nulBytesRemoved += flushed.nulBytesRemoved;
    invalidBytesRemoved += flushed.invalidBytesRemoved;
    controlCharsRemoved += flushed.controlCharsRemoved;
    await processText(flushed.text);
    // A final line without a trailing newline still counts as a row.
    if (sawAnyCharacter && !lastCharacterWasNewline) {
      lineCount += 1;
    }
  } finally {
    input.close();
    if (!output.destroyed) {
      output.end();
    }
    // Never rejects, so an error thrown inside `try` is not masked here.
    await outputClosed;
  }
  if (outputError !== void 0) {
    // Surface a write failure that happened while finishing the file.
    throw outputError;
  }
  return {
    plan,
    totalBytesRead,
    totalBytesWritten,
    sourceEncoding,
    nulBytesRemoved,
    invalidBytesRemoved,
    controlCharsRemoved,
    lineCount,
    changed: nulBytesRemoved > 0 || invalidBytesRemoved > 0 || controlCharsRemoved > 0 || totalBytesRead !== totalBytesWritten
  };
}
|
|
7933
8116
|
|
|
@@ -7990,40 +8173,54 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
|
|
|
7990
8173
|
"No recognized validated dataset files were found for sanitization."
|
|
7991
8174
|
);
|
|
7992
8175
|
}
|
|
8176
|
+
const sourceEncoding = normalizeSanitizeSourceEncoding(
|
|
8177
|
+
options.sourceEncoding
|
|
8178
|
+
);
|
|
7993
8179
|
options.onProgress?.({
|
|
7994
8180
|
kind: "start",
|
|
7995
8181
|
validatedPath,
|
|
7996
8182
|
outputPath,
|
|
7997
8183
|
totalFiles: plan.totalFiles,
|
|
7998
8184
|
totalBytes: plan.totalBytes,
|
|
7999
|
-
datasets: plan.datasets
|
|
8185
|
+
datasets: plan.datasets,
|
|
8186
|
+
sourceEncoding
|
|
8000
8187
|
});
|
|
8001
8188
|
let processedFiles = 0;
|
|
8002
8189
|
let processedRows = 0;
|
|
8003
8190
|
let processedBytes = 0;
|
|
8004
8191
|
let nulBytesRemoved = 0;
|
|
8192
|
+
let invalidBytesRemoved = 0;
|
|
8193
|
+
let controlCharsRemoved = 0;
|
|
8005
8194
|
let changedFiles = 0;
|
|
8006
8195
|
const fileSummaries = [];
|
|
8007
8196
|
for (const [index, filePlan] of plan.files.entries()) {
|
|
8008
|
-
const fileResult = await sanitizeDatasetFile(
|
|
8009
|
-
|
|
8010
|
-
|
|
8011
|
-
|
|
8012
|
-
|
|
8013
|
-
|
|
8014
|
-
|
|
8015
|
-
|
|
8016
|
-
|
|
8017
|
-
|
|
8018
|
-
|
|
8019
|
-
|
|
8020
|
-
|
|
8021
|
-
|
|
8022
|
-
|
|
8197
|
+
const fileResult = await sanitizeDatasetFile(
|
|
8198
|
+
filePlan,
|
|
8199
|
+
(chunk) => {
|
|
8200
|
+
options.onProgress?.({
|
|
8201
|
+
kind: "progress",
|
|
8202
|
+
currentFileDisplayPath: filePlan.displayPath,
|
|
8203
|
+
fileIndex: index + 1,
|
|
8204
|
+
totalFiles: plan.totalFiles,
|
|
8205
|
+
bytesProcessed: processedBytes + chunk.fileBytesProcessed,
|
|
8206
|
+
totalBytes: plan.totalBytes,
|
|
8207
|
+
fileBytesProcessed: chunk.fileBytesProcessed,
|
|
8208
|
+
currentFileSize: chunk.currentFileSize,
|
|
8209
|
+
processedRows: processedRows + chunk.processedRows,
|
|
8210
|
+
nulBytesRemoved: nulBytesRemoved + chunk.nulBytesRemoved,
|
|
8211
|
+
invalidBytesRemoved: invalidBytesRemoved + chunk.invalidBytesRemoved,
|
|
8212
|
+
controlCharsRemoved: controlCharsRemoved + chunk.controlCharsRemoved,
|
|
8213
|
+
changedFiles
|
|
8214
|
+
});
|
|
8215
|
+
},
|
|
8216
|
+
{ sourceEncoding }
|
|
8217
|
+
);
|
|
8023
8218
|
processedFiles += 1;
|
|
8024
8219
|
processedRows += fileResult.lineCount;
|
|
8025
8220
|
processedBytes += fileResult.totalBytesRead;
|
|
8026
8221
|
nulBytesRemoved += fileResult.nulBytesRemoved;
|
|
8222
|
+
invalidBytesRemoved += fileResult.invalidBytesRemoved;
|
|
8223
|
+
controlCharsRemoved += fileResult.controlCharsRemoved;
|
|
8027
8224
|
changedFiles += fileResult.changed ? 1 : 0;
|
|
8028
8225
|
fileSummaries.push({
|
|
8029
8226
|
dataset: filePlan.dataset,
|
|
@@ -8031,7 +8228,9 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
|
|
|
8031
8228
|
outputPath: filePlan.outputPath,
|
|
8032
8229
|
lineCount: fileResult.lineCount,
|
|
8033
8230
|
changed: fileResult.changed,
|
|
8034
|
-
nulBytesRemoved: fileResult.nulBytesRemoved
|
|
8231
|
+
nulBytesRemoved: fileResult.nulBytesRemoved,
|
|
8232
|
+
invalidBytesRemoved: fileResult.invalidBytesRemoved,
|
|
8233
|
+
controlCharsRemoved: fileResult.controlCharsRemoved
|
|
8035
8234
|
});
|
|
8036
8235
|
}
|
|
8037
8236
|
options.onProgress?.({
|
|
@@ -8039,6 +8238,8 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
|
|
|
8039
8238
|
totalFiles: plan.totalFiles,
|
|
8040
8239
|
processedRows,
|
|
8041
8240
|
nulBytesRemoved,
|
|
8241
|
+
invalidBytesRemoved,
|
|
8242
|
+
controlCharsRemoved,
|
|
8042
8243
|
changedFiles,
|
|
8043
8244
|
totalBytes: plan.totalBytes
|
|
8044
8245
|
});
|
|
@@ -8050,13 +8251,17 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
|
|
|
8050
8251
|
totalBytes: plan.totalBytes,
|
|
8051
8252
|
processedFiles,
|
|
8052
8253
|
processedRows,
|
|
8254
|
+
sourceEncoding,
|
|
8053
8255
|
nulBytesRemoved,
|
|
8256
|
+
invalidBytesRemoved,
|
|
8257
|
+
controlCharsRemoved,
|
|
8054
8258
|
changedFiles,
|
|
8055
8259
|
unchangedFiles: plan.totalFiles - changedFiles,
|
|
8056
8260
|
datasets: plan.datasets,
|
|
8057
8261
|
files: fileSummaries,
|
|
8058
8262
|
warnings: [
|
|
8059
|
-
"Sanitization
|
|
8263
|
+
"Sanitization now writes UTF-8 output and removes invalid bytes plus problematic control characters before PostgreSQL loading begins.",
|
|
8264
|
+
"The PostgreSQL direct import path can use --source-encoding UTF8 when reading files generated by this sanitization command.",
|
|
8060
8265
|
"The import command still keeps quarantine and row-level recovery for unexpected issues, but sanitizing first reduces the amount of slow fallback work during import."
|
|
8061
8266
|
],
|
|
8062
8267
|
nextStep: inferNextStep3(outputPath)
|
|
@@ -8849,7 +9054,7 @@ async function exportPostgresCsvDataset(inputPath, options = {}) {
|
|
|
8849
9054
|
// src/services/postgres-direct/generator.ts
|
|
8850
9055
|
import { mkdir as mkdir9, stat as stat7, writeFile as writeFile6 } from "fs/promises";
|
|
8851
9056
|
import path17 from "path";
|
|
8852
|
-
var DEFAULT_SOURCE_ENCODING = "
|
|
9057
|
+
// Encoding that normalizeSourceEncoding falls back to when the caller supplies none.
var DEFAULT_SOURCE_ENCODING = "UTF8";
|
|
8853
9058
|
function defaultPostgresDirectOutputPath(inputPath) {
|
|
8854
9059
|
const baseName = path17.basename(inputPath);
|
|
8855
9060
|
if (baseName.toLowerCase() === "sanitized") {
|
|
@@ -8864,7 +9069,7 @@ function normalizeSourceEncoding(value) {
|
|
|
8864
9069
|
const encoding = (value ?? DEFAULT_SOURCE_ENCODING).trim();
|
|
8865
9070
|
if (!/^[A-Za-z0-9_-]+$/.test(encoding)) {
|
|
8866
9071
|
throw new ValidationError(
|
|
8867
|
-
`Invalid source encoding: ${value}. Use a PostgreSQL client encoding name such as WIN1252 or
|
|
9072
|
+
`Invalid source encoding: ${value}. Use a PostgreSQL client encoding name such as UTF8, WIN1252 or LATIN1.`
|
|
8868
9073
|
);
|
|
8869
9074
|
}
|
|
8870
9075
|
return encoding.toUpperCase();
|
|
@@ -8999,7 +9204,8 @@ async function generatePostgresDirectScript(inputPath, options = {}) {
|
|
|
8999
9204
|
...validation.ok ? [] : validation.errors,
|
|
9000
9205
|
"This script imports sanitized Receita files directly with psql \\copy. It avoids rewriting the full dataset into a second CSV tree.",
|
|
9001
9206
|
"The generated script expects the database schema generated by cnpj-db-loader to be applied before execution.",
|
|
9002
|
-
"
|
|
9207
|
+
"The direct PostgreSQL script now defaults to UTF8 because the sanitize command writes clean UTF-8 files.",
|
|
9208
|
+
"Use --source-encoding WIN1252 or LATIN1 only when generating scripts for legacy sanitized files produced by older loader versions."
|
|
9003
9209
|
],
|
|
9004
9210
|
nextStep: inferNextStep5(scriptPath)
|
|
9005
9211
|
};
|