@danielarndt0/cnpj-db-loader 2.4.0-beta.1 → 2.4.0-beta.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -5
- package/dist/cli.js +301 -61
- package/dist/cli.js.map +1 -1
- package/dist/index.d.ts +14 -1
- package/dist/index.js +264 -58
- package/dist/index.js.map +1 -1
- package/docs/architecture.md +1 -1
- package/docs/cli.md +1 -1
- package/docs/commands.md +1 -1
- package/docs/postgres-direct.md +10 -10
- package/docs/sanitize.md +52 -16
- package/package.json +3 -3
- package/docs/releases/v2.4.0.md +0 -40
package/README.md
CHANGED
|
@@ -10,7 +10,7 @@ This version focuses on the real loading workflow:
|
|
|
10
10
|
- check, download, retry, clean, and inspect the latest Federal Revenue CNPJ monthly ZIP archives from the public share
|
|
11
11
|
- extract Receita Federal ZIP archives
|
|
12
12
|
- validate an extracted tree
|
|
13
|
-
- sanitize validated files before import
|
|
13
|
+
- sanitize validated files into clean UTF-8 before import, removing NUL bytes, invalid bytes and problematic control characters
|
|
14
14
|
- print or generate final, staging, or combined SQL schemas
|
|
15
15
|
- configure and test the default PostgreSQL URL
|
|
16
16
|
- import validated dataset files into PostgreSQL with:
|
|
@@ -51,7 +51,7 @@ cnpj-db-loader schema generate --profile full
|
|
|
51
51
|
cnpj-db-loader import ./downloads/<reference>/sanitized --load-batch-size 500 --materialize-batch-size 50000 --verbose-progress
|
|
52
52
|
|
|
53
53
|
# Optional hybrid path for PostgreSQL direct loading
|
|
54
|
-
cnpj-db-loader postgres generate-script ./downloads/<reference>/sanitized --output ./downloads/<reference>/postgres-direct --force
|
|
54
|
+
cnpj-db-loader postgres generate-script ./downloads/<reference>/sanitized --output ./downloads/<reference>/postgres-direct --source-encoding UTF8 --force
|
|
55
55
|
psql "postgres://postgres:postgres@localhost:5432/cnpj" -f ./downloads/<reference>/postgres-direct/import-postgres-direct.sql
|
|
56
56
|
```
|
|
57
57
|
|
|
@@ -67,7 +67,7 @@ cnpj-db-loader federal-revenue sync [reference] [--reference <yyyy-mm>] [--curre
|
|
|
67
67
|
cnpj-db-loader inspect <input>
|
|
68
68
|
cnpj-db-loader extract <input> [--output <path>]
|
|
69
69
|
cnpj-db-loader validate <input>
|
|
70
|
-
cnpj-db-loader sanitize <input> [--output <path>] [--dataset <name>] [-f]
|
|
70
|
+
cnpj-db-loader sanitize <input> [--output <path>] [--dataset <name>] [--source-encoding <encoding>] [-f]
|
|
71
71
|
cnpj-db-loader schema print [--profile <profile>]
|
|
72
72
|
cnpj-db-loader schema generate [--name <name>] [--output <path>] [--profile <profile>]
|
|
73
73
|
cnpj-db-loader database config set <url>
|
|
@@ -95,11 +95,11 @@ For local benchmarks or controlled full loads, the CLI can now generate a direct
|
|
|
95
95
|
|
|
96
96
|
```bash
|
|
97
97
|
cnpj-db-loader sanitize ./downloads/<reference>/extracted
|
|
98
|
-
cnpj-db-loader postgres generate-script ./downloads/<reference>/sanitized --output ./downloads/<reference>/postgres-direct --force
|
|
98
|
+
cnpj-db-loader postgres generate-script ./downloads/<reference>/sanitized --output ./downloads/<reference>/postgres-direct --source-encoding UTF8 --force
|
|
99
99
|
psql "postgres://postgres:postgres@localhost:5432/cnpj" -f ./downloads/<reference>/postgres-direct/import-postgres-direct.sql
|
|
100
100
|
```
|
|
101
101
|
|
|
102
|
-
This path keeps download, extraction, validation and sanitization inside the loader, then lets PostgreSQL load the sanitized Receita files directly through `\copy`, convert values into staging tables and materialize the final tables with set-based SQL. The standard `import` command remains the safest path when checkpoint resume and quarantine recovery are required.
|
|
102
|
+
This path keeps download, extraction, validation and robust UTF-8 sanitization inside the loader, then lets PostgreSQL load the sanitized Receita files directly through `\copy`, convert values into staging tables and materialize the final tables with set-based SQL. The standard `import` command remains the safest path when checkpoint resume and quarantine recovery are required.
|
|
103
103
|
|
|
104
104
|
## Logs
|
|
105
105
|
|
package/dist/cli.js
CHANGED
|
@@ -7821,81 +7821,264 @@ function isRecognizedSanitizeEntry(entry) {
|
|
|
7821
7821
|
return entry.entryKind === "file" && entry.inferredType !== "zip-archive" && entry.inferredType !== "unknown";
|
|
7822
7822
|
}
|
|
7823
7823
|
|
|
7824
|
+
// src/services/sanitize/encoding.ts
|
|
7825
|
+
import { StringDecoder } from "string_decoder";
|
|
7826
|
+
/**
 * Windows-1252 assignments for the byte range 0x80-0x9F (128-159), keyed by
 * the decimal byte value and mapped to the Unicode character it represents.
 *
 * Bytes 129, 141, 143, 144 and 157 are intentionally absent: this table has no
 * character for them, so a lookup yields `undefined`.
 *
 * The table is frozen because it is shared module state that must never be
 * modified at runtime. `var` is kept to match the bundler-emitted declarations
 * used throughout this file.
 */
var WINDOWS_1252_C1_MAP = Object.freeze({
  128: "\u20AC",
  130: "\u201A",
  131: "\u0192",
  132: "\u201E",
  133: "\u2026",
  134: "\u2020",
  135: "\u2021",
  136: "\u02C6",
  137: "\u2030",
  138: "\u0160",
  139: "\u2039",
  140: "\u0152",
  142: "\u017D",
  145: "\u2018",
  146: "\u2019",
  147: "\u201C",
  148: "\u201D",
  149: "\u2022",
  150: "\u2013",
  151: "\u2014",
  152: "\u02DC",
  153: "\u2122",
  154: "\u0161",
  155: "\u203A",
  156: "\u0153",
  158: "\u017E",
  159: "\u0178"
});
|
|
7855
|
+
/**
 * Resolves a user-supplied source-encoding label to one of the canonical
 * names used by the sanitizer: "WIN1252", "LATIN1" or "UTF8".
 *
 * Matching ignores case, surrounding whitespace and the separators `-`, `_`
 * and spaces, so "windows_1252", "cp-1252", "ISO 8859-1" and "utf-8" are all
 * accepted. `null`/`undefined` fall back to "WIN1252".
 *
 * @param {string | null | undefined} value - Encoding label to resolve.
 * @returns {"WIN1252" | "LATIN1" | "UTF8"} The canonical encoding name.
 * @throws {ValidationError} When the label is not a supported encoding,
 *   including when a non-string value is passed.
 */
function normalizeSanitizeSourceEncoding(value) {
  const requested = value ?? "WIN1252";
  // A non-string cannot name an encoding; give it an empty key so it reaches
  // the ValidationError below instead of crashing on `.trim()`.
  const key = typeof requested === "string"
    ? requested.trim().toUpperCase().replace(/[-_\s]/g, "")
    : "";
  switch (key) {
    case "WIN1252":
    case "WINDOWS1252":
    case "CP1252":
      return "WIN1252";
    case "LATIN1":
    case "ISO88591":
      return "LATIN1";
    case "UTF8":
      return "UTF8";
    default:
      throw new ValidationError(
        `Unsupported sanitize source encoding: ${String(value)}. Supported values: WIN1252, LATIN1, UTF8.`
      );
  }
}
|
|
7876
|
+
/**
 * Reports whether a control code point is preserved by the sanitizer.
 * Only code points 9, 10 and 13 (tab, line feed, carriage return) qualify.
 *
 * @param {number} codePoint - Code point (or byte value) to test.
 * @returns {boolean} `true` when the value is 9, 10 or 13.
 */
function isAllowedControlCodePoint(codePoint) {
  switch (codePoint) {
    case 9:
    case 10:
    case 13:
      return true;
    default:
      return false;
  }
}
|
|
7879
|
+
/**
 * Reports whether a code point must be stripped from sanitized text.
 *
 * Anything accepted by `isAllowedControlCodePoint` is never problematic.
 * Otherwise the code point is problematic when it lies in 0-31, equals 127,
 * lies in 128-159, or equals 65279 (U+FEFF).
 *
 * @param {number} codePoint - Code point to classify.
 * @returns {boolean} `true` when the character should be removed.
 */
function isProblematicControlCodePoint(codePoint) {
  if (isAllowedControlCodePoint(codePoint)) {
    return false;
  }
  const inLowControlRange = codePoint >= 0 && codePoint <= 31;
  const isDelete = codePoint === 127;
  const inHighControlRange = codePoint >= 128 && codePoint <= 159;
  const isByteOrderMark = codePoint === 65279;
  return inLowControlRange || isDelete || inHighControlRange || isByteOrderMark;
}
|
|
7885
|
+
/**
 * Removes unwanted characters from already-decoded text, one code point at a
 * time, and reports how many were dropped.
 *
 * - U+FFFD (65533) is dropped and counted in `invalidBytesRemoved`.
 * - Code points flagged by `isProblematicControlCodePoint` are dropped and
 *   counted in `controlCharsRemoved`.
 * - Everything else is kept in its original order.
 *
 * @param {string} text - Decoded text to clean.
 * @returns {{ text: string, invalidBytesRemoved: number, controlCharsRemoved: number }}
 */
function sanitizeDecodedText(text) {
  let kept = "";
  let invalidBytesRemoved = 0;
  let controlCharsRemoved = 0;
  for (const char of text) {
    const codePoint = char.codePointAt(0);
    if (codePoint === 65533) {
      invalidBytesRemoved += 1;
    } else if (isProblematicControlCodePoint(codePoint)) {
      controlCharsRemoved += 1;
    } else {
      kept += char;
    }
  }
  return { text: kept, invalidBytesRemoved, controlCharsRemoved };
}
|
|
7907
|
+
/**
 * Converts raw file chunks in a known source encoding into sanitized text.
 *
 * For "UTF8" the bytes go through a streaming `StringDecoder`, so a
 * multi-byte sequence split across two chunks is decoded correctly; `flush()`
 * must be called after the last chunk to drain any pending partial sequence.
 * For the single-byte encodings every byte is mapped on its own and `flush()`
 * has nothing to emit.
 *
 * Every result has the shape
 * `{ text, nulBytesRemoved, invalidBytesRemoved, controlCharsRemoved }`.
 * A NUL is reported in `nulBytesRemoved` only, for every source encoding.
 */
var SanitizeEncodingNormalizer = class {
  sourceEncoding;
  utf8Decoder;

  /**
   * @param {"WIN1252" | "LATIN1" | "UTF8"} sourceEncoding - Canonical source
   *   encoding name; any value other than "UTF8" selects the single-byte path.
   */
  constructor(sourceEncoding) {
    this.sourceEncoding = sourceEncoding;
    this.utf8Decoder = sourceEncoding === "UTF8" ? new StringDecoder("utf8") : void 0;
  }

  /**
   * Sanitizes one chunk of raw bytes.
   *
   * @param {Buffer} chunk - Raw bytes read from the source file.
   * @returns {{ text: string, nulBytesRemoved: number, invalidBytesRemoved: number, controlCharsRemoved: number }}
   */
  normalizeChunk(chunk) {
    if (this.sourceEncoding === "UTF8") {
      return this.sanitizeUtf8Text(this.utf8Decoder.write(chunk));
    }
    return this.normalizeSingleByteChunk(chunk);
  }

  /**
   * Drains whatever the UTF-8 decoder still holds after the final chunk.
   * Returns an empty result when no UTF-8 decoder is in use.
   *
   * @returns {{ text: string, nulBytesRemoved: number, invalidBytesRemoved: number, controlCharsRemoved: number }}
   */
  flush() {
    if (!this.utf8Decoder) {
      return {
        text: "",
        nulBytesRemoved: 0,
        invalidBytesRemoved: 0,
        controlCharsRemoved: 0
      };
    }
    return this.sanitizeUtf8Text(this.utf8Decoder.end());
  }

  /**
   * Shared UTF-8 post-processing for `normalizeChunk` and `flush`.
   *
   * NUL characters are counted and stripped here, before the generic
   * sanitizer runs, so each NUL is reported exactly once (as a NUL) rather
   * than also being counted as a removed control character.
   *
   * @param {string} decoded - Text produced by the UTF-8 decoder.
   * @returns {{ text: string, nulBytesRemoved: number, invalidBytesRemoved: number, controlCharsRemoved: number }}
   */
  sanitizeUtf8Text(decoded) {
    const withoutNul = decoded.replace(/\u0000/g, "");
    // Every NUL is a single UTF-16 unit, so the length difference is the count.
    const nulBytesRemoved = decoded.length - withoutNul.length;
    const sanitized = sanitizeDecodedText(withoutNul);
    return {
      ...sanitized,
      nulBytesRemoved
    };
  }

  /**
   * Sanitizes a chunk for the single-byte encodings (WIN1252 / LATIN1).
   *
   * Per byte:
   * - 0 is dropped and counted as a NUL;
   * - other bytes below 32, and 127, are kept only when
   *   `isAllowedControlCodePoint` accepts them, otherwise counted as controls;
   * - 128-159 are translated through `WINDOWS_1252_C1_MAP` for WIN1252
   *   (unmapped bytes count as invalid) and dropped as controls otherwise;
   * - every remaining byte becomes the character with the same code.
   *
   * @param {Buffer} chunk - Raw bytes read from the source file.
   * @returns {{ text: string, nulBytesRemoved: number, invalidBytesRemoved: number, controlCharsRemoved: number }}
   */
  normalizeSingleByteChunk(chunk) {
    const output2 = [];
    let nulBytesRemoved = 0;
    let invalidBytesRemoved = 0;
    let controlCharsRemoved = 0;
    for (const byte of chunk) {
      if (byte === 0) {
        nulBytesRemoved += 1;
        continue;
      }
      if (byte < 32 || byte === 127) {
        if (isAllowedControlCodePoint(byte)) {
          output2.push(String.fromCharCode(byte));
        } else {
          controlCharsRemoved += 1;
        }
        continue;
      }
      if (byte >= 128 && byte <= 159) {
        if (this.sourceEncoding === "WIN1252") {
          const mapped = WINDOWS_1252_C1_MAP[byte];
          if (mapped === void 0) {
            invalidBytesRemoved += 1;
          } else {
            output2.push(mapped);
          }
        } else {
          controlCharsRemoved += 1;
        }
        continue;
      }
      output2.push(String.fromCharCode(byte));
    }
    return {
      text: output2.join(""),
      nulBytesRemoved,
      invalidBytesRemoved,
      controlCharsRemoved
    };
  }
};
|
|
7986
|
+
|
|
7824
7987
|
// src/services/sanitize/runner.ts
|
|
7825
7988
|
import { createReadStream as createReadStream2, createWriteStream as createWriteStream2 } from "fs";
|
|
7826
7989
|
import { mkdir as mkdir7 } from "fs/promises";
|
|
7827
7990
|
import path13 from "path";
|
|
7828
|
-
function
|
|
7829
|
-
|
|
7830
|
-
|
|
7831
|
-
if (chunk[index] === 0) {
|
|
7832
|
-
removed += 1;
|
|
7833
|
-
}
|
|
7991
|
+
/**
 * Writes a string to a writable stream as UTF-8, honoring backpressure.
 *
 * An empty string is skipped entirely. When `write()` reports that the
 * internal buffer is full, the returned promise settles on the next "drain"
 * (resolve) or "error" (reject) event.
 *
 * Whichever event arrives first also detaches the listener registered for the
 * other one. Without that, every backpressure wait would leave one stale
 * listener behind and they would pile up on the stream over a large file.
 *
 * @param {import("stream").Writable} output2 - Destination stream.
 * @param {string} value - Text to write.
 * @returns {Promise<void>}
 */
async function writeUtf8(output2, value) {
  if (value.length === 0) {
    return;
  }
  if (output2.write(value, "utf8")) {
    return;
  }
  await new Promise((resolve2, reject) => {
    const onDrain = () => {
      output2.removeListener("error", onError);
      resolve2();
    };
    const onError = (error) => {
      output2.removeListener("drain", onDrain);
      reject(error);
    };
    output2.once("drain", onDrain);
    output2.once("error", onError);
  });
}
|
|
8002
|
+
/**
 * Counts the line-feed ("\n") characters in a string.
 *
 * @param {string} value - Text to scan.
 * @returns {number} Number of "\n" occurrences.
 */
function countNewlines(value) {
  let total = 0;
  let position = value.indexOf("\n");
  while (position !== -1) {
    total += 1;
    position = value.indexOf("\n", position + 1);
  }
  return total;
}
|
|
7849
|
-
async function sanitizeDatasetFile(plan, onChunk) {
|
|
8011
|
+
/**
 * Streams one planned dataset file through the encoding normalizer and writes
 * the cleaned text to `plan.outputPath` as UTF-8.
 *
 * The output directory is created first. Each chunk read from
 * `plan.absolutePath` is normalized, written, and reported through `onChunk`
 * with running totals for this file. After the last chunk the normalizer is
 * flushed, and a final line without a trailing "\n" is still counted as a line.
 *
 * The output stream's "finish"/"error" outcome is observed from the moment the
 * stream is created. That way an early failure (for example the file cannot be
 * opened) rejects this function instead of becoming an uncaught "error" event,
 * and teardown never waits on an event that has already fired. When the loop
 * itself failed, that failure is the one reported.
 *
 * @param {{ absolutePath: string, outputPath: string, fileSize: number }} plan -
 *   File plan; only these three fields are read here and the object is echoed
 *   back unchanged in the result.
 * @param {(progress: object) => void} [onChunk] - Optional per-chunk callback.
 * @param {{ sourceEncoding?: string }} [options] - Optional source encoding
 *   label, resolved through `normalizeSanitizeSourceEncoding`.
 * @returns {Promise<object>} Summary with `plan`, byte totals, `sourceEncoding`,
 *   removal counters, `lineCount` and `changed`.
 */
async function sanitizeDatasetFile(plan, onChunk, options = {}) {
  await mkdir7(path13.dirname(plan.outputPath), { recursive: true });
  const sourceEncoding = normalizeSanitizeSourceEncoding(
    options.sourceEncoding
  );
  const normalizer = new SanitizeEncodingNormalizer(sourceEncoding);
  const input2 = createReadStream2(plan.absolutePath);
  const output2 = createWriteStream2(plan.outputPath, { encoding: "utf8" });

  // Track the output stream's fate from the start. The "error" listener stays
  // attached for the stream's lifetime so a late error can never be uncaught.
  let outputError;
  const outputSettled = new Promise((resolve2, reject) => {
    output2.once("finish", () => resolve2());
    output2.on("error", (error) => {
      if (outputError === void 0) {
        outputError = error;
      }
      reject(error);
    });
  });
  // The promise is awaited only during teardown; mark it handled until then.
  outputSettled.catch(() => {
  });

  let totalBytesRead = 0;
  let totalBytesWritten = 0;
  let nulBytesRemoved = 0;
  let invalidBytesRemoved = 0;
  let controlCharsRemoved = 0;
  let lineCount = 0;
  let sawAnyCharacter = false;
  let lastCharacterWasNewline = false;
  const processText = async (text) => {
    if (text.length === 0) {
      return;
    }
    // A stream that already failed emits neither "drain" nor another "error",
    // so writing to it could wait forever; fail fast instead.
    if (outputError !== void 0) {
      throw outputError;
    }
    sawAnyCharacter = true;
    lineCount += countNewlines(text);
    lastCharacterWasNewline = text.endsWith("\n");
    totalBytesWritten += Buffer.byteLength(text, "utf8");
    await writeUtf8(output2, text);
  };

  let bodyFailed = false;
  try {
    for await (const chunk of input2) {
      const chunkBuffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
      totalBytesRead += chunkBuffer.length;
      const normalized = normalizer.normalizeChunk(chunkBuffer);
      nulBytesRemoved += normalized.nulBytesRemoved;
      invalidBytesRemoved += normalized.invalidBytesRemoved;
      controlCharsRemoved += normalized.controlCharsRemoved;
      await processText(normalized.text);
      onChunk?.({
        bytesProcessed: chunkBuffer.length,
        fileBytesProcessed: totalBytesRead,
        currentFileSize: plan.fileSize,
        processedRows: lineCount,
        nulBytesRemoved,
        invalidBytesRemoved,
        controlCharsRemoved
      });
    }
    const flushed = normalizer.flush();
    nulBytesRemoved += flushed.nulBytesRemoved;
    invalidBytesRemoved += flushed.invalidBytesRemoved;
    controlCharsRemoved += flushed.controlCharsRemoved;
    await processText(flushed.text);
    if (sawAnyCharacter && !lastCharacterWasNewline) {
      lineCount += 1;
    }
  } catch (error) {
    bodyFailed = true;
    throw error;
  } finally {
    input2.close();
    output2.end();
    try {
      await outputSettled;
    } catch (error) {
      // Keep the loop's own failure as the reported error when there is one.
      if (!bodyFailed) {
        throw error;
      }
    }
  }
  return {
    plan,
    totalBytesRead,
    totalBytesWritten,
    sourceEncoding,
    nulBytesRemoved,
    invalidBytesRemoved,
    controlCharsRemoved,
    lineCount,
    changed: nulBytesRemoved > 0 || invalidBytesRemoved > 0 || controlCharsRemoved > 0 || totalBytesRead !== totalBytesWritten
  };
}
|
|
7901
8084
|
|
|
@@ -7958,40 +8141,54 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
|
|
|
7958
8141
|
"No recognized validated dataset files were found for sanitization."
|
|
7959
8142
|
);
|
|
7960
8143
|
}
|
|
8144
|
+
const sourceEncoding = normalizeSanitizeSourceEncoding(
|
|
8145
|
+
options.sourceEncoding
|
|
8146
|
+
);
|
|
7961
8147
|
options.onProgress?.({
|
|
7962
8148
|
kind: "start",
|
|
7963
8149
|
validatedPath,
|
|
7964
8150
|
outputPath,
|
|
7965
8151
|
totalFiles: plan.totalFiles,
|
|
7966
8152
|
totalBytes: plan.totalBytes,
|
|
7967
|
-
datasets: plan.datasets
|
|
8153
|
+
datasets: plan.datasets,
|
|
8154
|
+
sourceEncoding
|
|
7968
8155
|
});
|
|
7969
8156
|
let processedFiles = 0;
|
|
7970
8157
|
let processedRows = 0;
|
|
7971
8158
|
let processedBytes = 0;
|
|
7972
8159
|
let nulBytesRemoved = 0;
|
|
8160
|
+
let invalidBytesRemoved = 0;
|
|
8161
|
+
let controlCharsRemoved = 0;
|
|
7973
8162
|
let changedFiles = 0;
|
|
7974
8163
|
const fileSummaries = [];
|
|
7975
8164
|
for (const [index, filePlan] of plan.files.entries()) {
|
|
7976
|
-
const fileResult = await sanitizeDatasetFile(
|
|
7977
|
-
|
|
7978
|
-
|
|
7979
|
-
|
|
7980
|
-
|
|
7981
|
-
|
|
7982
|
-
|
|
7983
|
-
|
|
7984
|
-
|
|
7985
|
-
|
|
7986
|
-
|
|
7987
|
-
|
|
7988
|
-
|
|
7989
|
-
|
|
7990
|
-
|
|
8165
|
+
const fileResult = await sanitizeDatasetFile(
|
|
8166
|
+
filePlan,
|
|
8167
|
+
(chunk) => {
|
|
8168
|
+
options.onProgress?.({
|
|
8169
|
+
kind: "progress",
|
|
8170
|
+
currentFileDisplayPath: filePlan.displayPath,
|
|
8171
|
+
fileIndex: index + 1,
|
|
8172
|
+
totalFiles: plan.totalFiles,
|
|
8173
|
+
bytesProcessed: processedBytes + chunk.fileBytesProcessed,
|
|
8174
|
+
totalBytes: plan.totalBytes,
|
|
8175
|
+
fileBytesProcessed: chunk.fileBytesProcessed,
|
|
8176
|
+
currentFileSize: chunk.currentFileSize,
|
|
8177
|
+
processedRows: processedRows + chunk.processedRows,
|
|
8178
|
+
nulBytesRemoved: nulBytesRemoved + chunk.nulBytesRemoved,
|
|
8179
|
+
invalidBytesRemoved: invalidBytesRemoved + chunk.invalidBytesRemoved,
|
|
8180
|
+
controlCharsRemoved: controlCharsRemoved + chunk.controlCharsRemoved,
|
|
8181
|
+
changedFiles
|
|
8182
|
+
});
|
|
8183
|
+
},
|
|
8184
|
+
{ sourceEncoding }
|
|
8185
|
+
);
|
|
7991
8186
|
processedFiles += 1;
|
|
7992
8187
|
processedRows += fileResult.lineCount;
|
|
7993
8188
|
processedBytes += fileResult.totalBytesRead;
|
|
7994
8189
|
nulBytesRemoved += fileResult.nulBytesRemoved;
|
|
8190
|
+
invalidBytesRemoved += fileResult.invalidBytesRemoved;
|
|
8191
|
+
controlCharsRemoved += fileResult.controlCharsRemoved;
|
|
7995
8192
|
changedFiles += fileResult.changed ? 1 : 0;
|
|
7996
8193
|
fileSummaries.push({
|
|
7997
8194
|
dataset: filePlan.dataset,
|
|
@@ -7999,7 +8196,9 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
|
|
|
7999
8196
|
outputPath: filePlan.outputPath,
|
|
8000
8197
|
lineCount: fileResult.lineCount,
|
|
8001
8198
|
changed: fileResult.changed,
|
|
8002
|
-
nulBytesRemoved: fileResult.nulBytesRemoved
|
|
8199
|
+
nulBytesRemoved: fileResult.nulBytesRemoved,
|
|
8200
|
+
invalidBytesRemoved: fileResult.invalidBytesRemoved,
|
|
8201
|
+
controlCharsRemoved: fileResult.controlCharsRemoved
|
|
8003
8202
|
});
|
|
8004
8203
|
}
|
|
8005
8204
|
options.onProgress?.({
|
|
@@ -8007,6 +8206,8 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
|
|
|
8007
8206
|
totalFiles: plan.totalFiles,
|
|
8008
8207
|
processedRows,
|
|
8009
8208
|
nulBytesRemoved,
|
|
8209
|
+
invalidBytesRemoved,
|
|
8210
|
+
controlCharsRemoved,
|
|
8010
8211
|
changedFiles,
|
|
8011
8212
|
totalBytes: plan.totalBytes
|
|
8012
8213
|
});
|
|
@@ -8018,13 +8219,17 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
|
|
|
8018
8219
|
totalBytes: plan.totalBytes,
|
|
8019
8220
|
processedFiles,
|
|
8020
8221
|
processedRows,
|
|
8222
|
+
sourceEncoding,
|
|
8021
8223
|
nulBytesRemoved,
|
|
8224
|
+
invalidBytesRemoved,
|
|
8225
|
+
controlCharsRemoved,
|
|
8022
8226
|
changedFiles,
|
|
8023
8227
|
unchangedFiles: plan.totalFiles - changedFiles,
|
|
8024
8228
|
datasets: plan.datasets,
|
|
8025
8229
|
files: fileSummaries,
|
|
8026
8230
|
warnings: [
|
|
8027
|
-
"Sanitization
|
|
8231
|
+
"Sanitization now writes UTF-8 output and removes invalid bytes plus problematic control characters before PostgreSQL loading begins.",
|
|
8232
|
+
"The PostgreSQL direct import path can use --source-encoding UTF8 when reading files generated by this sanitization command.",
|
|
8028
8233
|
"The import command still keeps quarantine and row-level recovery for unexpected issues, but sanitizing first reduces the amount of slow fallback work during import."
|
|
8029
8234
|
],
|
|
8030
8235
|
nextStep: inferNextStep3(outputPath)
|
|
@@ -8817,7 +9022,7 @@ async function exportPostgresCsvDataset(inputPath, options = {}) {
|
|
|
8817
9022
|
// src/services/postgres-direct/generator.ts
|
|
8818
9023
|
import { mkdir as mkdir9, stat as stat7, writeFile as writeFile6 } from "fs/promises";
|
|
8819
9024
|
import path17 from "path";
|
|
8820
|
-
var DEFAULT_SOURCE_ENCODING = "
|
|
9025
|
+
var DEFAULT_SOURCE_ENCODING = "UTF8";
|
|
8821
9026
|
function defaultPostgresDirectOutputPath(inputPath) {
|
|
8822
9027
|
const baseName = path17.basename(inputPath);
|
|
8823
9028
|
if (baseName.toLowerCase() === "sanitized") {
|
|
@@ -8832,7 +9037,7 @@ function normalizeSourceEncoding(value) {
|
|
|
8832
9037
|
const encoding = (value ?? DEFAULT_SOURCE_ENCODING).trim();
|
|
8833
9038
|
if (!/^[A-Za-z0-9_-]+$/.test(encoding)) {
|
|
8834
9039
|
throw new ValidationError(
|
|
8835
|
-
`Invalid source encoding: ${value}. Use a PostgreSQL client encoding name such as WIN1252 or
|
|
9040
|
+
`Invalid source encoding: ${value}. Use a PostgreSQL client encoding name such as UTF8, WIN1252 or LATIN1.`
|
|
8836
9041
|
);
|
|
8837
9042
|
}
|
|
8838
9043
|
return encoding.toUpperCase();
|
|
@@ -8967,7 +9172,8 @@ async function generatePostgresDirectScript(inputPath, options = {}) {
|
|
|
8967
9172
|
...validation.ok ? [] : validation.errors,
|
|
8968
9173
|
"This script imports sanitized Receita files directly with psql \\copy. It avoids rewriting the full dataset into a second CSV tree.",
|
|
8969
9174
|
"The generated script expects the database schema generated by cnpj-db-loader to be applied before execution.",
|
|
8970
|
-
"
|
|
9175
|
+
"The direct PostgreSQL script now defaults to UTF8 because the sanitize command writes clean UTF-8 files.",
|
|
9176
|
+
"Use --source-encoding WIN1252 or LATIN1 only when generating scripts for legacy sanitized files produced by older loader versions."
|
|
8971
9177
|
],
|
|
8972
9178
|
nextStep: inferNextStep5(scriptPath)
|
|
8973
9179
|
};
|
|
@@ -9271,9 +9477,23 @@ function printSanitizeSummary(summary, logFilePath) {
|
|
|
9271
9477
|
console.log(
|
|
9272
9478
|
formatKeyValue("Processed bytes", formatBytes(summary.totalBytes))
|
|
9273
9479
|
);
|
|
9480
|
+
console.log(formatKeyValue("Source encoding", summary.sourceEncoding));
|
|
9481
|
+
console.log(formatKeyValue("Output encoding", "UTF8"));
|
|
9274
9482
|
console.log(
|
|
9275
9483
|
formatKeyValue("Removed NUL bytes", formatCount(summary.nulBytesRemoved))
|
|
9276
9484
|
);
|
|
9485
|
+
console.log(
|
|
9486
|
+
formatKeyValue(
|
|
9487
|
+
"Removed invalid bytes",
|
|
9488
|
+
formatCount(summary.invalidBytesRemoved)
|
|
9489
|
+
)
|
|
9490
|
+
);
|
|
9491
|
+
console.log(
|
|
9492
|
+
formatKeyValue(
|
|
9493
|
+
"Removed control chars",
|
|
9494
|
+
formatCount(summary.controlCharsRemoved)
|
|
9495
|
+
)
|
|
9496
|
+
);
|
|
9277
9497
|
console.log(formatKeyValue("Changed files", summary.changedFiles));
|
|
9278
9498
|
console.log(formatKeyValue("Unchanged files", summary.unchangedFiles));
|
|
9279
9499
|
if (summary.datasets.length > 0) {
|
|
@@ -10053,8 +10273,9 @@ function createSanitizeProgressReporter() {
|
|
|
10053
10273
|
`Validated: ${shortPath(event.validatedPath)}`,
|
|
10054
10274
|
`Output: ${shortPath(event.outputPath)}`,
|
|
10055
10275
|
`Datasets: ${event.datasets.join(" > ")}`,
|
|
10276
|
+
`Source encoding: ${event.sourceEncoding} > UTF8`,
|
|
10056
10277
|
`Files: 0/${formatCount(event.totalFiles)} | Bytes: ${formatBytes(0)} / ${formatBytes(event.totalBytes)}`,
|
|
10057
|
-
`Rows
|
|
10278
|
+
`Rows: ${formatCount(0)} | NUL: ${formatCount(0)} | Invalid bytes: ${formatCount(0)} | Controls: ${formatCount(0)}`,
|
|
10058
10279
|
`Current: waiting...`
|
|
10059
10280
|
];
|
|
10060
10281
|
renderBlock([
|
|
@@ -10070,8 +10291,9 @@ function createSanitizeProgressReporter() {
|
|
|
10070
10291
|
currentLines[1] ?? "",
|
|
10071
10292
|
currentLines[2] ?? "",
|
|
10072
10293
|
currentLines[3] ?? "",
|
|
10294
|
+
currentLines[4] ?? "",
|
|
10073
10295
|
`Files: ${formatCount(event.fileIndex)}/${formatCount(event.totalFiles)} | Bytes: ${formatBytes(event.bytesProcessed)} / ${formatBytes(event.totalBytes)}`,
|
|
10074
|
-
`Rows
|
|
10296
|
+
`Rows: ${formatCount(event.processedRows)} | NUL: ${formatCount(event.nulBytesRemoved)} | Invalid bytes: ${formatCount(event.invalidBytesRemoved)} | Controls: ${formatCount(event.controlCharsRemoved)} | Changed: ${formatCount(event.changedFiles)}`,
|
|
10075
10297
|
`Current: ${shortPath(event.currentFileDisplayPath)}`
|
|
10076
10298
|
];
|
|
10077
10299
|
renderBlock([
|
|
@@ -10091,6 +10313,18 @@ function createSanitizeProgressReporter() {
|
|
|
10091
10313
|
console.log(
|
|
10092
10314
|
formatKeyValue("Removed NUL bytes", formatCount(event.nulBytesRemoved))
|
|
10093
10315
|
);
|
|
10316
|
+
console.log(
|
|
10317
|
+
formatKeyValue(
|
|
10318
|
+
"Removed invalid bytes",
|
|
10319
|
+
formatCount(event.invalidBytesRemoved)
|
|
10320
|
+
)
|
|
10321
|
+
);
|
|
10322
|
+
console.log(
|
|
10323
|
+
formatKeyValue(
|
|
10324
|
+
"Removed control chars",
|
|
10325
|
+
formatCount(event.controlCharsRemoved)
|
|
10326
|
+
)
|
|
10327
|
+
);
|
|
10094
10328
|
console.log(
|
|
10095
10329
|
formatKeyValue("Changed files", formatCount(event.changedFiles))
|
|
10096
10330
|
);
|
|
@@ -11203,7 +11437,7 @@ function registerPostgresCommands(program) {
|
|
|
11203
11437
|
"Generated psql script file name. Defaults to import-postgres-direct.sql."
|
|
11204
11438
|
).option(
|
|
11205
11439
|
"--source-encoding <encoding>",
|
|
11206
|
-
"PostgreSQL client encoding used while reading sanitized Receita files. Defaults to
|
|
11440
|
+
"PostgreSQL client encoding used while reading sanitized Receita files. Defaults to UTF8."
|
|
11207
11441
|
).option("-f, --force", "Skip the confirmation prompt.").description(
|
|
11208
11442
|
"Generate a direct psql import script that loads sanitized Receita files without rewriting them into new CSV files."
|
|
11209
11443
|
).action(
|
|
@@ -11353,6 +11587,9 @@ function registerSanitizeCommands(program) {
|
|
|
11353
11587
|
).option(
|
|
11354
11588
|
"--dataset <dataset>",
|
|
11355
11589
|
"Sanitize only one validated dataset block (for example: establishments or companies)."
|
|
11590
|
+
).option(
|
|
11591
|
+
"--source-encoding <encoding>",
|
|
11592
|
+
"Source file encoding used while reading Receita files. Defaults to WIN1252 and writes clean UTF-8 output."
|
|
11356
11593
|
).option("-f, --force", "Skip the confirmation prompt.").description(
|
|
11357
11594
|
"Prepare a sanitized dataset tree before import by removing known low-level byte issues such as NUL bytes."
|
|
11358
11595
|
).action(
|
|
@@ -11376,6 +11613,9 @@ function registerSanitizeCommands(program) {
|
|
|
11376
11613
|
if (options.dataset) {
|
|
11377
11614
|
sanitizeOptions.dataset = options.dataset;
|
|
11378
11615
|
}
|
|
11616
|
+
if (options.sourceEncoding) {
|
|
11617
|
+
sanitizeOptions.sourceEncoding = options.sourceEncoding;
|
|
11618
|
+
}
|
|
11379
11619
|
const summary = await sanitizeInputDirectory(input2, sanitizeOptions);
|
|
11380
11620
|
const logFilePath = await writeCommandLog("sanitize", summary);
|
|
11381
11621
|
printSanitizeSummary(summary, logFilePath);
|