@danielarndt0/cnpj-db-loader 2.4.0-beta.1 → 2.4.0-beta.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -8
- package/dist/cli.js +856 -137
- package/dist/cli.js.map +1 -1
- package/dist/index.d.ts +33 -1
- package/dist/index.js +782 -133
- package/dist/index.js.map +1 -1
- package/docs/architecture.md +1 -1
- package/docs/cli.md +1 -1
- package/docs/commands.md +6 -2
- package/docs/postgres-direct.md +239 -45
- package/docs/releases/v2.4.0-beta.3.md +42 -0
- package/docs/sanitize.md +52 -16
- package/package.json +3 -3
- package/docs/releases/v2.4.0.md +0 -40
package/dist/cli.js
CHANGED
|
@@ -7821,81 +7821,264 @@ function isRecognizedSanitizeEntry(entry) {
|
|
|
7821
7821
|
return entry.entryKind === "file" && entry.inferredType !== "zip-archive" && entry.inferredType !== "unknown";
|
|
7822
7822
|
}
|
|
7823
7823
|
|
|
7824
|
+
// src/services/sanitize/encoding.ts
|
|
7825
|
+
import { StringDecoder } from "string_decoder";
|
|
7826
|
+
// Lookup table from byte values in the 0x80-0x9F range to the characters they
// are replaced with when the source encoding is WIN1252.
// Bytes 0x81, 0x8D, 0x8F, 0x90 and 0x9D have no entry in this table.
var WINDOWS_1252_C1_MAP = {
  0x80: "\u20AC",
  0x82: "\u201A",
  0x83: "\u0192",
  0x84: "\u201E",
  0x85: "\u2026",
  0x86: "\u2020",
  0x87: "\u2021",
  0x88: "\u02C6",
  0x89: "\u2030",
  0x8a: "\u0160",
  0x8b: "\u2039",
  0x8c: "\u0152",
  0x8e: "\u017D",
  0x91: "\u2018",
  0x92: "\u2019",
  0x93: "\u201C",
  0x94: "\u201D",
  0x95: "\u2022",
  0x96: "\u2013",
  0x97: "\u2014",
  0x98: "\u02DC",
  0x99: "\u2122",
  0x9a: "\u0161",
  0x9b: "\u203A",
  0x9c: "\u0153",
  0x9e: "\u017E",
  0x9f: "\u0178"
};
|
|
7855
|
+
/**
 * Resolves a user-supplied source-encoding name to one of the canonical
 * identifiers used by the sanitizer: "WIN1252", "LATIN1" or "UTF8".
 *
 * Matching is case-insensitive and ignores whitespace, underscores and
 * hyphens, so "windows-1252", "WINDOWS_1252", "Windows1252" and "CP-1252"
 * all resolve to "WIN1252".
 *
 * @param {string | null | undefined} value - Requested encoding; `null` or
 *   `undefined` selects the default, "WIN1252".
 * @returns {"WIN1252" | "LATIN1" | "UTF8"} The canonical encoding name.
 * @throws {ValidationError} When the value is not a recognized encoding name
 *   (including non-string values).
 */
function normalizeSanitizeSourceEncoding(value) {
  const requested = value ?? "WIN1252";
  // Non-string input must surface as a ValidationError, not as a TypeError
  // from calling string methods on it, so it is mapped to an unmatched key.
  const key = typeof requested === "string" ? requested.toUpperCase().replace(/[\s_-]+/g, "") : "";
  switch (key) {
    case "WIN1252":
    case "WINDOWS1252":
    case "CP1252":
      return "WIN1252";
    case "LATIN1":
    case "ISO88591":
      return "LATIN1";
    case "UTF8":
      return "UTF8";
    default:
      throw new ValidationError(
        `Unsupported sanitize source encoding: ${value}. Supported values: WIN1252, LATIN1, UTF8.`
      );
  }
}
|
|
7876
|
+
/**
 * Tells whether a control code point is kept in sanitized output.
 * Only tab (9), line feed (10) and carriage return (13) are kept.
 *
 * @param {number} codePoint
 * @returns {boolean}
 */
function isAllowedControlCodePoint(codePoint) {
  return codePoint === 9 || codePoint === 10 || codePoint === 13;
}

/**
 * Tells whether a code point must be stripped from sanitized output:
 * code points 0-31 (except the allowed ones), 127, 128-159, and U+FEFF (65279).
 *
 * @param {number} codePoint
 * @returns {boolean}
 */
function isProblematicControlCodePoint(codePoint) {
  if (isAllowedControlCodePoint(codePoint)) {
    return false;
  }
  return (
    (codePoint >= 0 && codePoint <= 31) ||
    codePoint === 127 ||
    (codePoint >= 128 && codePoint <= 159) ||
    codePoint === 65279
  );
}

/**
 * Removes replacement characters and problematic control characters from
 * already-decoded text.
 *
 * Each U+FFFD (65533) removed is counted in `invalidBytesRemoved`; every other
 * removed character, NUL included, is counted in `controlCharsRemoved`.
 *
 * @param {string} text - Decoded text to clean.
 * @returns {{ text: string, invalidBytesRemoved: number, controlCharsRemoved: number }}
 */
function sanitizeDecodedText(text) {
  const output2 = [];
  let invalidBytesRemoved = 0;
  let controlCharsRemoved = 0;
  // Iterating the string yields whole code points, so surrogate pairs stay intact.
  for (const char of text) {
    const codePoint = char.codePointAt(0);
    if (codePoint === 65533) {
      invalidBytesRemoved += 1;
      continue;
    }
    if (isProblematicControlCodePoint(codePoint)) {
      controlCharsRemoved += 1;
      continue;
    }
    output2.push(char);
  }
  return {
    text: output2.join(""),
    invalidBytesRemoved,
    controlCharsRemoved
  };
}

/**
 * Streaming normalizer that turns raw input chunks into sanitized text and
 * reports how many NUL bytes, invalid bytes and control characters it dropped.
 *
 * For "UTF8" a StringDecoder carries incomplete multi-byte sequences across
 * chunk boundaries; any other source encoding is handled byte by byte.
 * Every removed character is counted in exactly one counter, whichever path
 * is taken.
 */
var SanitizeEncodingNormalizer = class {
  /**
   * @param {"WIN1252" | "LATIN1" | "UTF8"} sourceEncoding - Canonical encoding name.
   */
  constructor(sourceEncoding) {
    this.sourceEncoding = sourceEncoding;
    this.utf8Decoder = sourceEncoding === "UTF8" ? new StringDecoder("utf8") : void 0;
  }
  sourceEncoding;
  utf8Decoder;
  /**
   * Sanitizes one input chunk.
   *
   * @param {Buffer} chunk - Raw bytes read from the source file.
   * @returns {{ text: string, nulBytesRemoved: number, invalidBytesRemoved: number, controlCharsRemoved: number }}
   */
  normalizeChunk(chunk) {
    if (this.sourceEncoding === "UTF8") {
      return this.#sanitizeUtf8Text(this.utf8Decoder.write(chunk));
    }
    return this.normalizeSingleByteChunk(chunk);
  }
  /**
   * Emits whatever the UTF-8 decoder still buffers once the input has ended.
   * Non-UTF8 normalizers hold no state, so they return an empty result.
   *
   * @returns {{ text: string, nulBytesRemoved: number, invalidBytesRemoved: number, controlCharsRemoved: number }}
   */
  flush() {
    if (!this.utf8Decoder) {
      return {
        text: "",
        nulBytesRemoved: 0,
        invalidBytesRemoved: 0,
        controlCharsRemoved: 0
      };
    }
    return this.#sanitizeUtf8Text(this.utf8Decoder.end());
  }
  /**
   * Sanitizes decoded UTF-8 text and separates NULs from other control characters.
   *
   * sanitizeDecodedText counts NUL among the control characters it removes.
   * That share is moved into `nulBytesRemoved` so a NUL is not reported twice,
   * which keeps the counters consistent with the single-byte path.
   */
  #sanitizeUtf8Text(decoded) {
    const sanitized = sanitizeDecodedText(decoded);
    let nulBytesRemoved = 0;
    for (let index = 0; index < decoded.length; index += 1) {
      if (decoded.charCodeAt(index) === 0) {
        nulBytesRemoved += 1;
      }
    }
    return {
      ...sanitized,
      controlCharsRemoved: sanitized.controlCharsRemoved - nulBytesRemoved,
      nulBytesRemoved
    };
  }
  /**
   * Sanitizes a chunk of a single-byte encoding (every byte is one character).
   *
   * - NUL bytes are dropped and counted in `nulBytesRemoved`.
   * - Bytes below 32 and byte 127 are dropped unless they are tab, LF or CR.
   * - Bytes 128-159: for "WIN1252" they are replaced through
   *   WINDOWS_1252_C1_MAP, and bytes without an entry are dropped and counted
   *   as invalid; for other encodings they are dropped as control characters.
   * - All remaining bytes map to the code point of the same value.
   *
   * @param {Buffer} chunk
   * @returns {{ text: string, nulBytesRemoved: number, invalidBytesRemoved: number, controlCharsRemoved: number }}
   */
  normalizeSingleByteChunk(chunk) {
    const output2 = [];
    let nulBytesRemoved = 0;
    let invalidBytesRemoved = 0;
    let controlCharsRemoved = 0;
    for (const byte of chunk) {
      if (byte === 0) {
        nulBytesRemoved += 1;
        continue;
      }
      if (byte < 32 || byte === 127) {
        if (isAllowedControlCodePoint(byte)) {
          output2.push(String.fromCharCode(byte));
        } else {
          controlCharsRemoved += 1;
        }
        continue;
      }
      if (byte >= 128 && byte <= 159) {
        if (this.sourceEncoding === "WIN1252") {
          const mapped = WINDOWS_1252_C1_MAP[byte];
          if (mapped === void 0) {
            invalidBytesRemoved += 1;
          } else {
            output2.push(mapped);
          }
        } else {
          controlCharsRemoved += 1;
        }
        continue;
      }
      output2.push(String.fromCharCode(byte));
    }
    return {
      text: output2.join(""),
      nulBytesRemoved,
      invalidBytesRemoved,
      controlCharsRemoved
    };
  }
};
|
|
7986
|
+
|
|
7824
7987
|
// src/services/sanitize/runner.ts
|
|
7825
7988
|
import { createReadStream as createReadStream2, createWriteStream as createWriteStream2 } from "fs";
|
|
7826
7989
|
import { mkdir as mkdir7 } from "fs/promises";
|
|
7827
7990
|
import path13 from "path";
|
|
7828
|
-
function
|
|
7829
|
-
|
|
7830
|
-
|
|
7831
|
-
if (chunk[index] === 0) {
|
|
7832
|
-
removed += 1;
|
|
7833
|
-
}
|
|
7991
|
+
/**
 * Writes a string to a writable stream as UTF-8 and honors backpressure.
 *
 * Returns immediately for empty strings and when the stream accepts the
 * write. When `write()` reports backpressure, it waits for `drain`, or rejects
 * on `error`.
 *
 * @param {import("stream").Writable} output2 - Destination stream.
 * @param {string} value - Text to write.
 * @returns {Promise<void>}
 */
async function writeUtf8(output2, value) {
  if (value.length === 0) {
    return;
  }
  if (output2.write(value, "utf8")) {
    return;
  }
  await new Promise((resolve2, reject) => {
    // Whichever event fires first must detach the other listener. Otherwise
    // each backpressure wait leaves one "error" listener behind, and they pile
    // up over the many chunks of a large file.
    const detach = () => {
      output2.off("drain", onDrain);
      output2.off("error", onError);
    };
    const onDrain = () => {
      detach();
      resolve2();
    };
    const onError = (error) => {
      detach();
      reject(error);
    };
    output2.once("drain", onDrain);
    output2.once("error", onError);
  });
}
|
|
8002
|
+
/**
 * Counts the line-feed ("\n") characters in a string.
 *
 * @param {string} value - Text to scan.
 * @returns {number} Number of "\n" occurrences.
 */
function countNewlines(value) {
  let total = 0;
  let position = value.indexOf("\n");
  while (position !== -1) {
    total += 1;
    position = value.indexOf("\n", position + 1);
  }
  return total;
}
|
|
7849
|
-
async function sanitizeDatasetFile(plan, onChunk) {
|
|
8011
|
+
/**
 * Streams one dataset file through the encoding normalizer and writes the
 * sanitized text as UTF-8 to `plan.outputPath`.
 *
 * @param {object} plan - File plan; this function reads `absolutePath`,
 *   `outputPath` and `fileSize` from it and returns it in the result.
 * @param {(progress: object) => void} [onChunk] - Called after each input
 *   chunk with running byte, row and removal counters.
 * @param {{ sourceEncoding?: string }} [options] - `sourceEncoding` is
 *   resolved through normalizeSanitizeSourceEncoding.
 * @returns {Promise<object>} Totals for the file plus a `changed` flag.
 * @throws Propagates read errors, write errors and encoding validation errors.
 */
async function sanitizeDatasetFile(plan, onChunk, options = {}) {
  await mkdir7(path13.dirname(plan.outputPath), { recursive: true });
  const sourceEncoding = normalizeSanitizeSourceEncoding(
    options.sourceEncoding
  );
  const normalizer = new SanitizeEncodingNormalizer(sourceEncoding);
  const input2 = createReadStream2(plan.absolutePath);
  const output2 = createWriteStream2(plan.outputPath, { encoding: "utf8" });
  // Keep an "error" listener attached for the whole lifetime of the output
  // stream. Without it, a failure that surfaces between writes (open error,
  // disk full after an accepted write) is an unhandled "error" event.
  let outputError;
  output2.on("error", (error) => {
    if (outputError === void 0) {
      outputError = error;
    }
  });
  let totalBytesRead = 0;
  let totalBytesWritten = 0;
  let nulBytesRemoved = 0;
  let invalidBytesRemoved = 0;
  let controlCharsRemoved = 0;
  let lineCount = 0;
  let sawAnyCharacter = false;
  let lastCharacterWasNewline = false;
  const processText = async (text) => {
    // Writing to a stream that already failed would wait for a "drain" that
    // never comes, so fail fast with the recorded error.
    if (outputError !== void 0) {
      throw outputError;
    }
    if (text.length === 0) {
      return;
    }
    sawAnyCharacter = true;
    lineCount += countNewlines(text);
    lastCharacterWasNewline = text.endsWith("\n");
    totalBytesWritten += Buffer.byteLength(text, "utf8");
    await writeUtf8(output2, text);
  };
  try {
    for await (const chunk of input2) {
      const chunkBuffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
      totalBytesRead += chunkBuffer.length;
      const normalized = normalizer.normalizeChunk(chunkBuffer);
      nulBytesRemoved += normalized.nulBytesRemoved;
      invalidBytesRemoved += normalized.invalidBytesRemoved;
      controlCharsRemoved += normalized.controlCharsRemoved;
      await processText(normalized.text);
      onChunk?.({
        bytesProcessed: chunkBuffer.length,
        fileBytesProcessed: totalBytesRead,
        currentFileSize: plan.fileSize,
        processedRows: lineCount,
        nulBytesRemoved,
        invalidBytesRemoved,
        controlCharsRemoved
      });
    }
    const flushed = normalizer.flush();
    nulBytesRemoved += flushed.nulBytesRemoved;
    invalidBytesRemoved += flushed.invalidBytesRemoved;
    controlCharsRemoved += flushed.controlCharsRemoved;
    await processText(flushed.text);
    // A final line without a trailing newline still counts as a row.
    if (sawAnyCharacter && !lastCharacterWasNewline) {
      lineCount += 1;
    }
  } finally {
    input2.close();
    // A stream destroyed by an earlier error emits neither "finish" nor
    // another "error", so waiting for those would never settle. Wait for
    // "close" instead, and only when the stream is still alive.
    if (!output2.destroyed) {
      await new Promise((resolve2) => {
        output2.once("close", () => resolve2());
        output2.end();
      });
    }
  }
  // Reached only when the try block succeeded; report a failure that
  // happened while flushing or closing the output.
  if (outputError !== void 0) {
    throw outputError;
  }
  return {
    plan,
    totalBytesRead,
    totalBytesWritten,
    sourceEncoding,
    nulBytesRemoved,
    invalidBytesRemoved,
    controlCharsRemoved,
    lineCount,
    changed: nulBytesRemoved > 0 || invalidBytesRemoved > 0 || controlCharsRemoved > 0 || totalBytesRead !== totalBytesWritten
  };
}
|
|
7901
8084
|
|
|
@@ -7958,40 +8141,54 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
|
|
|
7958
8141
|
"No recognized validated dataset files were found for sanitization."
|
|
7959
8142
|
);
|
|
7960
8143
|
}
|
|
8144
|
+
const sourceEncoding = normalizeSanitizeSourceEncoding(
|
|
8145
|
+
options.sourceEncoding
|
|
8146
|
+
);
|
|
7961
8147
|
options.onProgress?.({
|
|
7962
8148
|
kind: "start",
|
|
7963
8149
|
validatedPath,
|
|
7964
8150
|
outputPath,
|
|
7965
8151
|
totalFiles: plan.totalFiles,
|
|
7966
8152
|
totalBytes: plan.totalBytes,
|
|
7967
|
-
datasets: plan.datasets
|
|
8153
|
+
datasets: plan.datasets,
|
|
8154
|
+
sourceEncoding
|
|
7968
8155
|
});
|
|
7969
8156
|
let processedFiles = 0;
|
|
7970
8157
|
let processedRows = 0;
|
|
7971
8158
|
let processedBytes = 0;
|
|
7972
8159
|
let nulBytesRemoved = 0;
|
|
8160
|
+
let invalidBytesRemoved = 0;
|
|
8161
|
+
let controlCharsRemoved = 0;
|
|
7973
8162
|
let changedFiles = 0;
|
|
7974
8163
|
const fileSummaries = [];
|
|
7975
8164
|
for (const [index, filePlan] of plan.files.entries()) {
|
|
7976
|
-
const fileResult = await sanitizeDatasetFile(
|
|
7977
|
-
|
|
7978
|
-
|
|
7979
|
-
|
|
7980
|
-
|
|
7981
|
-
|
|
7982
|
-
|
|
7983
|
-
|
|
7984
|
-
|
|
7985
|
-
|
|
7986
|
-
|
|
7987
|
-
|
|
7988
|
-
|
|
7989
|
-
|
|
7990
|
-
|
|
8165
|
+
const fileResult = await sanitizeDatasetFile(
|
|
8166
|
+
filePlan,
|
|
8167
|
+
(chunk) => {
|
|
8168
|
+
options.onProgress?.({
|
|
8169
|
+
kind: "progress",
|
|
8170
|
+
currentFileDisplayPath: filePlan.displayPath,
|
|
8171
|
+
fileIndex: index + 1,
|
|
8172
|
+
totalFiles: plan.totalFiles,
|
|
8173
|
+
bytesProcessed: processedBytes + chunk.fileBytesProcessed,
|
|
8174
|
+
totalBytes: plan.totalBytes,
|
|
8175
|
+
fileBytesProcessed: chunk.fileBytesProcessed,
|
|
8176
|
+
currentFileSize: chunk.currentFileSize,
|
|
8177
|
+
processedRows: processedRows + chunk.processedRows,
|
|
8178
|
+
nulBytesRemoved: nulBytesRemoved + chunk.nulBytesRemoved,
|
|
8179
|
+
invalidBytesRemoved: invalidBytesRemoved + chunk.invalidBytesRemoved,
|
|
8180
|
+
controlCharsRemoved: controlCharsRemoved + chunk.controlCharsRemoved,
|
|
8181
|
+
changedFiles
|
|
8182
|
+
});
|
|
8183
|
+
},
|
|
8184
|
+
{ sourceEncoding }
|
|
8185
|
+
);
|
|
7991
8186
|
processedFiles += 1;
|
|
7992
8187
|
processedRows += fileResult.lineCount;
|
|
7993
8188
|
processedBytes += fileResult.totalBytesRead;
|
|
7994
8189
|
nulBytesRemoved += fileResult.nulBytesRemoved;
|
|
8190
|
+
invalidBytesRemoved += fileResult.invalidBytesRemoved;
|
|
8191
|
+
controlCharsRemoved += fileResult.controlCharsRemoved;
|
|
7995
8192
|
changedFiles += fileResult.changed ? 1 : 0;
|
|
7996
8193
|
fileSummaries.push({
|
|
7997
8194
|
dataset: filePlan.dataset,
|
|
@@ -7999,7 +8196,9 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
|
|
|
7999
8196
|
outputPath: filePlan.outputPath,
|
|
8000
8197
|
lineCount: fileResult.lineCount,
|
|
8001
8198
|
changed: fileResult.changed,
|
|
8002
|
-
nulBytesRemoved: fileResult.nulBytesRemoved
|
|
8199
|
+
nulBytesRemoved: fileResult.nulBytesRemoved,
|
|
8200
|
+
invalidBytesRemoved: fileResult.invalidBytesRemoved,
|
|
8201
|
+
controlCharsRemoved: fileResult.controlCharsRemoved
|
|
8003
8202
|
});
|
|
8004
8203
|
}
|
|
8005
8204
|
options.onProgress?.({
|
|
@@ -8007,6 +8206,8 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
|
|
|
8007
8206
|
totalFiles: plan.totalFiles,
|
|
8008
8207
|
processedRows,
|
|
8009
8208
|
nulBytesRemoved,
|
|
8209
|
+
invalidBytesRemoved,
|
|
8210
|
+
controlCharsRemoved,
|
|
8010
8211
|
changedFiles,
|
|
8011
8212
|
totalBytes: plan.totalBytes
|
|
8012
8213
|
});
|
|
@@ -8018,13 +8219,17 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
|
|
|
8018
8219
|
totalBytes: plan.totalBytes,
|
|
8019
8220
|
processedFiles,
|
|
8020
8221
|
processedRows,
|
|
8222
|
+
sourceEncoding,
|
|
8021
8223
|
nulBytesRemoved,
|
|
8224
|
+
invalidBytesRemoved,
|
|
8225
|
+
controlCharsRemoved,
|
|
8022
8226
|
changedFiles,
|
|
8023
8227
|
unchangedFiles: plan.totalFiles - changedFiles,
|
|
8024
8228
|
datasets: plan.datasets,
|
|
8025
8229
|
files: fileSummaries,
|
|
8026
8230
|
warnings: [
|
|
8027
|
-
"Sanitization
|
|
8231
|
+
"Sanitization now writes UTF-8 output and removes invalid bytes plus problematic control characters before PostgreSQL loading begins.",
|
|
8232
|
+
"The PostgreSQL direct import path can use --source-encoding UTF8 when reading files generated by this sanitization command.",
|
|
8028
8233
|
"The import command still keeps quarantine and row-level recovery for unexpected issues, but sanitizing first reduces the amount of slow fallback work during import."
|
|
8029
8234
|
],
|
|
8030
8235
|
nextStep: inferNextStep3(outputPath)
|
|
@@ -8174,6 +8379,18 @@ var STAGING_TABLE_BY_DATASET3 = {
|
|
|
8174
8379
|
partners: "staging_partners",
|
|
8175
8380
|
simples_options: "staging_simples_options"
|
|
8176
8381
|
};
|
|
8382
|
+
// Names of the generated script steps, in a fixed order.
// NOTE(review): presumably the execution order of the steps; the consumer of
// this list is not visible here.
var STEP_ORDER = [
  "setup",
  // load steps
  "load-domains", "load-companies", "load-establishments", "load-partners", "load-simples",
  // materialization steps
  "materialize", "materialize-secondary-cnaes",
  // post-load steps
  "indexes", "analyze"
];
|
|
8177
8394
|
function quoteSqlLiteral(value) {
|
|
8178
8395
|
return `'${value.replace(/'/g, "''")}'`;
|
|
8179
8396
|
}
|
|
@@ -8191,6 +8408,9 @@ function receitaCopyCommand(tableName, columns, filePath) {
|
|
|
8191
8408
|
const normalizedFilePath = normalizePathForPsql(filePath);
|
|
8192
8409
|
return `\\copy ${tableName} (${columns.join(", ")}) from ${quoteSqlLiteral(normalizedFilePath)} with (format csv, header false, delimiter ';', quote '"', escape '"')`;
|
|
8193
8410
|
}
|
|
8411
|
+
/**
 * Builds a psql `\echo` meta-command that prints `message`.
 *
 * psql meta-command arguments are not SQL literals. Inside single quotes psql
 * interprets backslash escapes (`\n`, `\t`, octal/hex, ...), and the command
 * ends at the first newline. So the message is:
 *   - collapsed onto one line (runs of CR/LF become a single space),
 *   - backslash-escaped (`\` -> `\\`), so Windows-style paths print verbatim,
 *   - quote-escaped (`'` -> `''`).
 *
 * @param {string} message - Text to print while the script runs.
 * @returns {string} One `\echo '...'` line.
 */
function echo(message) {
  const singleLine = String(message).replace(/[\r\n]+/g, " ");
  const escaped = singleLine.replace(/\\/g, "\\\\").replace(/'/g, "''");
  return `\\echo '${escaped}'`;
}
|
|
8194
8414
|
function datasetColumns(dataset) {
|
|
8195
8415
|
return DATASET_LAYOUTS[dataset].fields.map((field) => field.columnName);
|
|
8196
8416
|
}
|
|
@@ -8217,7 +8437,7 @@ function partnerDedupeExpression(alias) {
|
|
|
8217
8437
|
function materializeCompaniesSql() {
|
|
8218
8438
|
const columns = companiesLayout.fields.map((field) => field.columnName);
|
|
8219
8439
|
return [
|
|
8220
|
-
"
|
|
8440
|
+
echo("[materialize] Materializing companies..."),
|
|
8221
8441
|
"with source as (",
|
|
8222
8442
|
" select",
|
|
8223
8443
|
` ${columns.map((column) => `source.${column}`).join(",\n ")},`,
|
|
@@ -8231,7 +8451,8 @@ function materializeCompaniesSql() {
|
|
|
8231
8451
|
`select ${columns.join(", ")}`,
|
|
8232
8452
|
"from deduped",
|
|
8233
8453
|
"on conflict (cnpj_root) do update set",
|
|
8234
|
-
` ${updateAssignments(columns, ["cnpj_root"])}
|
|
8454
|
+
` ${updateAssignments(columns, ["cnpj_root"])};`,
|
|
8455
|
+
echo("[materialize] Companies materialization completed.")
|
|
8235
8456
|
].join("\n");
|
|
8236
8457
|
}
|
|
8237
8458
|
function materializeEstablishmentsSql() {
|
|
@@ -8240,7 +8461,7 @@ function materializeEstablishmentsSql() {
|
|
|
8240
8461
|
);
|
|
8241
8462
|
const insertColumns = [...baseColumns, "cnpj_full"];
|
|
8242
8463
|
return [
|
|
8243
|
-
"
|
|
8464
|
+
echo("[materialize] Materializing establishments..."),
|
|
8244
8465
|
"with source as (",
|
|
8245
8466
|
" select",
|
|
8246
8467
|
` ${baseColumns.map((column) => `source.${column}`).join(",\n ")},`,
|
|
@@ -8250,14 +8471,29 @@ function materializeEstablishmentsSql() {
|
|
|
8250
8471
|
"),",
|
|
8251
8472
|
"deduped as (",
|
|
8252
8473
|
" select * from source where dedupe_rank = 1",
|
|
8474
|
+
")",
|
|
8475
|
+
`insert into establishments (${insertColumns.join(", ")})`,
|
|
8476
|
+
`select ${insertColumns.join(", ")}`,
|
|
8477
|
+
"from deduped",
|
|
8478
|
+
"on conflict (cnpj_full) do update set",
|
|
8479
|
+
` ${updateAssignments(insertColumns, ["cnpj_root", "cnpj_order", "cnpj_check_digits", "cnpj_full"])};`,
|
|
8480
|
+
echo("[materialize] Establishments materialization completed.")
|
|
8481
|
+
].join("\n");
|
|
8482
|
+
}
|
|
8483
|
+
function materializeSecondaryCnaesSql() {
|
|
8484
|
+
return [
|
|
8485
|
+
echo(
|
|
8486
|
+
"[materialize-secondary-cnaes] Materializing establishment secondary CNAEs..."
|
|
8487
|
+
),
|
|
8488
|
+
"with source as (",
|
|
8489
|
+
" select",
|
|
8490
|
+
" staging.cnpj_root || staging.cnpj_order || staging.cnpj_check_digits as cnpj_full,",
|
|
8491
|
+
" staging.secondary_cnaes_raw,",
|
|
8492
|
+
" row_number() over (partition by staging.cnpj_root || staging.cnpj_order || staging.cnpj_check_digits order by staging.staging_id desc) as dedupe_rank",
|
|
8493
|
+
" from staging_establishments staging",
|
|
8253
8494
|
"),",
|
|
8254
|
-
"
|
|
8255
|
-
|
|
8256
|
-
` select ${insertColumns.join(", ")}`,
|
|
8257
|
-
" from deduped",
|
|
8258
|
-
" on conflict (cnpj_full) do update set",
|
|
8259
|
-
` ${updateAssignments(insertColumns, ["cnpj_root", "cnpj_order", "cnpj_check_digits", "cnpj_full"])}`,
|
|
8260
|
-
" returning cnpj_full",
|
|
8495
|
+
"deduped as (",
|
|
8496
|
+
" select * from source where dedupe_rank = 1",
|
|
8261
8497
|
"),",
|
|
8262
8498
|
"deleted_secondary_cnaes as (",
|
|
8263
8499
|
" delete from establishment_secondary_cnaes target",
|
|
@@ -8278,14 +8514,17 @@ function materializeEstablishmentsSql() {
|
|
|
8278
8514
|
"insert into establishment_secondary_cnaes (cnpj_full, cnae_code)",
|
|
8279
8515
|
"select cnpj_full, cnae_code",
|
|
8280
8516
|
"from secondary_cnaes_source",
|
|
8281
|
-
"on conflict (cnpj_full, cnae_code) do nothing;"
|
|
8517
|
+
"on conflict (cnpj_full, cnae_code) do nothing;",
|
|
8518
|
+
echo(
|
|
8519
|
+
"[materialize-secondary-cnaes] Secondary CNAEs materialization completed."
|
|
8520
|
+
)
|
|
8282
8521
|
].join("\n");
|
|
8283
8522
|
}
|
|
8284
8523
|
function materializePartnersSql() {
|
|
8285
8524
|
const baseColumns = partnersLayout.fields.map((field) => field.columnName);
|
|
8286
8525
|
const insertColumns = [...baseColumns, "partner_dedupe_key"];
|
|
8287
8526
|
return [
|
|
8288
|
-
"
|
|
8527
|
+
echo("[materialize] Materializing partners..."),
|
|
8289
8528
|
"with source as (",
|
|
8290
8529
|
" select",
|
|
8291
8530
|
` ${baseColumns.map((column) => `source.${column}`).join(",\n ")},`,
|
|
@@ -8305,13 +8544,14 @@ function materializePartnersSql() {
|
|
|
8305
8544
|
`select ${insertColumns.join(", ")}`,
|
|
8306
8545
|
"from deduped",
|
|
8307
8546
|
"on conflict (partner_dedupe_key) do update set",
|
|
8308
|
-
` ${updateAssignments(insertColumns, ["partner_dedupe_key"])}
|
|
8547
|
+
` ${updateAssignments(insertColumns, ["partner_dedupe_key"])};`,
|
|
8548
|
+
echo("[materialize] Partners materialization completed.")
|
|
8309
8549
|
].join("\n");
|
|
8310
8550
|
}
|
|
8311
8551
|
function materializeSimplesSql() {
|
|
8312
8552
|
const columns = simplesLayout.fields.map((field) => field.columnName);
|
|
8313
8553
|
return [
|
|
8314
|
-
"
|
|
8554
|
+
echo("[materialize] Materializing simples options..."),
|
|
8315
8555
|
"with source as (",
|
|
8316
8556
|
" select",
|
|
8317
8557
|
` ${columns.map((column) => `source.${column}`).join(",\n ")},`,
|
|
@@ -8325,7 +8565,8 @@ function materializeSimplesSql() {
|
|
|
8325
8565
|
`select ${columns.join(", ")}`,
|
|
8326
8566
|
"from deduped",
|
|
8327
8567
|
"on conflict (cnpj_root) do update set",
|
|
8328
|
-
` ${updateAssignments(columns, ["cnpj_root"])}
|
|
8568
|
+
` ${updateAssignments(columns, ["cnpj_root"])};`,
|
|
8569
|
+
echo("[materialize] Simples options materialization completed.")
|
|
8329
8570
|
].join("\n");
|
|
8330
8571
|
}
|
|
8331
8572
|
function copyDomainSql(dataset, files) {
|
|
@@ -8335,12 +8576,20 @@ function copyDomainSql(dataset, files) {
|
|
|
8335
8576
|
const columns = datasetColumns(dataset);
|
|
8336
8577
|
const tempTable = `tmp_hybrid_${dataset}`;
|
|
8337
8578
|
const lines = [
|
|
8338
|
-
|
|
8579
|
+
echo(`[load-domains] Loading ${dataset} lookup data...`),
|
|
8339
8580
|
`drop table if exists ${tempTable};`,
|
|
8340
8581
|
`create temporary table ${tempTable} (code text, description text);`
|
|
8341
8582
|
];
|
|
8342
|
-
for (const file of files) {
|
|
8343
|
-
lines.push(
|
|
8583
|
+
for (const [index, file] of files.entries()) {
|
|
8584
|
+
lines.push(
|
|
8585
|
+
echo(
|
|
8586
|
+
`[load-domains] Loading ${dataset} file ${index + 1} of ${files.length}: ${file.relativePath}`
|
|
8587
|
+
),
|
|
8588
|
+
csvCopyCommand(tempTable, columns, file.absolutePath),
|
|
8589
|
+
echo(
|
|
8590
|
+
`[load-domains] Loaded ${dataset} file ${index + 1} of ${files.length}.`
|
|
8591
|
+
)
|
|
8592
|
+
);
|
|
8344
8593
|
}
|
|
8345
8594
|
lines.push(
|
|
8346
8595
|
`insert into ${dataset} (${columns.join(", ")})`,
|
|
@@ -8361,12 +8610,17 @@ function copyStagingSql(dataset, files) {
|
|
|
8361
8610
|
return [];
|
|
8362
8611
|
}
|
|
8363
8612
|
const columns = datasetColumns(dataset);
|
|
8364
|
-
|
|
8365
|
-
|
|
8366
|
-
|
|
8367
|
-
(
|
|
8368
|
-
|
|
8369
|
-
|
|
8613
|
+
const lines = [echo(`[load-${dataset}] Loading ${dataset} staging data...`)];
|
|
8614
|
+
for (const [index, file] of files.entries()) {
|
|
8615
|
+
lines.push(
|
|
8616
|
+
echo(
|
|
8617
|
+
`[load-${dataset}] Loading file ${index + 1} of ${files.length}: ${file.relativePath}`
|
|
8618
|
+
),
|
|
8619
|
+
csvCopyCommand(tableName, columns, file.absolutePath),
|
|
8620
|
+
echo(`[load-${dataset}] Loaded file ${index + 1} of ${files.length}.`)
|
|
8621
|
+
);
|
|
8622
|
+
}
|
|
8623
|
+
return lines;
|
|
8370
8624
|
}
|
|
8371
8625
|
function csvFilesByDataset(files) {
|
|
8372
8626
|
const grouped = {};
|
|
@@ -8392,7 +8646,9 @@ function rawTableName(dataset) {
|
|
|
8392
8646
|
function createRawTempTableSql(dataset) {
|
|
8393
8647
|
const columns = DATASET_LAYOUTS[dataset].fields.map((field) => ` ${quoteIdentifier(field.columnName)} text`).join(",\n");
|
|
8394
8648
|
return [
|
|
8649
|
+
"set client_min_messages to warning;",
|
|
8395
8650
|
`drop table if exists ${rawTableName(dataset)};`,
|
|
8651
|
+
"reset client_min_messages;",
|
|
8396
8652
|
`create temporary table ${rawTableName(dataset)} (`,
|
|
8397
8653
|
columns,
|
|
8398
8654
|
");"
|
|
@@ -8474,11 +8730,21 @@ function rawDomainSql(dataset, files) {
|
|
|
8474
8730
|
const columns = layout.fields.map((field) => field.columnName);
|
|
8475
8731
|
const tableName = rawTableName(dataset);
|
|
8476
8732
|
const lines = [
|
|
8477
|
-
|
|
8733
|
+
echo(
|
|
8734
|
+
`[load-domains] Loading ${dataset} lookup data directly from sanitized Receita files...`
|
|
8735
|
+
),
|
|
8478
8736
|
createRawTempTableSql(dataset)
|
|
8479
8737
|
];
|
|
8480
|
-
for (const file of files) {
|
|
8481
|
-
lines.push(
|
|
8738
|
+
for (const [index, file] of files.entries()) {
|
|
8739
|
+
lines.push(
|
|
8740
|
+
echo(
|
|
8741
|
+
`[load-domains] Loading ${dataset} file ${index + 1} of ${files.length}: ${file.relativePath}`
|
|
8742
|
+
),
|
|
8743
|
+
receitaCopyCommand(tableName, columns, file.absolutePath),
|
|
8744
|
+
echo(
|
|
8745
|
+
`[load-domains] Loaded ${dataset} file ${index + 1} of ${files.length}.`
|
|
8746
|
+
)
|
|
8747
|
+
);
|
|
8482
8748
|
}
|
|
8483
8749
|
lines.push(
|
|
8484
8750
|
`insert into ${dataset} (${columns.join(", ")})`,
|
|
@@ -8488,7 +8754,8 @@ function rawDomainSql(dataset, files) {
|
|
|
8488
8754
|
`from ${tableName}`,
|
|
8489
8755
|
"where nullif(btrim(code), '') is not null",
|
|
8490
8756
|
"order by code",
|
|
8491
|
-
"on conflict (code) do update set description = excluded.description;"
|
|
8757
|
+
"on conflict (code) do update set description = excluded.description;",
|
|
8758
|
+
echo(`[load-domains] ${dataset} lookup data completed.`)
|
|
8492
8759
|
);
|
|
8493
8760
|
return lines;
|
|
8494
8761
|
}
|
|
@@ -8507,70 +8774,363 @@ function rawStagingSql(dataset, files) {
|
|
|
8507
8774
|
const expressions = layout.fields.map(
|
|
8508
8775
|
(field) => ` ${fieldExpression(dataset, field, alias)} as ${field.columnName}`
|
|
8509
8776
|
);
|
|
8777
|
+
const stepName = loadStepName(dataset);
|
|
8510
8778
|
const lines = [
|
|
8511
|
-
|
|
8779
|
+
echo(
|
|
8780
|
+
`[${stepName}] Loading ${dataset} staging data directly from sanitized Receita files...`
|
|
8781
|
+
),
|
|
8782
|
+
`truncate table ${targetTable} restart identity;`,
|
|
8512
8783
|
createRawTempTableSql(dataset)
|
|
8513
8784
|
];
|
|
8514
|
-
for (const file of files) {
|
|
8515
|
-
lines.push(
|
|
8785
|
+
for (const [index, file] of files.entries()) {
|
|
8786
|
+
lines.push(
|
|
8787
|
+
echo(
|
|
8788
|
+
`[${stepName}] Loading file ${index + 1} of ${files.length}: ${file.relativePath}`
|
|
8789
|
+
),
|
|
8790
|
+
receitaCopyCommand(tableName, columns, file.absolutePath),
|
|
8791
|
+
echo(`[${stepName}] Loaded file ${index + 1} of ${files.length}.`)
|
|
8792
|
+
);
|
|
8516
8793
|
}
|
|
8517
8794
|
lines.push(
|
|
8795
|
+
echo(
|
|
8796
|
+
`[${stepName}] Transforming ${dataset} raw rows into ${targetTable}...`
|
|
8797
|
+
),
|
|
8518
8798
|
`insert into ${targetTable} (${columns.join(", ")})`,
|
|
8519
8799
|
"select",
|
|
8520
8800
|
expressions.join(",\n"),
|
|
8521
|
-
`from ${tableName} ${alias}
|
|
8801
|
+
`from ${tableName} ${alias};`,
|
|
8802
|
+
echo(`[${stepName}] ${dataset} staging load completed.`)
|
|
8522
8803
|
);
|
|
8523
8804
|
return lines;
|
|
8524
8805
|
}
|
|
8525
|
-
function
|
|
8526
|
-
|
|
8527
|
-
|
|
8528
|
-
|
|
8529
|
-
|
|
8530
|
-
|
|
8531
|
-
|
|
8532
|
-
|
|
8806
|
+
/**
 * Returns the step name used for a dataset's load phase.
 *
 * "simples_options" is the only dataset whose step name is not `load-<dataset>`;
 * it maps to "load-simples". "companies", "establishments", "partners" and any
 * other dataset resolve to `load-<dataset>`.
 *
 * @param {string} dataset - Dataset identifier.
 * @returns {string} Step name.
 */
function loadStepName(dataset) {
  return dataset === "simples_options" ? "load-simples" : `load-${dataset}`;
}
|
|
8820
|
+
/**
 * Builds the opening lines of a generated psql script: a title comment, a
 * generator comment and `\set ON_ERROR_STOP on`. When a source encoding is
 * given, it also adds an echo line and a `set client_encoding` statement.
 * The result always ends with an empty line.
 *
 * @param {string} title - Text for the leading SQL comment.
 * @param {string} [sourceEncoding] - Encoding to announce and set, if any.
 * @returns {string[]} Header lines.
 */
function scriptHeader(title, sourceEncoding) {
  const header = [
    `-- ${title}`,
    "-- Generated by cnpj-db-loader postgres generate-script.",
    "\\set ON_ERROR_STOP on"
  ];
  if (sourceEncoding) {
    header.push(
      echo(
        `Using source file encoding ${sourceEncoding} for psql copy operations...`
      ),
      `set client_encoding to ${quoteSqlLiteral(sourceEncoding)};`
    );
  }
  header.push("");
  return header;
}
|
|
8834
|
+
/**
 * Wraps script lines in `begin;` / `commit;` when wrapping is requested and
 * the transaction mode is "phase"; otherwise returns a shallow copy of the
 * lines unchanged.
 *
 * @param {string[]} lines - Script body.
 * @param {string} mode - Transaction mode; only "phase" enables wrapping.
 * @param {boolean} shouldWrap - Whether this script may be wrapped.
 * @returns {string[]} A new array; `lines` is never mutated.
 */
function wrapTransaction(lines, mode, shouldWrap) {
  const usePhaseTransaction = shouldWrap && mode === "phase";
  return usePhaseTransaction ? ["begin;", "", ...lines, "", "commit;"] : [...lines];
}
|
|
8840
|
+
/**
 * Assembles the full text of one step script: the header, then the body
 * (optionally wrapped in a phase transaction), then a trailing newline.
 *
 * @param {string} title - Title passed to scriptHeader.
 * @param {string[]} body - Script statements for the step.
 * @param {object} input2 - Generation input; `sourceEncoding` and
 *   `transactionMode` are read from it.
 * @param {boolean} wrapInPhaseTransaction - Whether the body may be wrapped.
 * @returns {string} Script text joined with "\n".
 */
function buildStepScript(title, body, input2, wrapInPhaseTransaction) {
  const headerLines = scriptHeader(title, input2.sourceEncoding);
  const bodyLines = wrapTransaction(
    body,
    input2.transactionMode,
    wrapInPhaseTransaction
  );
  const scriptLines = [...headerLines, ...bodyLines, ""];
  return scriptLines.join("\n");
}
|
|
8847
|
+
/**
 * Builds the set of selected items from `input2.include`, then removes
 * "indexes" when `input2.skipIndexes` is truthy and "analyze" when
 * `input2.skipAnalyze` is truthy.
 *
 * @param {{ include?: Iterable<string>, skipIndexes?: boolean, skipAnalyze?: boolean }} input2
 * @returns {Set<string>} A new set; `input2.include` is not modified.
 */
function includeSet(input2) {
  const chosen = new Set(input2.include);
  const skipRules = [
    ["skipIndexes", "indexes"],
    ["skipAnalyze", "analyze"]
  ];
  for (const [flag, entry] of skipRules) {
    if (input2[flag]) {
      chosen.delete(entry);
    }
  }
  return chosen;
}
|
|
8857
|
+
/**
 * Tells whether at least one of "companies", "establishments", "partners" or
 * "simples" is present in the selection.
 *
 * @param {Set<string>} selected - Selected items.
 * @returns {boolean}
 */
function hasAnyFinalMaterialization(selected) {
  const finalTables = ["companies", "establishments", "partners", "simples"];
  return finalTables.some((name) => selected.has(name));
}
|
|
8555
|
-
function
|
|
8860
|
+
/**
 * Builds the script lines for the materialize step. It emits a start echo,
 * then one SQL block followed by an empty line for each selected item, in the
 * order companies, establishments, partners, simples, then a completion echo.
 *
 * @param {Set<string>} selected - Selected items.
 * @returns {string[]} Script lines.
 */
function materializeSql(selected) {
  const builders = [
    ["companies", materializeCompaniesSql],
    ["establishments", materializeEstablishmentsSql],
    ["partners", materializePartnersSql],
    ["simples", materializeSimplesSql]
  ];
  const script = [echo("[materialize] Starting final table materialization...")];
  for (const [key, buildSql] of builders) {
    if (selected.has(key)) {
      script.push(buildSql(), "");
    }
  }
  script.push(echo("[materialize] Final table materialization completed."));
  return script;
}
|
|
8877
|
+
function indexesSql() {
|
|
8878
|
+
return [
|
|
8879
|
+
echo(
|
|
8880
|
+
"[indexes] No additional index operations are generated in this beta."
|
|
8881
|
+
),
|
|
8882
|
+
"-- Indexes are expected to be managed by the schema generated by cnpj-db-loader schema generate.",
|
|
8883
|
+
"-- A future fast-rebuild mode may generate DROP/CREATE INDEX operations here."
|
|
8884
|
+
];
|
|
8885
|
+
}
|
|
8886
|
+
function analyzeSql(selected) {
|
|
8887
|
+
const tables = /* @__PURE__ */ new Set();
|
|
8888
|
+
if (selected.has("companies")) {
|
|
8889
|
+
tables.add("companies");
|
|
8890
|
+
}
|
|
8891
|
+
if (selected.has("establishments")) {
|
|
8892
|
+
tables.add("establishments");
|
|
8893
|
+
}
|
|
8894
|
+
if (selected.has("secondary-cnaes")) {
|
|
8895
|
+
tables.add("establishment_secondary_cnaes");
|
|
8896
|
+
}
|
|
8897
|
+
if (selected.has("partners")) {
|
|
8898
|
+
tables.add("partners");
|
|
8899
|
+
}
|
|
8900
|
+
if (selected.has("simples")) {
|
|
8901
|
+
tables.add("simples_options");
|
|
8902
|
+
}
|
|
8903
|
+
if (selected.has("domains")) {
|
|
8904
|
+
for (const dataset of DOMAIN_DATASETS) {
|
|
8905
|
+
tables.add(dataset);
|
|
8906
|
+
}
|
|
8907
|
+
}
|
|
8908
|
+
return [
|
|
8909
|
+
echo("[analyze] Refreshing planner statistics..."),
|
|
8910
|
+
...[...tables].map((table) => `analyze ${table};`),
|
|
8911
|
+
echo("[analyze] Planner statistics refreshed.")
|
|
8912
|
+
];
|
|
8913
|
+
}
|
|
8914
|
+
function step(name, file, dependsOn, included) {
|
|
8915
|
+
return { name, file, dependsOn, included };
|
|
8916
|
+
}
|
|
8917
|
+
function generatePostgresDirectScriptFiles(input2) {
|
|
8556
8918
|
const grouped = directFilesByDataset(input2.files);
|
|
8557
|
-
const
|
|
8558
|
-
|
|
8919
|
+
const selected = includeSet(input2);
|
|
8920
|
+
if (!DOMAIN_DATASETS.some((dataset) => (grouped[dataset] ?? []).length > 0)) {
|
|
8921
|
+
selected.delete("domains");
|
|
8922
|
+
}
|
|
8923
|
+
if ((grouped.companies ?? []).length === 0) {
|
|
8924
|
+
selected.delete("companies");
|
|
8925
|
+
}
|
|
8926
|
+
if ((grouped.establishments ?? []).length === 0) {
|
|
8927
|
+
selected.delete("establishments");
|
|
8928
|
+
selected.delete("secondary-cnaes");
|
|
8929
|
+
}
|
|
8930
|
+
if ((grouped.partners ?? []).length === 0) {
|
|
8931
|
+
selected.delete("partners");
|
|
8932
|
+
}
|
|
8933
|
+
if ((grouped.simples_options ?? []).length === 0) {
|
|
8934
|
+
selected.delete("simples");
|
|
8935
|
+
}
|
|
8936
|
+
const scripts = {};
|
|
8937
|
+
const steps = [];
|
|
8938
|
+
const setupIncluded = true;
|
|
8939
|
+
steps.push(step("setup", "setup.sql", [], setupIncluded));
|
|
8940
|
+
scripts["setup.sql"] = [
|
|
8941
|
+
...scriptHeader(
|
|
8942
|
+
"CNPJ DB Loader PostgreSQL direct import setup",
|
|
8943
|
+
input2.sourceEncoding
|
|
8944
|
+
),
|
|
8945
|
+
echo("[setup] Preparing PostgreSQL direct import session..."),
|
|
8946
|
+
"-- The database schema must be applied before running these scripts.",
|
|
8947
|
+
"-- This setup script configures the psql session used by the generated orchestrator.",
|
|
8948
|
+
echo("[setup] Setup completed."),
|
|
8949
|
+
""
|
|
8950
|
+
].join("\n");
|
|
8951
|
+
const domainsIncluded = selected.has("domains") && DOMAIN_DATASETS.some((dataset) => (grouped[dataset] ?? []).length > 0);
|
|
8952
|
+
steps.push(
|
|
8953
|
+
step("load-domains", "load-domains.sql", ["setup"], domainsIncluded)
|
|
8954
|
+
);
|
|
8955
|
+
if (domainsIncluded) {
|
|
8956
|
+
const lines = [echo("[load-domains] Starting domain tables load...")];
|
|
8957
|
+
for (const dataset of DOMAIN_DATASETS) {
|
|
8958
|
+
lines.push(...rawDomainSql(dataset, grouped[dataset] ?? []), "");
|
|
8959
|
+
}
|
|
8960
|
+
lines.push(echo("[load-domains] Domain tables load completed."));
|
|
8961
|
+
scripts["load-domains.sql"] = buildStepScript(
|
|
8962
|
+
"CNPJ DB Loader PostgreSQL direct import domains step",
|
|
8963
|
+
lines,
|
|
8964
|
+
input2,
|
|
8965
|
+
true
|
|
8966
|
+
);
|
|
8967
|
+
}
|
|
8968
|
+
const datasetSteps = [
|
|
8969
|
+
{
|
|
8970
|
+
dataset: "companies",
|
|
8971
|
+
name: "load-companies",
|
|
8972
|
+
file: "load-companies.sql",
|
|
8973
|
+
include: "companies"
|
|
8974
|
+
},
|
|
8975
|
+
{
|
|
8976
|
+
dataset: "establishments",
|
|
8977
|
+
name: "load-establishments",
|
|
8978
|
+
file: "load-establishments.sql",
|
|
8979
|
+
include: "establishments"
|
|
8980
|
+
},
|
|
8981
|
+
{
|
|
8982
|
+
dataset: "partners",
|
|
8983
|
+
name: "load-partners",
|
|
8984
|
+
file: "load-partners.sql",
|
|
8985
|
+
include: "partners"
|
|
8986
|
+
},
|
|
8987
|
+
{
|
|
8988
|
+
dataset: "simples_options",
|
|
8989
|
+
name: "load-simples",
|
|
8990
|
+
file: "load-simples.sql",
|
|
8991
|
+
include: "simples"
|
|
8992
|
+
}
|
|
8993
|
+
];
|
|
8994
|
+
for (const item of datasetSteps) {
|
|
8995
|
+
const files = grouped[item.dataset] ?? [];
|
|
8996
|
+
const included = selected.has(item.include) && files.length > 0;
|
|
8997
|
+
steps.push(step(item.name, item.file, ["setup"], included));
|
|
8998
|
+
if (included) {
|
|
8999
|
+
scripts[item.file] = buildStepScript(
|
|
9000
|
+
`CNPJ DB Loader PostgreSQL direct import ${item.name} step`,
|
|
9001
|
+
rawStagingSql(item.dataset, files),
|
|
9002
|
+
input2,
|
|
9003
|
+
true
|
|
9004
|
+
);
|
|
9005
|
+
}
|
|
9006
|
+
}
|
|
9007
|
+
const materializeIncluded = hasAnyFinalMaterialization(selected);
|
|
9008
|
+
steps.push(
|
|
9009
|
+
step(
|
|
9010
|
+
"materialize",
|
|
9011
|
+
"materialize.sql",
|
|
9012
|
+
datasetSteps.filter((item) => selected.has(item.include)).map((item) => item.name),
|
|
9013
|
+
materializeIncluded
|
|
9014
|
+
)
|
|
9015
|
+
);
|
|
9016
|
+
if (materializeIncluded) {
|
|
9017
|
+
scripts["materialize.sql"] = buildStepScript(
|
|
9018
|
+
"CNPJ DB Loader PostgreSQL direct import materialization step",
|
|
9019
|
+
materializeSql(selected),
|
|
9020
|
+
input2,
|
|
9021
|
+
true
|
|
9022
|
+
);
|
|
9023
|
+
}
|
|
9024
|
+
const secondaryIncluded = selected.has("secondary-cnaes") && selected.has("establishments");
|
|
9025
|
+
steps.push(
|
|
9026
|
+
step(
|
|
9027
|
+
"materialize-secondary-cnaes",
|
|
9028
|
+
"materialize-secondary-cnaes.sql",
|
|
9029
|
+
["load-establishments"],
|
|
9030
|
+
secondaryIncluded
|
|
9031
|
+
)
|
|
9032
|
+
);
|
|
9033
|
+
if (secondaryIncluded) {
|
|
9034
|
+
scripts["materialize-secondary-cnaes.sql"] = buildStepScript(
|
|
9035
|
+
"CNPJ DB Loader PostgreSQL direct import secondary CNAEs step",
|
|
9036
|
+
[materializeSecondaryCnaesSql()],
|
|
9037
|
+
input2,
|
|
9038
|
+
true
|
|
9039
|
+
);
|
|
9040
|
+
}
|
|
9041
|
+
const indexesIncluded = selected.has("indexes");
|
|
9042
|
+
steps.push(
|
|
9043
|
+
step(
|
|
9044
|
+
"indexes",
|
|
9045
|
+
"indexes.sql",
|
|
9046
|
+
materializeIncluded ? ["materialize"] : ["setup"],
|
|
9047
|
+
indexesIncluded
|
|
9048
|
+
)
|
|
9049
|
+
);
|
|
9050
|
+
if (indexesIncluded) {
|
|
9051
|
+
scripts["indexes.sql"] = buildStepScript(
|
|
9052
|
+
"CNPJ DB Loader PostgreSQL direct import indexes step",
|
|
9053
|
+
indexesSql(),
|
|
9054
|
+
input2,
|
|
9055
|
+
true
|
|
9056
|
+
);
|
|
9057
|
+
}
|
|
9058
|
+
const analyzeIncluded = selected.has("analyze");
|
|
9059
|
+
const analyzeDependencies = [
|
|
9060
|
+
...domainsIncluded ? ["load-domains"] : [],
|
|
9061
|
+
...materializeIncluded ? ["materialize"] : [],
|
|
9062
|
+
...secondaryIncluded ? ["materialize-secondary-cnaes"] : []
|
|
9063
|
+
];
|
|
9064
|
+
steps.push(
|
|
9065
|
+
step(
|
|
9066
|
+
"analyze",
|
|
9067
|
+
"analyze.sql",
|
|
9068
|
+
analyzeDependencies.length > 0 ? analyzeDependencies : ["setup"],
|
|
9069
|
+
analyzeIncluded
|
|
9070
|
+
)
|
|
9071
|
+
);
|
|
9072
|
+
if (analyzeIncluded) {
|
|
9073
|
+
scripts["analyze.sql"] = buildStepScript(
|
|
9074
|
+
"CNPJ DB Loader PostgreSQL direct import analyze step",
|
|
9075
|
+
analyzeSql(selected),
|
|
9076
|
+
input2,
|
|
9077
|
+
true
|
|
9078
|
+
);
|
|
9079
|
+
}
|
|
9080
|
+
const orchestratorLines = [
|
|
9081
|
+
"-- CNPJ DB Loader direct PostgreSQL import orchestrator",
|
|
8559
9082
|
"-- Generated from sanitized Receita files by cnpj-db-loader postgres generate-script.",
|
|
8560
|
-
"-- This path avoids rewriting the dataset into a second CSV tree.",
|
|
8561
9083
|
"-- Execute with psql, for example:",
|
|
8562
|
-
'-- psql "postgres://postgres:postgres@localhost:5432/cnpj" -f import-postgres-direct.sql',
|
|
9084
|
+
'-- psql -d "postgres://postgres:postgres@localhost:5432/cnpj" -f import-postgres-direct.sql',
|
|
8563
9085
|
"",
|
|
8564
9086
|
"\\set ON_ERROR_STOP on",
|
|
8565
|
-
|
|
9087
|
+
echo(
|
|
9088
|
+
`Using source file encoding ${input2.sourceEncoding} for psql copy operations...`
|
|
9089
|
+
),
|
|
8566
9090
|
`set client_encoding to ${quoteSqlLiteral(input2.sourceEncoding)};`,
|
|
8567
|
-
|
|
9091
|
+
echo(
|
|
9092
|
+
`Starting CNPJ DB Loader direct PostgreSQL import using transaction mode ${input2.transactionMode}...`
|
|
9093
|
+
),
|
|
9094
|
+
"",
|
|
9095
|
+
...input2.transactionMode === "single" ? ["begin;", ""] : []
|
|
9096
|
+
];
|
|
9097
|
+
for (const name of STEP_ORDER) {
|
|
9098
|
+
const currentStep = steps.find((item) => item.name === name);
|
|
9099
|
+
if (!currentStep?.included) {
|
|
9100
|
+
continue;
|
|
9101
|
+
}
|
|
9102
|
+
orchestratorLines.push(
|
|
9103
|
+
echo(
|
|
9104
|
+
`[orchestrator] Running ${currentStep.name} (${currentStep.file})...`
|
|
9105
|
+
),
|
|
9106
|
+
`\\ir ${currentStep.file}`,
|
|
9107
|
+
echo(`[orchestrator] Completed ${currentStep.name}.`),
|
|
9108
|
+
""
|
|
9109
|
+
);
|
|
9110
|
+
}
|
|
9111
|
+
orchestratorLines.push(
|
|
9112
|
+
...input2.transactionMode === "single" ? ["commit;", ""] : [],
|
|
9113
|
+
echo("CNPJ DB Loader hybrid PostgreSQL import completed."),
|
|
9114
|
+
""
|
|
9115
|
+
);
|
|
9116
|
+
scripts["import-postgres-direct.sql"] = orchestratorLines.join("\n");
|
|
9117
|
+
return { scripts, steps };
|
|
9118
|
+
}
|
|
9119
|
+
function generatePostgresDirectImportScript(input2) {
|
|
9120
|
+
const grouped = csvFilesByDataset(input2.files);
|
|
9121
|
+
const lines = [
|
|
9122
|
+
"-- CNPJ DB Loader hybrid PostgreSQL import script",
|
|
9123
|
+
"-- Generated from PostgreSQL-ready CSV files exported by cnpj-db-loader postgres export-csv.",
|
|
9124
|
+
"-- Execute with psql, for example:",
|
|
9125
|
+
'-- psql -d "postgres://postgres:postgres@localhost:5432/cnpj" -f import-postgres-direct.sql',
|
|
9126
|
+
"",
|
|
9127
|
+
"\\set ON_ERROR_STOP on",
|
|
9128
|
+
echo("Starting CNPJ DB Loader hybrid PostgreSQL import..."),
|
|
8568
9129
|
"",
|
|
8569
9130
|
"begin;",
|
|
8570
9131
|
"",
|
|
8571
9132
|
"-- Keep the final schema and seed data managed by sql/schema.sql.",
|
|
8572
|
-
"-- This script
|
|
8573
|
-
"-- transforms values inside PostgreSQL, resets staging tables and upserts final data.",
|
|
9133
|
+
"-- This script only resets staging tables and then upserts final data.",
|
|
8574
9134
|
"truncate table staging_companies restart identity;",
|
|
8575
9135
|
"truncate table staging_establishments restart identity;",
|
|
8576
9136
|
"truncate table staging_partners restart identity;",
|
|
@@ -8578,10 +9138,10 @@ function generatePostgresSanitizedDirectImportScript(input2) {
|
|
|
8578
9138
|
""
|
|
8579
9139
|
];
|
|
8580
9140
|
for (const dataset of DOMAIN_DATASETS) {
|
|
8581
|
-
lines.push(...
|
|
9141
|
+
lines.push(...copyDomainSql(dataset, grouped[dataset] ?? []), "");
|
|
8582
9142
|
}
|
|
8583
9143
|
for (const dataset of STAGING_DATASETS) {
|
|
8584
|
-
lines.push(...
|
|
9144
|
+
lines.push(...copyStagingSql(dataset, grouped[dataset] ?? []), "");
|
|
8585
9145
|
}
|
|
8586
9146
|
lines.push(...materializationAndAnalyzeSql());
|
|
8587
9147
|
return lines.join("\n");
|
|
@@ -8592,11 +9152,13 @@ function materializationAndAnalyzeSql() {
|
|
|
8592
9152
|
"",
|
|
8593
9153
|
materializeEstablishmentsSql(),
|
|
8594
9154
|
"",
|
|
9155
|
+
materializeSecondaryCnaesSql(),
|
|
9156
|
+
"",
|
|
8595
9157
|
materializePartnersSql(),
|
|
8596
9158
|
"",
|
|
8597
9159
|
materializeSimplesSql(),
|
|
8598
9160
|
"",
|
|
8599
|
-
"
|
|
9161
|
+
echo("Refreshing planner statistics..."),
|
|
8600
9162
|
"analyze companies;",
|
|
8601
9163
|
"analyze establishments;",
|
|
8602
9164
|
"analyze establishment_secondary_cnaes;",
|
|
@@ -8611,7 +9173,7 @@ function materializationAndAnalyzeSql() {
|
|
|
8611
9173
|
"",
|
|
8612
9174
|
"commit;",
|
|
8613
9175
|
"",
|
|
8614
|
-
"
|
|
9176
|
+
echo("CNPJ DB Loader hybrid PostgreSQL import completed."),
|
|
8615
9177
|
""
|
|
8616
9178
|
];
|
|
8617
9179
|
}
|
|
@@ -8817,7 +9379,30 @@ async function exportPostgresCsvDataset(inputPath, options = {}) {
|
|
|
8817
9379
|
// src/services/postgres-direct/generator.ts
|
|
8818
9380
|
import { mkdir as mkdir9, stat as stat7, writeFile as writeFile6 } from "fs/promises";
|
|
8819
9381
|
import path17 from "path";
|
|
8820
|
-
var DEFAULT_SOURCE_ENCODING = "
|
|
9382
|
+
var DEFAULT_SOURCE_ENCODING = "UTF8";
|
|
9383
|
+
var DEFAULT_TRANSACTION_MODE = "single";
|
|
9384
|
+
var ALL_INCLUDE_TARGETS = [
|
|
9385
|
+
"domains",
|
|
9386
|
+
"companies",
|
|
9387
|
+
"establishments",
|
|
9388
|
+
"partners",
|
|
9389
|
+
"simples",
|
|
9390
|
+
"secondary-cnaes",
|
|
9391
|
+
"indexes",
|
|
9392
|
+
"analyze"
|
|
9393
|
+
];
|
|
9394
|
+
var INCLUDE_TARGETS_BY_DATASET = {
|
|
9395
|
+
companies: "companies",
|
|
9396
|
+
establishments: "establishments",
|
|
9397
|
+
partners: "partners",
|
|
9398
|
+
simples_options: "simples",
|
|
9399
|
+
countries: "domains",
|
|
9400
|
+
cities: "domains",
|
|
9401
|
+
partner_qualifications: "domains",
|
|
9402
|
+
legal_natures: "domains",
|
|
9403
|
+
reasons: "domains",
|
|
9404
|
+
cnaes: "domains"
|
|
9405
|
+
};
|
|
8821
9406
|
function defaultPostgresDirectOutputPath(inputPath) {
|
|
8822
9407
|
const baseName = path17.basename(inputPath);
|
|
8823
9408
|
if (baseName.toLowerCase() === "sanitized") {
|
|
@@ -8826,17 +9411,52 @@ function defaultPostgresDirectOutputPath(inputPath) {
|
|
|
8826
9411
|
return path17.join(path17.dirname(inputPath), `${baseName}-postgres-direct`);
|
|
8827
9412
|
}
|
|
8828
9413
|
function inferNextStep5(scriptPath) {
|
|
8829
|
-
return `psql "postgres://postgres:postgres@localhost:5432/cnpj" -f ${scriptPath.replace(/\\/g, "/")}`;
|
|
9414
|
+
return `psql -d "postgres://postgres:postgres@localhost:5432/cnpj" -f ${scriptPath.replace(/\\/g, "/")}`;
|
|
8830
9415
|
}
|
|
8831
9416
|
function normalizeSourceEncoding(value) {
|
|
8832
9417
|
const encoding = (value ?? DEFAULT_SOURCE_ENCODING).trim();
|
|
8833
9418
|
if (!/^[A-Za-z0-9_-]+$/.test(encoding)) {
|
|
8834
9419
|
throw new ValidationError(
|
|
8835
|
-
`Invalid source encoding: ${value}. Use a PostgreSQL client encoding name such as WIN1252 or
|
|
9420
|
+
`Invalid source encoding: ${value}. Use a PostgreSQL client encoding name such as UTF8, WIN1252 or LATIN1.`
|
|
8836
9421
|
);
|
|
8837
9422
|
}
|
|
8838
9423
|
return encoding.toUpperCase();
|
|
8839
9424
|
}
|
|
9425
|
+
function normalizeTransactionMode(value) {
|
|
9426
|
+
const mode = value ?? DEFAULT_TRANSACTION_MODE;
|
|
9427
|
+
if (!["single", "phase", "none"].includes(mode)) {
|
|
9428
|
+
throw new ValidationError(
|
|
9429
|
+
`Invalid transaction mode: ${String(value)}. Use single, phase or none.`
|
|
9430
|
+
);
|
|
9431
|
+
}
|
|
9432
|
+
return mode;
|
|
9433
|
+
}
|
|
9434
|
+
function isIncludeTarget(value) {
|
|
9435
|
+
return ALL_INCLUDE_TARGETS.includes(value);
|
|
9436
|
+
}
|
|
9437
|
+
function normalizeIncludeTargets(include, dataset) {
|
|
9438
|
+
if (include && include.length > 0) {
|
|
9439
|
+
const unique = [...new Set(include)];
|
|
9440
|
+
const invalid = unique.filter((item) => !isIncludeTarget(item));
|
|
9441
|
+
if (invalid.length > 0) {
|
|
9442
|
+
throw new ValidationError(
|
|
9443
|
+
`Invalid include target(s): ${invalid.join(", ")}. Use ${ALL_INCLUDE_TARGETS.join(", ")}.`
|
|
9444
|
+
);
|
|
9445
|
+
}
|
|
9446
|
+
return unique;
|
|
9447
|
+
}
|
|
9448
|
+
if (dataset) {
|
|
9449
|
+
const target = INCLUDE_TARGETS_BY_DATASET[dataset];
|
|
9450
|
+
if (!target) {
|
|
9451
|
+
return [];
|
|
9452
|
+
}
|
|
9453
|
+
if (target === "establishments") {
|
|
9454
|
+
return ["establishments", "secondary-cnaes", "analyze"];
|
|
9455
|
+
}
|
|
9456
|
+
return [target, "analyze"];
|
|
9457
|
+
}
|
|
9458
|
+
return [...ALL_INCLUDE_TARGETS];
|
|
9459
|
+
}
|
|
8840
9460
|
async function generatePostgresDirectScript(inputPath, options = {}) {
|
|
8841
9461
|
if (options.dataset && !isImportDatasetType(options.dataset)) {
|
|
8842
9462
|
throw new ValidationError(`Unsupported dataset type: ${options.dataset}.`);
|
|
@@ -8852,6 +9472,10 @@ async function generatePostgresDirectScript(inputPath, options = {}) {
|
|
|
8852
9472
|
options.outputPath ?? defaultPostgresDirectOutputPath(validatedPath)
|
|
8853
9473
|
);
|
|
8854
9474
|
const sourceEncoding = normalizeSourceEncoding(options.sourceEncoding);
|
|
9475
|
+
const transactionMode = normalizeTransactionMode(options.transactionMode);
|
|
9476
|
+
const include = normalizeIncludeTargets(options.include, options.dataset);
|
|
9477
|
+
const skipIndexes = options.skipIndexes ?? false;
|
|
9478
|
+
const skipAnalyze = options.skipAnalyze ?? false;
|
|
8855
9479
|
const inspected = await inspectFiles(validatedPath);
|
|
8856
9480
|
const recognizedFiles = inspected.entries.filter((entry) => entry.entryKind === "file").flatMap((entry) => {
|
|
8857
9481
|
if (!isImportDatasetType(entry.inferredType)) {
|
|
@@ -8879,7 +9503,11 @@ async function generatePostgresDirectScript(inputPath, options = {}) {
|
|
|
8879
9503
|
outputPath,
|
|
8880
9504
|
totalFiles: recognizedFiles.length,
|
|
8881
9505
|
datasets,
|
|
8882
|
-
sourceEncoding
|
|
9506
|
+
sourceEncoding,
|
|
9507
|
+
transactionMode,
|
|
9508
|
+
include,
|
|
9509
|
+
skipIndexes,
|
|
9510
|
+
skipAnalyze
|
|
8883
9511
|
});
|
|
8884
9512
|
await mkdir9(outputPath, { recursive: true });
|
|
8885
9513
|
const sourceFiles = [];
|
|
@@ -8915,11 +9543,21 @@ async function generatePostgresDirectScript(inputPath, options = {}) {
|
|
|
8915
9543
|
}
|
|
8916
9544
|
const scriptName = options.scriptName ?? "import-postgres-direct.sql";
|
|
8917
9545
|
const scriptPath = path17.join(outputPath, scriptName);
|
|
8918
|
-
const
|
|
9546
|
+
const generated = generatePostgresDirectScriptFiles({
|
|
8919
9547
|
files: sourceFiles,
|
|
8920
|
-
sourceEncoding
|
|
9548
|
+
sourceEncoding,
|
|
9549
|
+
transactionMode,
|
|
9550
|
+
include,
|
|
9551
|
+
skipIndexes,
|
|
9552
|
+
skipAnalyze
|
|
8921
9553
|
});
|
|
8922
|
-
|
|
9554
|
+
const scriptFiles = [];
|
|
9555
|
+
for (const [fileName, script] of Object.entries(generated.scripts)) {
|
|
9556
|
+
const outputFileName = fileName === "import-postgres-direct.sql" ? scriptName : fileName;
|
|
9557
|
+
const outputFilePath = path17.join(outputPath, outputFileName);
|
|
9558
|
+
await writeFile6(outputFilePath, script, "utf8");
|
|
9559
|
+
scriptFiles.push(outputFilePath);
|
|
9560
|
+
}
|
|
8923
9561
|
const manifestPath = path17.join(outputPath, "manifest.json");
|
|
8924
9562
|
const summaryDatasets = [...summariesByDataset.values()].sort(
|
|
8925
9563
|
(left, right) => IMPORT_ORDER.indexOf(left.dataset) - IMPORT_ORDER.indexOf(right.dataset)
|
|
@@ -8931,13 +9569,19 @@ async function generatePostgresDirectScript(inputPath, options = {}) {
|
|
|
8931
9569
|
const manifest = {
|
|
8932
9570
|
generatedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
8933
9571
|
mode: "direct-sanitized-script",
|
|
9572
|
+
transactionMode,
|
|
9573
|
+
include,
|
|
9574
|
+
skipIndexes,
|
|
9575
|
+
skipAnalyze,
|
|
8934
9576
|
inputPath: path17.resolve(inputPath),
|
|
8935
9577
|
validatedPath,
|
|
8936
9578
|
outputPath,
|
|
8937
9579
|
scriptPath,
|
|
9580
|
+
scriptFiles,
|
|
8938
9581
|
sourceEncoding,
|
|
8939
9582
|
totalFiles: sourceFiles.length,
|
|
8940
9583
|
totalBytes,
|
|
9584
|
+
steps: generated.steps,
|
|
8941
9585
|
datasets: summaryDatasets
|
|
8942
9586
|
};
|
|
8943
9587
|
await writeFile6(
|
|
@@ -8960,14 +9604,19 @@ async function generatePostgresDirectScript(inputPath, options = {}) {
|
|
|
8960
9604
|
scriptPath,
|
|
8961
9605
|
manifestPath,
|
|
8962
9606
|
sourceEncoding,
|
|
9607
|
+
transactionMode,
|
|
8963
9608
|
totalFiles: sourceFiles.length,
|
|
8964
9609
|
totalBytes,
|
|
8965
9610
|
datasets: summaryDatasets,
|
|
9611
|
+
scriptFiles,
|
|
9612
|
+
steps: generated.steps,
|
|
8966
9613
|
warnings: [
|
|
8967
9614
|
...validation.ok ? [] : validation.errors,
|
|
8968
9615
|
"This script imports sanitized Receita files directly with psql \\copy. It avoids rewriting the full dataset into a second CSV tree.",
|
|
8969
|
-
"The generated
|
|
8970
|
-
"
|
|
9616
|
+
"The generated scripts expect the database schema generated by cnpj-db-loader to be applied before execution.",
|
|
9617
|
+
"The direct PostgreSQL script now defaults to UTF8 because the sanitize command writes clean UTF-8 files.",
|
|
9618
|
+
"Use --source-encoding WIN1252 or LATIN1 only when generating scripts for legacy sanitized files produced by older loader versions.",
|
|
9619
|
+
"The generated import is now modular. Use import-postgres-direct.sql as the orchestrator or run individual phase scripts manually."
|
|
8971
9620
|
],
|
|
8972
9621
|
nextStep: inferNextStep5(scriptPath)
|
|
8973
9622
|
};
|
|
@@ -9271,9 +9920,23 @@ function printSanitizeSummary(summary, logFilePath) {
|
|
|
9271
9920
|
console.log(
|
|
9272
9921
|
formatKeyValue("Processed bytes", formatBytes(summary.totalBytes))
|
|
9273
9922
|
);
|
|
9923
|
+
console.log(formatKeyValue("Source encoding", summary.sourceEncoding));
|
|
9924
|
+
console.log(formatKeyValue("Output encoding", "UTF8"));
|
|
9274
9925
|
console.log(
|
|
9275
9926
|
formatKeyValue("Removed NUL bytes", formatCount(summary.nulBytesRemoved))
|
|
9276
9927
|
);
|
|
9928
|
+
console.log(
|
|
9929
|
+
formatKeyValue(
|
|
9930
|
+
"Removed invalid bytes",
|
|
9931
|
+
formatCount(summary.invalidBytesRemoved)
|
|
9932
|
+
)
|
|
9933
|
+
);
|
|
9934
|
+
console.log(
|
|
9935
|
+
formatKeyValue(
|
|
9936
|
+
"Removed control chars",
|
|
9937
|
+
formatCount(summary.controlCharsRemoved)
|
|
9938
|
+
)
|
|
9939
|
+
);
|
|
9277
9940
|
console.log(formatKeyValue("Changed files", summary.changedFiles));
|
|
9278
9941
|
console.log(formatKeyValue("Unchanged files", summary.unchangedFiles));
|
|
9279
9942
|
if (summary.datasets.length > 0) {
|
|
@@ -9601,6 +10264,16 @@ function printPostgresDirectScriptSummary(summary, logFilePath) {
|
|
|
9601
10264
|
console.log(formatKeyValue("Generated script", summary.scriptPath));
|
|
9602
10265
|
console.log(formatKeyValue("Manifest", summary.manifestPath));
|
|
9603
10266
|
console.log(formatKeyValue("Source encoding", summary.sourceEncoding));
|
|
10267
|
+
console.log(formatKeyValue("Transaction mode", summary.transactionMode));
|
|
10268
|
+
console.log(
|
|
10269
|
+
formatKeyValue("Generated SQL files", summary.scriptFiles.length)
|
|
10270
|
+
);
|
|
10271
|
+
console.log(
|
|
10272
|
+
formatKeyValue(
|
|
10273
|
+
"Included steps",
|
|
10274
|
+
summary.steps.filter((step2) => step2.included).map((step2) => step2.name).join(", ")
|
|
10275
|
+
)
|
|
10276
|
+
);
|
|
9604
10277
|
console.log(formatKeyValue("Source files", summary.totalFiles));
|
|
9605
10278
|
console.log(formatKeyValue("Source bytes", formatBytes(summary.totalBytes)));
|
|
9606
10279
|
if (summary.datasets.length > 0) {
|
|
@@ -10053,8 +10726,9 @@ function createSanitizeProgressReporter() {
|
|
|
10053
10726
|
`Validated: ${shortPath(event.validatedPath)}`,
|
|
10054
10727
|
`Output: ${shortPath(event.outputPath)}`,
|
|
10055
10728
|
`Datasets: ${event.datasets.join(" > ")}`,
|
|
10729
|
+
`Source encoding: ${event.sourceEncoding} > UTF8`,
|
|
10056
10730
|
`Files: 0/${formatCount(event.totalFiles)} | Bytes: ${formatBytes(0)} / ${formatBytes(event.totalBytes)}`,
|
|
10057
|
-
`Rows
|
|
10731
|
+
`Rows: ${formatCount(0)} | NUL: ${formatCount(0)} | Invalid bytes: ${formatCount(0)} | Controls: ${formatCount(0)}`,
|
|
10058
10732
|
`Current: waiting...`
|
|
10059
10733
|
];
|
|
10060
10734
|
renderBlock([
|
|
@@ -10070,8 +10744,9 @@ function createSanitizeProgressReporter() {
|
|
|
10070
10744
|
currentLines[1] ?? "",
|
|
10071
10745
|
currentLines[2] ?? "",
|
|
10072
10746
|
currentLines[3] ?? "",
|
|
10747
|
+
currentLines[4] ?? "",
|
|
10073
10748
|
`Files: ${formatCount(event.fileIndex)}/${formatCount(event.totalFiles)} | Bytes: ${formatBytes(event.bytesProcessed)} / ${formatBytes(event.totalBytes)}`,
|
|
10074
|
-
`Rows
|
|
10749
|
+
`Rows: ${formatCount(event.processedRows)} | NUL: ${formatCount(event.nulBytesRemoved)} | Invalid bytes: ${formatCount(event.invalidBytesRemoved)} | Controls: ${formatCount(event.controlCharsRemoved)} | Changed: ${formatCount(event.changedFiles)}`,
|
|
10075
10750
|
`Current: ${shortPath(event.currentFileDisplayPath)}`
|
|
10076
10751
|
];
|
|
10077
10752
|
renderBlock([
|
|
@@ -10091,6 +10766,18 @@ function createSanitizeProgressReporter() {
|
|
|
10091
10766
|
console.log(
|
|
10092
10767
|
formatKeyValue("Removed NUL bytes", formatCount(event.nulBytesRemoved))
|
|
10093
10768
|
);
|
|
10769
|
+
console.log(
|
|
10770
|
+
formatKeyValue(
|
|
10771
|
+
"Removed invalid bytes",
|
|
10772
|
+
formatCount(event.invalidBytesRemoved)
|
|
10773
|
+
)
|
|
10774
|
+
);
|
|
10775
|
+
console.log(
|
|
10776
|
+
formatKeyValue(
|
|
10777
|
+
"Removed control chars",
|
|
10778
|
+
formatCount(event.controlCharsRemoved)
|
|
10779
|
+
)
|
|
10780
|
+
);
|
|
10094
10781
|
console.log(
|
|
10095
10782
|
formatKeyValue("Changed files", formatCount(event.changedFiles))
|
|
10096
10783
|
);
|
|
@@ -10277,6 +10964,14 @@ function createPostgresDirectScriptProgressReporter() {
|
|
|
10277
10964
|
console.log(formatKeyValue("Validated path", event.validatedPath));
|
|
10278
10965
|
console.log(formatKeyValue("Output path", event.outputPath));
|
|
10279
10966
|
console.log(formatKeyValue("Source encoding", event.sourceEncoding));
|
|
10967
|
+
console.log(formatKeyValue("Transaction mode", event.transactionMode));
|
|
10968
|
+
console.log(formatKeyValue("Included steps", event.include.join(", ")));
|
|
10969
|
+
console.log(
|
|
10970
|
+
formatKeyValue("Skip indexes", event.skipIndexes ? "yes" : "no")
|
|
10971
|
+
);
|
|
10972
|
+
console.log(
|
|
10973
|
+
formatKeyValue("Skip analyze", event.skipAnalyze ? "yes" : "no")
|
|
10974
|
+
);
|
|
10280
10975
|
console.log(formatKeyValue("Files queued", event.totalFiles));
|
|
10281
10976
|
return;
|
|
10282
10977
|
}
|
|
@@ -11203,8 +11898,14 @@ function registerPostgresCommands(program) {
|
|
|
11203
11898
|
"Generated psql script file name. Defaults to import-postgres-direct.sql."
|
|
11204
11899
|
).option(
|
|
11205
11900
|
"--source-encoding <encoding>",
|
|
11206
|
-
"PostgreSQL client encoding used while reading sanitized Receita files. Defaults to
|
|
11207
|
-
).option(
|
|
11901
|
+
"PostgreSQL client encoding used while reading sanitized Receita files. Defaults to UTF8."
|
|
11902
|
+
).option(
|
|
11903
|
+
"--transaction-mode <mode>",
|
|
11904
|
+
"Transaction mode for generated scripts: single, phase or none. Defaults to single."
|
|
11905
|
+
).option(
|
|
11906
|
+
"--include <items>",
|
|
11907
|
+
"Comma-separated steps to include: domains,companies,establishments,partners,simples,secondary-cnaes,indexes,analyze."
|
|
11908
|
+
).option("--skip-indexes", "Do not generate the indexes step.").option("--skip-analyze", "Do not generate the analyze step.").option("-f, --force", "Skip the confirmation prompt.").description(
|
|
11208
11909
|
"Generate a direct psql import script that loads sanitized Receita files without rewriting them into new CSV files."
|
|
11209
11910
|
).action(
|
|
11210
11911
|
async (input2, options) => {
|
|
@@ -11233,6 +11934,18 @@ function registerPostgresCommands(program) {
|
|
|
11233
11934
|
if (options.sourceEncoding) {
|
|
11234
11935
|
generateOptions.sourceEncoding = options.sourceEncoding;
|
|
11235
11936
|
}
|
|
11937
|
+
if (options.transactionMode) {
|
|
11938
|
+
generateOptions.transactionMode = options.transactionMode;
|
|
11939
|
+
}
|
|
11940
|
+
if (options.include) {
|
|
11941
|
+
generateOptions.include = options.include.split(",").map((item) => item.trim()).filter(Boolean);
|
|
11942
|
+
}
|
|
11943
|
+
if (options.skipIndexes) {
|
|
11944
|
+
generateOptions.skipIndexes = true;
|
|
11945
|
+
}
|
|
11946
|
+
if (options.skipAnalyze) {
|
|
11947
|
+
generateOptions.skipAnalyze = true;
|
|
11948
|
+
}
|
|
11236
11949
|
const summary = await generatePostgresDirectScript(
|
|
11237
11950
|
input2,
|
|
11238
11951
|
generateOptions
|
|
@@ -11353,6 +12066,9 @@ function registerSanitizeCommands(program) {
|
|
|
11353
12066
|
).option(
|
|
11354
12067
|
"--dataset <dataset>",
|
|
11355
12068
|
"Sanitize only one validated dataset block (for example: establishments or companies)."
|
|
12069
|
+
).option(
|
|
12070
|
+
"--source-encoding <encoding>",
|
|
12071
|
+
"Source file encoding used while reading Receita files. Defaults to WIN1252 and writes clean UTF-8 output."
|
|
11356
12072
|
).option("-f, --force", "Skip the confirmation prompt.").description(
|
|
11357
12073
|
"Prepare a sanitized dataset tree before import by removing known low-level byte issues such as NUL bytes."
|
|
11358
12074
|
).action(
|
|
@@ -11376,6 +12092,9 @@ function registerSanitizeCommands(program) {
|
|
|
11376
12092
|
if (options.dataset) {
|
|
11377
12093
|
sanitizeOptions.dataset = options.dataset;
|
|
11378
12094
|
}
|
|
12095
|
+
if (options.sourceEncoding) {
|
|
12096
|
+
sanitizeOptions.sourceEncoding = options.sourceEncoding;
|
|
12097
|
+
}
|
|
11379
12098
|
const summary = await sanitizeInputDirectory(input2, sanitizeOptions);
|
|
11380
12099
|
const logFilePath = await writeCommandLog("sanitize", summary);
|
|
11381
12100
|
printSanitizeSummary(summary, logFilePath);
|