@danielarndt0/cnpj-db-loader 2.3.1 → 2.4.0-beta.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +22 -2
- package/dist/cli.js +1544 -157
- package/dist/cli.js.map +1 -1
- package/dist/index.d.ts +134 -1
- package/dist/index.js +1174 -58
- package/dist/index.js.map +1 -1
- package/docs/architecture.md +9 -1
- package/docs/cli.md +1 -1
- package/docs/commands.md +23 -0
- package/docs/postgres-direct.md +138 -0
- package/docs/sanitize.md +52 -16
- package/docs/usage.md +14 -0
- package/package.json +3 -3
package/dist/cli.js
CHANGED
|
@@ -3237,6 +3237,15 @@ function createFieldValueParser(dataType) {
|
|
|
3237
3237
|
};
|
|
3238
3238
|
}
|
|
3239
3239
|
}
|
|
3240
|
+
/**
 * Converts one raw field value into its database representation.
 *
 * Obtains the parser for `dataType` from `createFieldValueParser` and applies
 * it to `rawValue`.
 *
 * @param {*} dataType - Field data type, forwarded to `createFieldValueParser`.
 * @param {*} rawValue - Raw value taken from the source row.
 * @returns {*} Whatever the parser returns for `rawValue`.
 */
function toDatabaseValue(dataType, rawValue) {
  const parseFieldValue = createFieldValueParser(dataType);
  return parseFieldValue(rawValue);
}
|
|
3243
|
+
/**
 * Returns the trimmed code, or `fallback` when no usable code is present.
 *
 * @param {*} value - Candidate code; only non-blank strings are accepted.
 * @param {*} fallback - Value returned for non-strings and blank strings.
 * @returns {*} The trimmed string, or `fallback`.
 */
function normalizeCode(value, fallback) {
  const trimmed = typeof value === "string" ? value.trim() : "";
  return trimmed === "" ? fallback : trimmed;
}
|
|
3240
3249
|
function createPartnerDedupeKeyBuilder(indices) {
|
|
3241
3250
|
const orderedIndices = [
|
|
3242
3251
|
indices.cnpjRoot,
|
|
@@ -3264,6 +3273,60 @@ function createEstablishmentCnpjFullBuilder(indices) {
|
|
|
3264
3273
|
return `${root}${order}${digits}`;
|
|
3265
3274
|
};
|
|
3266
3275
|
}
|
|
3276
|
+
/**
 * Builds the pipe-delimited dedupe key for a partner record.
 *
 * Each listed column is read from `recordByColumn`; null/undefined become an
 * empty string and every other value is stringified and trimmed. The column
 * order is part of the key and must not change.
 *
 * @param {Object} recordByColumn - Partner values keyed by column name.
 * @returns {string} The eleven key parts joined with "|".
 */
function buildPartnerDedupeKey(recordByColumn) {
  const keyColumns = [
    "cnpj_root",
    "partner_type_code",
    "partner_name",
    "partner_document",
    "partner_qualification_code",
    "entry_date",
    "country_code",
    "legal_representative_document",
    "legal_representative_name",
    "legal_representative_qualification_code",
    "age_group_code"
  ];
  const parts = [];
  for (const columnName of keyColumns) {
    const value = recordByColumn[columnName];
    if (value === null || value === undefined) {
      parts.push("");
    } else {
      parts.push(String(value).trim());
    }
  }
  return parts.join("|");
}
|
|
3291
|
+
/**
 * Converts one raw row into the ordered list of values to write.
 *
 * Steps:
 *  1. Parse every layout field with `toDatabaseValue` (missing raw fields are
 *     treated as "").
 *  2. Apply fallback codes for selected columns of the "companies" and
 *     "establishments" datasets via `normalizeCode`.
 *  3. When writing to the "final" target, append a derived column if the
 *     matching schema capability flag is set: the concatenated CNPJ for
 *     establishments, or the dedupe key for partners.
 *
 * @param {string} dataset - Dataset name.
 * @param {Object} layout - Layout whose `fields` carry `columnName` and `dataType`.
 * @param {Array} rawFields - Raw field values, positionally aligned with `layout.fields`.
 * @param {Object} schemaCapabilities - Flags read only for the "final" target.
 * @param {string} writeTarget - Derived columns are appended only for "final".
 * @returns {Array} Values in layout order, optionally followed by one derived value.
 */
function transformRecord(dataset, layout, rawFields, schemaCapabilities, writeTarget) {
  const parsedValues = layout.fields.map(
    (field, position) => toDatabaseValue(field.dataType, rawFields[position] ?? "")
  );
  const recordByColumn = Object.fromEntries(
    layout.fields.map((field, position) => [field.columnName, parsedValues[position]])
  );

  switch (dataset) {
    case "companies":
      recordByColumn.company_size_code = normalizeCode(recordByColumn.company_size_code, "00");
      break;
    case "establishments":
      recordByColumn.branch_type_code = normalizeCode(recordByColumn.branch_type_code, "1");
      recordByColumn.registration_status_code = normalizeCode(
        recordByColumn.registration_status_code,
        "01"
      );
      break;
    default:
      break;
  }

  // Re-read from the record so the normalized codes replace the parsed ones.
  const normalizedValues = layout.fields.map((field) => recordByColumn[field.columnName]);
  const isFinalTarget = writeTarget === "final";

  if (isFinalTarget && dataset === "establishments" && schemaCapabilities.includeEstablishmentCnpjFullInInsert) {
    const cnpjFull = [
      recordByColumn.cnpj_root,
      recordByColumn.cnpj_order,
      recordByColumn.cnpj_check_digits
    ].map((part) => part ?? "").join("");
    return [...normalizedValues, cnpjFull];
  }
  if (isFinalTarget && dataset === "partners" && schemaCapabilities.includePartnerDedupeKeyInInsert) {
    return [...normalizedValues, buildPartnerDedupeKey(recordByColumn)];
  }
  return normalizedValues;
}
|
|
3267
3330
|
function buildParsedPayload(columns, values) {
|
|
3268
3331
|
return Object.fromEntries(
|
|
3269
3332
|
columns.map((column, index) => [column, values[index] ?? null])
|
|
@@ -3403,7 +3466,7 @@ function createImportRowNormalizer(input2) {
|
|
|
3403
3466
|
"cnpj_check_digits"
|
|
3404
3467
|
)
|
|
3405
3468
|
}) : null;
|
|
3406
|
-
const
|
|
3469
|
+
const buildPartnerDedupeKey2 = appendPartnerDedupeKey ? createPartnerDedupeKeyBuilder({
|
|
3407
3470
|
cnpjRoot: resolveLayoutColumnIndex(input2.layout, "cnpj_root"),
|
|
3408
3471
|
partnerTypeCode: resolveLayoutColumnIndex(
|
|
3409
3472
|
input2.layout,
|
|
@@ -3463,8 +3526,8 @@ function createImportRowNormalizer(input2) {
|
|
|
3463
3526
|
if (buildEstablishmentCnpjFull) {
|
|
3464
3527
|
values.push(buildEstablishmentCnpjFull(values));
|
|
3465
3528
|
}
|
|
3466
|
-
if (
|
|
3467
|
-
values.push(
|
|
3529
|
+
if (buildPartnerDedupeKey2) {
|
|
3530
|
+
values.push(buildPartnerDedupeKey2(values));
|
|
3468
3531
|
}
|
|
3469
3532
|
return {
|
|
3470
3533
|
values,
|
|
@@ -7758,81 +7821,264 @@ function isRecognizedSanitizeEntry(entry) {
|
|
|
7758
7821
|
return entry.entryKind === "file" && entry.inferredType !== "zip-archive" && entry.inferredType !== "unknown";
|
|
7759
7822
|
}
|
|
7760
7823
|
|
|
7824
|
+
// src/services/sanitize/encoding.ts
|
|
7825
|
+
import { StringDecoder } from "string_decoder";
|
|
7826
|
+
// Windows-1252 assignments for the byte range 0x80-0x9F, keyed by byte value.
// Bytes 0x81, 0x8D, 0x8F, 0x90 and 0x9D have no entry here; lookups for them
// yield undefined.
var WINDOWS_1252_C1_MAP = {
  0x80: "\u20AC", // euro sign
  0x82: "\u201A", // single low-9 quotation mark
  0x83: "\u0192", // latin small letter f with hook
  0x84: "\u201E", // double low-9 quotation mark
  0x85: "\u2026", // horizontal ellipsis
  0x86: "\u2020", // dagger
  0x87: "\u2021", // double dagger
  0x88: "\u02C6", // modifier letter circumflex accent
  0x89: "\u2030", // per mille sign
  0x8a: "\u0160", // latin capital letter S with caron
  0x8b: "\u2039", // single left-pointing angle quotation mark
  0x8c: "\u0152", // latin capital ligature OE
  0x8e: "\u017D", // latin capital letter Z with caron
  0x91: "\u2018", // left single quotation mark
  0x92: "\u2019", // right single quotation mark
  0x93: "\u201C", // left double quotation mark
  0x94: "\u201D", // right double quotation mark
  0x95: "\u2022", // bullet
  0x96: "\u2013", // en dash
  0x97: "\u2014", // em dash
  0x98: "\u02DC", // small tilde
  0x99: "\u2122", // trade mark sign
  0x9a: "\u0161", // latin small letter s with caron
  0x9b: "\u203A", // single right-pointing angle quotation mark
  0x9c: "\u0153", // latin small ligature oe
  0x9e: "\u017E", // latin small letter z with caron
  0x9f: "\u0178" // latin capital letter Y with diaeresis
};
|
|
7855
|
+
/**
 * Maps a user-supplied source encoding name to one of the canonical values
 * "WIN1252", "LATIN1" or "UTF8".
 *
 * Matching ignores case, surrounding whitespace and the separators "-", "_"
 * and spaces, so "windows-1252", "CP_1252", "iso88591" and "utf 8" are all
 * accepted. null/undefined select the default, "WIN1252". Non-string input is
 * stringified first so it is reported through ValidationError instead of
 * failing with a TypeError on `.trim()`.
 *
 * @param {*} value - Requested encoding name, or null/undefined for the default.
 * @returns {"WIN1252"|"LATIN1"|"UTF8"} Canonical encoding name.
 * @throws {ValidationError} When the name matches no supported encoding.
 */
function normalizeSanitizeSourceEncoding(value) {
  const requested = value === null || value === undefined ? "WIN1252" : String(value);
  const lookupKey = requested.trim().toUpperCase().replace(/[\s_-]+/g, "");
  switch (lookupKey) {
    case "WIN1252":
    case "WINDOWS1252":
    case "CP1252":
      return "WIN1252";
    case "LATIN1":
    case "ISO88591":
      return "LATIN1";
    case "UTF8":
      return "UTF8";
    default:
      throw new ValidationError(
        `Unsupported sanitize source encoding: ${value}. Supported values: WIN1252, LATIN1, UTF8.`
      );
  }
}
|
|
7876
|
+
/**
 * Tells whether a control code point is kept by sanitization.
 *
 * @param {number} codePoint - Code point (or byte value) to test.
 * @returns {boolean} True only for tab (9), line feed (10) and carriage return (13).
 */
function isAllowedControlCodePoint(codePoint) {
  switch (codePoint) {
    case 0x09:
    case 0x0a:
    case 0x0d:
      return true;
    default:
      return false;
  }
}
|
|
7879
|
+
/**
 * Tells whether a code point must be stripped from decoded text.
 *
 * Code points accepted by `isAllowedControlCodePoint` are never problematic.
 * Otherwise the following are: 0-31, 127, 128-159 and 65279 (U+FEFF).
 *
 * @param {number} codePoint - Code point to test.
 * @returns {boolean} True when the code point should be removed.
 */
function isProblematicControlCodePoint(codePoint) {
  if (isAllowedControlCodePoint(codePoint)) {
    return false;
  }
  const isC0Control = codePoint >= 0x00 && codePoint <= 0x1f;
  const isDelete = codePoint === 0x7f;
  const isC1Control = codePoint >= 0x80 && codePoint <= 0x9f;
  const isByteOrderMark = codePoint === 0xfeff;
  return isC0Control || isDelete || isC1Control || isByteOrderMark;
}
|
|
7885
|
+
/**
 * Strips unwanted characters from already-decoded text and reports what was
 * removed.
 *
 * Each character is counted in exactly one bucket:
 *  - U+0000 goes to `nulBytesRemoved`;
 *  - U+FFFD (the decoder's replacement for undecodable input) goes to
 *    `invalidBytesRemoved`;
 *  - anything flagged by `isProblematicControlCodePoint` goes to
 *    `controlCharsRemoved`.
 *
 * NUL is counted separately so it is no longer reported both as a NUL byte and
 * as a control character; the single-byte normalizer already counts it once.
 *
 * @param {string} text - Decoded text to clean.
 * @returns {{text: string, nulBytesRemoved: number, invalidBytesRemoved: number, controlCharsRemoved: number}}
 *   The cleaned text and the removal counters.
 */
function sanitizeDecodedText(text) {
  const keptChars = [];
  let nulBytesRemoved = 0;
  let invalidBytesRemoved = 0;
  let controlCharsRemoved = 0;
  // for...of walks code points, so surrogate pairs stay intact.
  for (const char of text) {
    const codePoint = char.codePointAt(0);
    if (codePoint === 0) {
      nulBytesRemoved += 1;
      continue;
    }
    if (codePoint === 0xfffd) {
      invalidBytesRemoved += 1;
      continue;
    }
    if (isProblematicControlCodePoint(codePoint)) {
      controlCharsRemoved += 1;
      continue;
    }
    keptChars.push(char);
  }
  return {
    text: keptChars.join(""),
    nulBytesRemoved,
    invalidBytesRemoved,
    controlCharsRemoved
  };
}
|
|
7907
|
+
/**
 * Streaming converter that turns raw file chunks into sanitized text.
 *
 * For "UTF8" input a StringDecoder carries incomplete multi-byte sequences
 * across chunk boundaries and the decoded text is cleaned by
 * `sanitizeDecodedText`. For any other encoding every byte is handled on its
 * own by `normalizeSingleByteChunk`.
 */
var SanitizeEncodingNormalizer = class {
  sourceEncoding;
  utf8Decoder;

  /**
   * @param {string} sourceEncoding - Canonical encoding name ("WIN1252", "LATIN1" or "UTF8").
   */
  constructor(sourceEncoding) {
    this.sourceEncoding = sourceEncoding;
    this.utf8Decoder = sourceEncoding === "UTF8" ? new StringDecoder("utf8") : void 0;
  }

  /**
   * Converts one chunk.
   *
   * @param {Buffer} chunk - Raw bytes read from the source file.
   * @returns {{text: string, nulBytesRemoved: number, invalidBytesRemoved: number, controlCharsRemoved: number}}
   */
  normalizeChunk(chunk) {
    if (this.sourceEncoding !== "UTF8") {
      return this.normalizeSingleByteChunk(chunk);
    }
    const decoded = this.utf8Decoder.write(chunk);
    const sanitized = sanitizeDecodedText(decoded);
    // Splitting on NUL yields one more piece than there are NUL characters.
    const nulBytesRemoved = decoded.split("\0").length - 1;
    return { ...sanitized, nulBytesRemoved };
  }

  /**
   * Emits whatever the UTF-8 decoder still buffers at end of input. Without a
   * decoder there is nothing pending, so an all-zero result is returned.
   *
   * @returns {{text: string, nulBytesRemoved: number, invalidBytesRemoved: number, controlCharsRemoved: number}}
   */
  flush() {
    if (!this.utf8Decoder) {
      return {
        text: "",
        nulBytesRemoved: 0,
        invalidBytesRemoved: 0,
        controlCharsRemoved: 0
      };
    }
    const decoded = this.utf8Decoder.end();
    const sanitized = sanitizeDecodedText(decoded);
    const nulBytesRemoved = decoded.split("\0").length - 1;
    return { ...sanitized, nulBytesRemoved };
  }

  /**
   * Converts a chunk of single-byte-encoded data, one byte at a time.
   *
   *  - 0 is dropped and counted as a NUL byte;
   *  - other bytes below 32, and 127, are kept only when
   *    `isAllowedControlCodePoint` accepts them, otherwise counted as control
   *    characters;
   *  - 128-159 are translated through WINDOWS_1252_C1_MAP for "WIN1252"
   *    (bytes without a mapping count as invalid) and counted as control
   *    characters for every other encoding;
   *  - every remaining byte maps to the code point with the same value.
   *
   * @param {Buffer} chunk - Raw bytes.
   * @returns {{text: string, nulBytesRemoved: number, invalidBytesRemoved: number, controlCharsRemoved: number}}
   */
  normalizeSingleByteChunk(chunk) {
    const pieces = [];
    let nulBytesRemoved = 0;
    let invalidBytesRemoved = 0;
    let controlCharsRemoved = 0;
    for (const byte of chunk) {
      if (byte === 0) {
        nulBytesRemoved += 1;
      } else if (byte < 32 || byte === 127) {
        if (isAllowedControlCodePoint(byte)) {
          pieces.push(String.fromCharCode(byte));
        } else {
          controlCharsRemoved += 1;
        }
      } else if (byte >= 128 && byte <= 159) {
        if (this.sourceEncoding !== "WIN1252") {
          controlCharsRemoved += 1;
        } else {
          const mapped = WINDOWS_1252_C1_MAP[byte];
          if (mapped === void 0) {
            invalidBytesRemoved += 1;
          } else {
            pieces.push(mapped);
          }
        }
      } else {
        pieces.push(String.fromCharCode(byte));
      }
    }
    return {
      text: pieces.join(""),
      nulBytesRemoved,
      invalidBytesRemoved,
      controlCharsRemoved
    };
  }
};
|
|
7986
|
+
|
|
7761
7987
|
// src/services/sanitize/runner.ts
|
|
7762
7988
|
import { createReadStream as createReadStream2, createWriteStream as createWriteStream2 } from "fs";
|
|
7763
7989
|
import { mkdir as mkdir7 } from "fs/promises";
|
|
7764
7990
|
import path13 from "path";
|
|
7765
|
-
function
|
|
7766
|
-
|
|
7767
|
-
|
|
7768
|
-
if (chunk[index] === 0) {
|
|
7769
|
-
removed += 1;
|
|
7770
|
-
}
|
|
7991
|
+
/**
 * Writes `value` to `output2` as UTF-8 and, when the stream signals
 * backpressure, waits for it to drain before resolving.
 *
 * While waiting, exactly one "drain" and one "error" listener are attached and
 * both are removed as soon as either fires, so repeated calls on a long-lived
 * stream do not accumulate listeners. A stream that is already destroyed is
 * rejected up front because it would never emit "drain" or "error" again.
 *
 * @param {import("stream").Writable} output2 - Destination stream.
 * @param {string} value - Text to write; an empty string is a no-op.
 * @returns {Promise<void>} Resolves once more data may be written.
 * @throws {Error} The stream's error if it fails while draining, or if it was
 *   already destroyed.
 */
async function writeUtf8(output2, value) {
  if (value.length === 0) {
    return;
  }
  if (output2.destroyed) {
    throw output2.errored ?? new Error("Cannot write to a destroyed output stream.");
  }
  if (output2.write(value, "utf8")) {
    return;
  }
  await new Promise((resolve2, reject) => {
    const onDrain = () => {
      output2.off("error", onError);
      resolve2();
    };
    const onError = (error) => {
      output2.off("drain", onDrain);
      reject(error);
    };
    output2.once("drain", onDrain);
    output2.once("error", onError);
  });
}
|
|
8002
|
+
/**
 * Counts the line-feed characters in a string.
 *
 * @param {string} value - Text to scan.
 * @returns {number} Number of "\n" characters found.
 */
function countNewlines(value) {
  let total = 0;
  for (const char of value) {
    if (char === "\n") {
      total += 1;
    }
  }
  return total;
}
|
|
7786
|
-
async function sanitizeDatasetFile(plan, onChunk) {
|
|
8011
|
+
/**
 * Streams one dataset file through the encoding normalizer and writes the
 * cleaned text to `plan.outputPath` as UTF-8.
 *
 * Output-stream failures are captured by a listener that stays attached for
 * the whole run. This avoids two problems of waiting for "finish"/"error"
 * only at the end: an error emitted between writes had no listener (uncaught
 * "error" event), and a stream that had already failed never emitted either
 * event again, leaving the function hanging in `finally`.
 *
 * @param {Object} plan - File plan; `absolutePath`, `outputPath` and `fileSize` are read.
 * @param {Function} [onChunk] - Called after every input chunk with progress counters.
 * @param {Object} [options] - `sourceEncoding` selects the input encoding.
 * @returns {Promise<Object>} Summary with byte counts, removal counters, the
 *   line count, the resolved source encoding and a `changed` flag.
 * @throws {Error} Any read, write or validation error.
 */
async function sanitizeDatasetFile(plan, onChunk, options = {}) {
  await mkdir7(path13.dirname(plan.outputPath), { recursive: true });
  const sourceEncoding = normalizeSanitizeSourceEncoding(
    options.sourceEncoding
  );
  const normalizer = new SanitizeEncodingNormalizer(sourceEncoding);
  const input2 = createReadStream2(plan.absolutePath);
  const output2 = createWriteStream2(plan.outputPath, { encoding: "utf8" });
  // Remember the first output failure so it can be rethrown at a safe point.
  let outputError = null;
  output2.on("error", (error) => {
    if (outputError === null) {
      outputError = error;
    }
  });
  let totalBytesRead = 0;
  let totalBytesWritten = 0;
  let nulBytesRemoved = 0;
  let invalidBytesRemoved = 0;
  let controlCharsRemoved = 0;
  let lineCount = 0;
  let sawAnyCharacter = false;
  let lastCharacterWasNewline = false;
  const processText = async (text) => {
    if (text.length === 0) {
      return;
    }
    if (outputError !== null) {
      // The stream already failed; writing again would wait forever.
      throw outputError;
    }
    sawAnyCharacter = true;
    lineCount += countNewlines(text);
    lastCharacterWasNewline = text.endsWith("\n");
    totalBytesWritten += Buffer.byteLength(text, "utf8");
    await writeUtf8(output2, text);
  };
  try {
    for await (const chunk of input2) {
      const chunkBuffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
      totalBytesRead += chunkBuffer.length;
      const normalized = normalizer.normalizeChunk(chunkBuffer);
      nulBytesRemoved += normalized.nulBytesRemoved;
      invalidBytesRemoved += normalized.invalidBytesRemoved;
      controlCharsRemoved += normalized.controlCharsRemoved;
      await processText(normalized.text);
      onChunk?.({
        bytesProcessed: chunkBuffer.length,
        fileBytesProcessed: totalBytesRead,
        currentFileSize: plan.fileSize,
        processedRows: lineCount,
        nulBytesRemoved,
        invalidBytesRemoved,
        controlCharsRemoved
      });
    }
    const flushed = normalizer.flush();
    nulBytesRemoved += flushed.nulBytesRemoved;
    invalidBytesRemoved += flushed.invalidBytesRemoved;
    controlCharsRemoved += flushed.controlCharsRemoved;
    await processText(flushed.text);
    // A final line without a trailing newline still counts as a line.
    if (sawAnyCharacter && !lastCharacterWasNewline) {
      lineCount += 1;
    }
  } finally {
    input2.close();
    if (outputError === null && !output2.destroyed) {
      await new Promise((resolve2, reject) => {
        output2.once("finish", () => resolve2());
        output2.once("error", (error) => reject(error));
        output2.end();
      });
    } else {
      // Nothing left to flush on a failed stream; just release the handle.
      output2.destroy();
    }
  }
  if (outputError !== null) {
    // The output failed without interrupting the loop (e.g. after the last write).
    throw outputError;
  }
  return {
    plan,
    totalBytesRead,
    totalBytesWritten,
    sourceEncoding,
    nulBytesRemoved,
    invalidBytesRemoved,
    controlCharsRemoved,
    lineCount,
    changed: nulBytesRemoved > 0 || invalidBytesRemoved > 0 || controlCharsRemoved > 0 || totalBytesRead !== totalBytesWritten
  };
}
|
|
7838
8084
|
|
|
@@ -7895,40 +8141,54 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
|
|
|
7895
8141
|
"No recognized validated dataset files were found for sanitization."
|
|
7896
8142
|
);
|
|
7897
8143
|
}
|
|
8144
|
+
const sourceEncoding = normalizeSanitizeSourceEncoding(
|
|
8145
|
+
options.sourceEncoding
|
|
8146
|
+
);
|
|
7898
8147
|
options.onProgress?.({
|
|
7899
8148
|
kind: "start",
|
|
7900
8149
|
validatedPath,
|
|
7901
8150
|
outputPath,
|
|
7902
8151
|
totalFiles: plan.totalFiles,
|
|
7903
8152
|
totalBytes: plan.totalBytes,
|
|
7904
|
-
datasets: plan.datasets
|
|
8153
|
+
datasets: plan.datasets,
|
|
8154
|
+
sourceEncoding
|
|
7905
8155
|
});
|
|
7906
8156
|
let processedFiles = 0;
|
|
7907
8157
|
let processedRows = 0;
|
|
7908
8158
|
let processedBytes = 0;
|
|
7909
8159
|
let nulBytesRemoved = 0;
|
|
8160
|
+
let invalidBytesRemoved = 0;
|
|
8161
|
+
let controlCharsRemoved = 0;
|
|
7910
8162
|
let changedFiles = 0;
|
|
7911
8163
|
const fileSummaries = [];
|
|
7912
8164
|
for (const [index, filePlan] of plan.files.entries()) {
|
|
7913
|
-
const fileResult = await sanitizeDatasetFile(
|
|
7914
|
-
|
|
7915
|
-
|
|
7916
|
-
|
|
7917
|
-
|
|
7918
|
-
|
|
7919
|
-
|
|
7920
|
-
|
|
7921
|
-
|
|
7922
|
-
|
|
7923
|
-
|
|
7924
|
-
|
|
7925
|
-
|
|
7926
|
-
|
|
7927
|
-
|
|
8165
|
+
const fileResult = await sanitizeDatasetFile(
|
|
8166
|
+
filePlan,
|
|
8167
|
+
(chunk) => {
|
|
8168
|
+
options.onProgress?.({
|
|
8169
|
+
kind: "progress",
|
|
8170
|
+
currentFileDisplayPath: filePlan.displayPath,
|
|
8171
|
+
fileIndex: index + 1,
|
|
8172
|
+
totalFiles: plan.totalFiles,
|
|
8173
|
+
bytesProcessed: processedBytes + chunk.fileBytesProcessed,
|
|
8174
|
+
totalBytes: plan.totalBytes,
|
|
8175
|
+
fileBytesProcessed: chunk.fileBytesProcessed,
|
|
8176
|
+
currentFileSize: chunk.currentFileSize,
|
|
8177
|
+
processedRows: processedRows + chunk.processedRows,
|
|
8178
|
+
nulBytesRemoved: nulBytesRemoved + chunk.nulBytesRemoved,
|
|
8179
|
+
invalidBytesRemoved: invalidBytesRemoved + chunk.invalidBytesRemoved,
|
|
8180
|
+
controlCharsRemoved: controlCharsRemoved + chunk.controlCharsRemoved,
|
|
8181
|
+
changedFiles
|
|
8182
|
+
});
|
|
8183
|
+
},
|
|
8184
|
+
{ sourceEncoding }
|
|
8185
|
+
);
|
|
7928
8186
|
processedFiles += 1;
|
|
7929
8187
|
processedRows += fileResult.lineCount;
|
|
7930
8188
|
processedBytes += fileResult.totalBytesRead;
|
|
7931
8189
|
nulBytesRemoved += fileResult.nulBytesRemoved;
|
|
8190
|
+
invalidBytesRemoved += fileResult.invalidBytesRemoved;
|
|
8191
|
+
controlCharsRemoved += fileResult.controlCharsRemoved;
|
|
7932
8192
|
changedFiles += fileResult.changed ? 1 : 0;
|
|
7933
8193
|
fileSummaries.push({
|
|
7934
8194
|
dataset: filePlan.dataset,
|
|
@@ -7936,7 +8196,9 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
|
|
|
7936
8196
|
outputPath: filePlan.outputPath,
|
|
7937
8197
|
lineCount: fileResult.lineCount,
|
|
7938
8198
|
changed: fileResult.changed,
|
|
7939
|
-
nulBytesRemoved: fileResult.nulBytesRemoved
|
|
8199
|
+
nulBytesRemoved: fileResult.nulBytesRemoved,
|
|
8200
|
+
invalidBytesRemoved: fileResult.invalidBytesRemoved,
|
|
8201
|
+
controlCharsRemoved: fileResult.controlCharsRemoved
|
|
7940
8202
|
});
|
|
7941
8203
|
}
|
|
7942
8204
|
options.onProgress?.({
|
|
@@ -7944,6 +8206,8 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
|
|
|
7944
8206
|
totalFiles: plan.totalFiles,
|
|
7945
8207
|
processedRows,
|
|
7946
8208
|
nulBytesRemoved,
|
|
8209
|
+
invalidBytesRemoved,
|
|
8210
|
+
controlCharsRemoved,
|
|
7947
8211
|
changedFiles,
|
|
7948
8212
|
totalBytes: plan.totalBytes
|
|
7949
8213
|
});
|
|
@@ -7955,13 +8219,17 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
|
|
|
7955
8219
|
totalBytes: plan.totalBytes,
|
|
7956
8220
|
processedFiles,
|
|
7957
8221
|
processedRows,
|
|
8222
|
+
sourceEncoding,
|
|
7958
8223
|
nulBytesRemoved,
|
|
8224
|
+
invalidBytesRemoved,
|
|
8225
|
+
controlCharsRemoved,
|
|
7959
8226
|
changedFiles,
|
|
7960
8227
|
unchangedFiles: plan.totalFiles - changedFiles,
|
|
7961
8228
|
datasets: plan.datasets,
|
|
7962
8229
|
files: fileSummaries,
|
|
7963
8230
|
warnings: [
|
|
7964
|
-
"Sanitization
|
|
8231
|
+
"Sanitization now writes UTF-8 output and removes invalid bytes plus problematic control characters before PostgreSQL loading begins.",
|
|
8232
|
+
"The PostgreSQL direct import path can use --source-encoding UTF8 when reading files generated by this sanitization command.",
|
|
7965
8233
|
"The import command still keeps quarantine and row-level recovery for unexpected issues, but sanitizing first reduces the amount of slow fallback work during import."
|
|
7966
8234
|
],
|
|
7967
8235
|
nextStep: inferNextStep3(outputPath)
|
|
@@ -8065,117 +8333,963 @@ async function syncFederalRevenueDataset(options = {}) {
|
|
|
8065
8333
|
);
|
|
8066
8334
|
}
|
|
8067
8335
|
|
|
8068
|
-
// src/
|
|
8069
|
-
|
|
8070
|
-
|
|
8071
|
-
|
|
8072
|
-
return value;
|
|
8073
|
-
}
|
|
8074
|
-
return `\x1B[${code}m${value}\x1B[0m`;
|
|
8075
|
-
}
|
|
8076
|
-
var theme = {
|
|
8077
|
-
muted: (value) => paint(90, value),
|
|
8078
|
-
red: (value) => paint(31, value),
|
|
8079
|
-
green: (value) => paint(32, value),
|
|
8080
|
-
yellow: (value) => paint(33, value),
|
|
8081
|
-
blue: (value) => paint(34, value),
|
|
8082
|
-
command: (value) => paint(36, value),
|
|
8083
|
-
flag: (value) => paint(33, value),
|
|
8084
|
-
section: (value) => paint(36, paint(1, value)),
|
|
8085
|
-
successLabel: (value) => paint(32, paint(1, value)),
|
|
8086
|
-
warningLabel: (value) => paint(33, paint(1, value)),
|
|
8087
|
-
infoLabel: (value) => paint(36, paint(1, value)),
|
|
8088
|
-
errorLabel: (value) => paint(31, paint(1, value))
|
|
8089
|
-
};
|
|
8090
|
-
|
|
8091
|
-
// src/cli/ui/output/program-ui.ts
|
|
8092
|
-
function configureProgramUi(program) {
|
|
8093
|
-
program.configureOutput({
|
|
8094
|
-
writeErr: (str) => process.stderr.write(str),
|
|
8095
|
-
outputError: (str, write) => {
|
|
8096
|
-
write(theme.red(str));
|
|
8097
|
-
}
|
|
8098
|
-
});
|
|
8099
|
-
program.configureHelp({
|
|
8100
|
-
subcommandTerm: (cmd) => theme.command(cmd.name()),
|
|
8101
|
-
optionTerm: (option) => theme.flag(option.flags)
|
|
8102
|
-
});
|
|
8103
|
-
}
|
|
8336
|
+
// src/services/postgres-direct/exporter.ts
|
|
8337
|
+
import { createWriteStream as createWriteStream3 } from "fs";
|
|
8338
|
+
import { mkdir as mkdir8, writeFile as writeFile5 } from "fs/promises";
|
|
8339
|
+
import path16 from "path";
|
|
8104
8340
|
|
|
8105
|
-
// src/
|
|
8106
|
-
function
|
|
8107
|
-
if (
|
|
8108
|
-
|
|
8109
|
-
process.exit(1);
|
|
8341
|
+
// src/services/postgres-direct/csv.ts
|
|
8342
|
+
/**
 * Renders one value as a CSV field.
 *
 * null/undefined become an empty field, Date instances are written as ISO
 * strings and everything else is stringified. Fields containing a double
 * quote, comma, carriage return or line feed are wrapped in double quotes with
 * embedded quotes doubled.
 *
 * @param {*} value - Value to render.
 * @returns {string} CSV-ready field text.
 */
function formatCsvValue(value) {
  if (value == null) {
    return "";
  }
  const text = value instanceof Date ? value.toISOString() : String(value);
  const needsQuoting = text.includes('"') || text.includes(",") || text.includes("\r") || text.includes("\n");
  if (needsQuoting) {
    return `"${text.split('"').join('""')}"`;
  }
  return text;
}
|
|
8356
|
+
/**
 * Renders a list of values as one comma-separated CSV line (no line ending).
 *
 * @param {Array} values - Row values, in column order.
 * @returns {string} The fields produced by `formatCsvValue`, joined with ",".
 */
function formatCsvRow(values) {
  const fields = [];
  for (const value of values) {
    fields.push(formatCsvValue(value));
  }
  return fields.join(",");
}
|
|
8115
8359
|
|
|
8116
|
-
// src/
|
|
8360
|
+
// src/services/postgres-direct/script.ts
|
|
8117
8361
|
import path15 from "path";
|
|
8118
|
-
|
|
8119
|
-
|
|
8362
|
+
// Datasets that are loaded through a staging table.
var STAGING_DATASETS = ["companies", "establishments", "partners", "simples_options"];

// Lookup datasets.
var DOMAIN_DATASETS = [
  "partner_qualifications",
  "legal_natures",
  "countries",
  "cities",
  "reasons",
  "cnaes"
];

// Staging table per staging dataset: the dataset name prefixed with "staging_".
var STAGING_TABLE_BY_DATASET3 = Object.fromEntries(
  STAGING_DATASETS.map((dataset) => [dataset, `staging_${dataset}`])
);
|
|
8382
|
+
/**
 * Wraps a string in single quotes, doubling any embedded single quote.
 *
 * @param {string} value - Text to quote.
 * @returns {string} Single-quoted literal.
 */
function quoteSqlLiteral(value) {
  const escaped = value.split("'").join("''");
  return "'" + escaped + "'";
}
|
|
8121
|
-
function
|
|
8122
|
-
return
|
|
8385
|
+
/**
 * Wraps a string in double quotes, doubling any embedded double quote.
 *
 * @param {string} value - Identifier text to quote.
 * @returns {string} Double-quoted identifier.
 */
function quoteIdentifier(value) {
  const escaped = value.split('"').join('""');
  return '"' + escaped + '"';
}
|
|
8124
|
-
function
|
|
8125
|
-
|
|
8126
|
-
return;
|
|
8127
|
-
}
|
|
8128
|
-
console.log(theme.infoLabel("NOTES"));
|
|
8129
|
-
for (const note of notes) {
|
|
8130
|
-
console.log(` ${theme.blue("\u2022")} ${note}`);
|
|
8131
|
-
}
|
|
8388
|
+
/**
 * Resolves `filePath` to an absolute path with "/" as the separator.
 *
 * Backslashes are rewritten only on platforms where they are the path
 * separator. Elsewhere a backslash is an ordinary filename character, and
 * rewriting it would point at a different file.
 *
 * @param {string} filePath - Absolute or relative path.
 * @returns {string} Absolute path using "/" as the directory separator.
 */
function normalizePathForPsql(filePath) {
  const absolutePath = path15.resolve(filePath);
  if (path15.sep !== "\\") {
    return absolutePath;
  }
  return absolutePath.replace(/\\/g, "/");
}
|
|
8133
|
-
function
|
|
8134
|
-
|
|
8135
|
-
|
|
8136
|
-
|
|
8137
|
-
|
|
8138
|
-
|
|
8139
|
-
|
|
8391
|
+
/**
 * Builds a `\copy` line that loads a comma-delimited CSV file with a header
 * row, treating empty fields as NULL.
 *
 * @param {string} tableName - Target table, inserted verbatim.
 * @param {string[]} columns - Target columns, inserted verbatim and joined with ", ".
 * @param {string} filePath - Source file; normalized and quoted before use.
 * @returns {string} The complete `\copy` command.
 */
function csvCopyCommand(tableName, columns, filePath) {
  const quotedSource = quoteSqlLiteral(normalizePathForPsql(filePath));
  const columnList = columns.join(", ");
  const copyOptions = [
    "format csv",
    "header true",
    "delimiter ','",
    `quote '"'`,
    `escape '"'`,
    "null ''"
  ].join(", ");
  return `\\copy ${tableName} (${columnList}) from ${quotedSource} with (${copyOptions})`;
}
|
|
8395
|
+
/**
 * Builds a `\copy` line that loads a semicolon-delimited CSV file without a
 * header row.
 *
 * @param {string} tableName - Target table, inserted verbatim.
 * @param {string[]} columns - Target columns, inserted verbatim and joined with ", ".
 * @param {string} filePath - Source file; normalized and quoted before use.
 * @returns {string} The complete `\copy` command.
 */
function receitaCopyCommand(tableName, columns, filePath) {
  const quotedSource = quoteSqlLiteral(normalizePathForPsql(filePath));
  const columnList = columns.join(", ");
  const copyOptions = [
    "format csv",
    "header false",
    "delimiter ';'",
    `quote '"'`,
    `escape '"'`
  ].join(", ");
  return `\\copy ${tableName} (${columnList}) from ${quotedSource} with (${copyOptions})`;
}
|
|
8399
|
+
/**
 * Lists the column names of a dataset, in layout order.
 *
 * @param {string} dataset - Key into `DATASET_LAYOUTS`.
 * @returns {string[]} The `columnName` of every field in that layout.
 */
function datasetColumns(dataset) {
  const { fields } = DATASET_LAYOUTS[dataset];
  const columnNames = [];
  for (const field of fields) {
    columnNames.push(field.columnName);
  }
  return columnNames;
}
|
|
8402
|
+
/**
 * Builds the SET list of an `on conflict ... do update` clause.
 *
 * Every column not listed in `excludedColumns` is assigned from `excluded`,
 * and `updated_at = now()` is always appended last.
 *
 * @param {string[]} columns - Candidate columns, in output order.
 * @param {string[]} excludedColumns - Columns that must not be assigned.
 * @returns {string} Assignments joined with ",\n ".
 */
function updateAssignments(columns, excludedColumns) {
  const assignments = [];
  for (const column of columns) {
    if (excludedColumns.includes(column)) {
      continue;
    }
    assignments.push(`${column} = excluded.${column}`);
  }
  assignments.push("updated_at = now()");
  return assignments.join(",\n ");
}
|
|
8405
|
+
/**
 * Builds the SQL expression that computes a partner dedupe key.
 *
 * The expression is `md5(...)` over eleven partner fields, each wrapped in
 * `coalesce(..., '')` and separated by '|'. `entry_date` enters as the text
 * form of its difference from date '2000-01-01'.
 *
 * @param {string} alias - Table alias used to qualify each column.
 * @returns {string} Multi-line SQL expression.
 */
function partnerDedupeExpression(alias) {
  const keyParts = [
    `${alias}.cnpj_root`,
    `${alias}.partner_type_code`,
    `${alias}.partner_name`,
    `${alias}.partner_document`,
    `${alias}.partner_qualification_code`,
    `(${alias}.entry_date - date '2000-01-01')::text`,
    `${alias}.country_code`,
    `${alias}.legal_representative_document`,
    `${alias}.legal_representative_name`,
    `${alias}.legal_representative_qualification_code`,
    `${alias}.age_group_code`
  ];
  // Every part but the last is followed by the '|' concatenation.
  const concatenation = keyParts
    .map((part) => ` coalesce(${part}, '')`)
    .join(" || '|' ||\n");
  return `md5(\n${concatenation}\n)`;
}
|
|
8422
|
+
/**
 * Builds the script fragment that moves rows from `staging_companies` into
 * `companies`.
 *
 * Within the staging data only the row with the highest `staging_id` per
 * `cnpj_root` is kept; the result is upserted on `cnpj_root`.
 *
 * @returns {string} An echo line followed by one SQL statement.
 */
function materializeCompaniesSql() {
  const columns = companiesLayout.fields.map((field) => field.columnName);
  const columnList = columns.join(", ");
  const qualifiedColumns = columns.map((column) => `source.${column}`).join(",\n ");

  const lines = [];
  lines.push("\\echo 'Materializing companies...'");
  lines.push("with source as (");
  lines.push(" select");
  lines.push(` ${qualifiedColumns},`);
  lines.push(" row_number() over (partition by source.cnpj_root order by source.staging_id desc) as dedupe_rank");
  lines.push(" from staging_companies source");
  lines.push("),");
  lines.push("deduped as (");
  lines.push(" select * from source where dedupe_rank = 1");
  lines.push(")");
  lines.push(`insert into companies (${columnList})`);
  lines.push(`select ${columnList}`);
  lines.push("from deduped");
  lines.push("on conflict (cnpj_root) do update set");
  lines.push(` ${updateAssignments(columns, ["cnpj_root"])};`);
  return lines.join("\n");
}
|
|
8442
|
+
/**
 * Builds the script fragment that moves rows from `staging_establishments`
 * into `establishments` and refreshes `establishment_secondary_cnaes`.
 *
 * Staging rows are deduplicated on the concatenated CNPJ (highest
 * `staging_id` wins) and upserted on `cnpj_full`. Secondary CNAEs are taken
 * from the comma-separated `secondary_cnaes_raw` column.
 *
 * The DELETE removes only secondary CNAEs that are no longer listed for an
 * imported establishment, and the INSERT adds the missing ones. The two never
 * touch the same row, which matters because PostgreSQL runs data-modifying
 * CTEs and the main statement on one snapshot in an unspecified order. A
 * delete-everything-then-reinsert scheme inside a single statement can
 * therefore skip the re-insert on conflict and still delete the row.
 *
 * @returns {string} An echo line followed by one SQL statement.
 */
function materializeEstablishmentsSql() {
  const baseColumns = establishmentsLayout.fields.map(
    (field) => field.columnName
  );
  const insertColumns = [...baseColumns, "cnpj_full"];
  return [
    "\\echo 'Materializing establishments and secondary CNAEs...'",
    "with source as (",
    " select",
    ` ${baseColumns.map((column) => `source.${column}`).join(",\n ")},`,
    " source.cnpj_root || source.cnpj_order || source.cnpj_check_digits as cnpj_full,",
    " row_number() over (partition by source.cnpj_root || source.cnpj_order || source.cnpj_check_digits order by source.staging_id desc) as dedupe_rank",
    " from staging_establishments source",
    "),",
    "deduped as (",
    " select * from source where dedupe_rank = 1",
    "),",
    "upserted as (",
    ` insert into establishments (${insertColumns.join(", ")})`,
    ` select ${insertColumns.join(", ")}`,
    " from deduped",
    " on conflict (cnpj_full) do update set",
    ` ${updateAssignments(insertColumns, ["cnpj_root", "cnpj_order", "cnpj_check_digits", "cnpj_full"])}`,
    " returning cnpj_full",
    "),",
    // Declared before the DELETE because a CTE can only reference earlier ones.
    "secondary_cnaes_source as (",
    " select distinct",
    " deduped.cnpj_full,",
    " btrim(cnae_code) as cnae_code",
    " from deduped",
    " cross join lateral unnest(string_to_array(deduped.secondary_cnaes_raw, ',')) as cnae_code",
    " where deduped.secondary_cnaes_raw is not null",
    " and deduped.secondary_cnaes_raw <> ''",
    " and btrim(cnae_code) <> ''",
    "),",
    "deleted_secondary_cnaes as (",
    " delete from establishment_secondary_cnaes target",
    " using (select cnpj_full from deduped) source_keys",
    " where target.cnpj_full = source_keys.cnpj_full",
    // Keep rows that are still wanted so the INSERT below never has to re-create them.
    " and not exists (",
    " select 1",
    " from secondary_cnaes_source wanted",
    " where wanted.cnpj_full = target.cnpj_full",
    " and wanted.cnae_code = target.cnae_code",
    " )",
    " returning 1",
    ")",
    "insert into establishment_secondary_cnaes (cnpj_full, cnae_code)",
    "select cnpj_full, cnae_code",
    "from secondary_cnaes_source",
    "on conflict (cnpj_full, cnae_code) do nothing;"
  ].join("\n");
}
|
|
8489
|
+
/**
 * Builds the script fragment that moves rows from `staging_partners` into
 * `partners`.
 *
 * Each staging row gets a `partner_dedupe_key` from
 * `partnerDedupeExpression`; one row per key is kept and upserted on that key.
 *
 * @returns {string} An echo line followed by one SQL statement.
 */
function materializePartnersSql() {
  const baseColumns = partnersLayout.fields.map((field) => field.columnName);
  const insertColumns = baseColumns.concat("partner_dedupe_key");
  const insertColumnList = insertColumns.join(", ");
  const qualifiedColumns = baseColumns.map((column) => `source.${column}`).join(",\n ");

  const lines = [];
  lines.push("\\echo 'Materializing partners...'");
  lines.push("with source as (");
  lines.push(" select");
  lines.push(` ${qualifiedColumns},`);
  lines.push(` ${partnerDedupeExpression("source")} as partner_dedupe_key`);
  lines.push(" from staging_partners source");
  lines.push("),");
  lines.push("ranked as (");
  lines.push(" select");
  lines.push(" source.*,");
  lines.push(" row_number() over (partition by source.partner_dedupe_key order by source.cnpj_root asc) as dedupe_rank");
  lines.push(" from source");
  lines.push("),");
  lines.push("deduped as (");
  lines.push(" select * from ranked where dedupe_rank = 1");
  lines.push(")");
  lines.push(`insert into partners (${insertColumnList})`);
  lines.push(`select ${insertColumnList}`);
  lines.push("from deduped");
  lines.push("on conflict (partner_dedupe_key) do update set");
  lines.push(` ${updateAssignments(insertColumns, ["partner_dedupe_key"])};`);
  return lines.join("\n");
}
|
|
8516
|
+
/**
 * Builds the script fragment that moves rows from `staging_simples_options`
 * into `simples_options`.
 *
 * Within the staging data only the row with the highest `staging_id` per
 * `cnpj_root` is kept; the result is upserted on `cnpj_root`.
 *
 * @returns {string} An echo line followed by one SQL statement.
 */
function materializeSimplesSql() {
  const columns = simplesLayout.fields.map((field) => field.columnName);
  const columnList = columns.join(", ");
  const qualifiedColumns = columns.map((column) => `source.${column}`).join(",\n ");

  const lines = [];
  lines.push("\\echo 'Materializing simples options...'");
  lines.push("with source as (");
  lines.push(" select");
  lines.push(` ${qualifiedColumns},`);
  lines.push(" row_number() over (partition by source.cnpj_root order by source.staging_id desc) as dedupe_rank");
  lines.push(" from staging_simples_options source");
  lines.push("),");
  lines.push("deduped as (");
  lines.push(" select * from source where dedupe_rank = 1");
  lines.push(")");
  lines.push(`insert into simples_options (${columnList})`);
  lines.push(`select ${columnList}`);
  lines.push("from deduped");
  lines.push("on conflict (cnpj_root) do update set");
  lines.push(` ${updateAssignments(columns, ["cnpj_root"])};`);
  return lines.join("\n");
}
|
|
8536
|
+
/**
 * Builds the statements that load exported CSV files of a lookup dataset.
 *
 * The files are copied into a temporary `(code, description)` table and then
 * upserted into the dataset table, one row per non-empty code.
 *
 * @param {string} dataset Lookup dataset name; also used as the target table name.
 * @param {Array<{absolutePath: string}>} files Exported CSV files of that dataset.
 * @returns {string[]} Script lines, or an empty array when there are no files.
 */
function copyDomainSql(dataset, files) {
  if (files.length === 0) {
    return [];
  }
  const columns = datasetColumns(dataset);
  const scratchTable = `tmp_hybrid_${dataset}`;
  const copyCommands = files.map(
    (file) => csvCopyCommand(scratchTable, columns, file.absolutePath)
  );
  const columnList = columns.join(", ");
  return [
    `\\echo 'Loading ${dataset} lookup data...'`,
    `drop table if exists ${scratchTable};`,
    `create temporary table ${scratchTable} (code text, description text);`,
    ...copyCommands,
    `insert into ${dataset} (${columnList})`,
    `select distinct on (code) ${columnList}`,
    `from ${scratchTable}`,
    "where code is not null and code <> ''",
    "order by code",
    "on conflict (code) do update set description = excluded.description;"
  ];
}
|
|
8142
|
-
function
|
|
8143
|
-
if (
|
|
8144
|
-
return;
|
|
8560
|
+
/**
 * Builds the copy commands that load exported CSV files into the staging
 * table mapped to `dataset`.
 *
 * @param {string} dataset Dataset name looked up in `STAGING_TABLE_BY_DATASET3`.
 * @param {Array<{absolutePath: string}>} files Exported CSV files of that dataset.
 * @returns {string[]} Script lines; empty when there are no files or the
 *   dataset has no staging table.
 */
function copyStagingSql(dataset, files) {
  const stagingTable = files.length === 0 ? void 0 : STAGING_TABLE_BY_DATASET3[dataset];
  if (!stagingTable) {
    return [];
  }
  const columns = datasetColumns(dataset);
  const statements = [`\\echo 'Loading ${dataset} staging data...'`];
  for (const file of files) {
    statements.push(csvCopyCommand(stagingTable, columns, file.absolutePath));
  }
  return statements;
}
|
|
8151
|
-
function
|
|
8152
|
-
|
|
8153
|
-
|
|
8576
|
+
/**
 * Groups exported CSV file descriptors by their `dataset` property.
 *
 * @param {Array<{dataset: string}>} files File descriptors to group.
 * @returns {Object<string, Array>} Plain object mapping each dataset to its
 *   files, in input order.
 */
function csvFilesByDataset(files) {
  return files.reduce((buckets, file) => {
    const bucket = buckets[file.dataset] ?? [];
    bucket.push(file);
    buckets[file.dataset] = bucket;
    return buckets;
  }, {});
}
|
|
8585
|
+
/**
 * Groups sanitized source file descriptors by their `dataset` property.
 *
 * @param {Array<{dataset: string}>} files File descriptors to group.
 * @returns {Object<string, Array>} Plain object mapping each dataset to its
 *   files, in input order.
 */
function directFilesByDataset(files) {
  return files.reduce((buckets, file) => {
    const bucket = buckets[file.dataset] ?? [];
    bucket.push(file);
    buckets[file.dataset] = bucket;
    return buckets;
  }, {});
}
|
|
8164
|
-
function
|
|
8165
|
-
return
|
|
8594
|
+
/**
 * Returns the name of the temporary raw table used for `dataset`.
 *
 * @param {string} dataset Dataset name.
 * @returns {string} `tmp_hybrid_raw_` followed by the dataset name.
 */
function rawTableName(dataset) {
  const prefix = "tmp_hybrid_raw_";
  return `${prefix}${dataset}`;
}
|
|
8167
|
-
function
|
|
8168
|
-
|
|
8169
|
-
|
|
8170
|
-
|
|
8171
|
-
|
|
8597
|
+
/**
 * Builds the SQL that (re)creates the temporary raw table for `dataset`,
 * with one `text` column per field of the dataset layout.
 *
 * @param {string} dataset Dataset name, a key of `DATASET_LAYOUTS`.
 * @returns {string} `drop table if exists` followed by `create temporary table`.
 */
function createRawTempTableSql(dataset) {
  const columnDefinitions = [];
  for (const field of DATASET_LAYOUTS[dataset].fields) {
    columnDefinitions.push(` ${quoteIdentifier(field.columnName)} text`);
  }
  const tableName = rawTableName(dataset);
  const statements = [
    `drop table if exists ${tableName};`,
    `create temporary table ${tableName} (`,
    columnDefinitions.join(",\n"),
    ");"
  ];
  return statements.join("\n");
}
|
|
8173
|
-
function
|
|
8174
|
-
|
|
8175
|
-
|
|
8606
|
+
/**
 * SQL expression that trims a raw text column and turns an empty result
 * into NULL.
 *
 * @param {string} alias Table alias that qualifies the column.
 * @param {string} column Column name (quoted via `quoteIdentifier`).
 * @returns {string} SQL expression.
 */
function textExpression(alias, column) {
  const trimmed = `btrim(${alias}.${quoteIdentifier(column)})`;
  return `nullif(${trimmed}, '')`;
}
|
|
8609
|
+
/**
 * SQL `case` expression that converts a raw `YYYYMMDD` text column to a date.
 *
 * Empty values, `00000000` and anything that is not exactly eight digits
 * yield NULL.
 *
 * @param {string} alias Table alias that qualifies the column.
 * @param {string} column Column name (quoted via `quoteIdentifier`).
 * @returns {string} Single-line SQL expression.
 */
function dateExpression(alias, column) {
  const trimmed = `btrim(${alias}.${quoteIdentifier(column)})`;
  const branches = [
    `when ${trimmed} = '' or ${trimmed} = '00000000' then null`,
    `when ${trimmed} ~ '^\\d{8}$' then to_date(${trimmed}, 'YYYYMMDD')`,
    "else null"
  ];
  return ["case", ...branches.map((branch) => ` ${branch}`), "end"].join(" ");
}
|
|
8619
|
+
/**
 * SQL `case` expression that converts a raw text column to `numeric`.
 *
 * Empty values yield NULL. When the value has both `,` and `.`, dots are
 * removed and the comma becomes the decimal point; when it has only `,`, the
 * comma becomes the decimal point; otherwise the value is cast as is.
 *
 * @param {string} alias Table alias that qualifies the column.
 * @param {string} column Column name (quoted via `quoteIdentifier`).
 * @returns {string} Single-line SQL expression.
 */
function numericExpression(alias, column) {
  const trimmed = `btrim(${alias}.${quoteIdentifier(column)})`;
  const branches = [
    `when ${trimmed} = '' then null`,
    `when ${trimmed} like '%,%' and ${trimmed} like '%.%' then replace(replace(${trimmed}, '.', ''), ',', '.')::numeric`,
    `when ${trimmed} like '%,%' then replace(${trimmed}, ',', '.')::numeric`,
    `else ${trimmed}::numeric`
  ];
  return ["case", ...branches.map((branch) => ` ${branch}`), "end"].join(" ");
}
|
|
8630
|
+
/**
 * SQL `case` expression that converts a raw text column to `integer`.
 *
 * Empty values and values that are not an optionally signed run of digits
 * yield NULL.
 *
 * @param {string} alias Table alias that qualifies the column.
 * @param {string} column Column name (quoted via `quoteIdentifier`).
 * @returns {string} Single-line SQL expression.
 */
function integerExpression(alias, column) {
  const trimmed = `btrim(${alias}.${quoteIdentifier(column)})`;
  const branches = [
    `when ${trimmed} = '' then null`,
    `when ${trimmed} ~ '^-?\\d+$' then ${trimmed}::integer`,
    "else null"
  ];
  return ["case", ...branches.map((branch) => ` ${branch}`), "end"].join(" ");
}
|
|
8640
|
+
/**
 * SQL `case` expression that converts a raw text column to a boolean.
 *
 * The lower-cased, trimmed value maps to true for `1/true/t/y/yes/s`, to
 * false for `0/false/f/n/no`, and to NULL otherwise.
 *
 * @param {string} alias Table alias that qualifies the column.
 * @param {string} column Column name (quoted via `quoteIdentifier`).
 * @returns {string} Single-line SQL expression.
 */
function booleanExpression(alias, column) {
  const normalized = `lower(btrim(${alias}.${quoteIdentifier(column)}))`;
  const branches = [
    `when ${normalized} in ('1', 'true', 't', 'y', 'yes', 's') then true`,
    `when ${normalized} in ('0', 'false', 'f', 'n', 'no') then false`,
    "else null"
  ];
  return ["case", ...branches.map((branch) => ` ${branch}`), "end"].join(" ");
}
|
|
8650
|
+
/**
 * Chooses the SQL expression that converts one raw column into its staged
 * value.
 *
 * Three code columns get a fallback literal when blank; every other field is
 * converted according to `field.dataType`, defaulting to the text expression.
 *
 * @param {string} dataset Dataset the field belongs to.
 * @param {{columnName: string, dataType: string}} field Layout field.
 * @param {string} alias Table alias that qualifies the column.
 * @returns {string} SQL expression.
 */
function fieldExpression(dataset, field, alias) {
  const column = field.columnName;

  // Blank codes in these columns are replaced by a fixed literal.
  let fallbackCode;
  if (dataset === "companies") {
    if (column === "company_size_code") {
      fallbackCode = "00";
    }
  } else if (dataset === "establishments") {
    if (column === "branch_type_code") {
      fallbackCode = "1";
    } else if (column === "registration_status_code") {
      fallbackCode = "01";
    }
  }
  if (fallbackCode !== void 0) {
    return `coalesce(${textExpression(alias, column)}, '${fallbackCode}')`;
  }

  const typedBuilders = /* @__PURE__ */ new Map([
    ["date", dateExpression],
    ["numeric", numericExpression],
    ["integer", integerExpression],
    ["boolean", booleanExpression]
  ]);
  const build = typedBuilders.get(field.dataType) ?? textExpression;
  return build(alias, column);
}
|
|
8674
|
+
/**
 * Builds the statements that load a lookup dataset straight from sanitized
 * Receita files.
 *
 * The files are copied into the dataset's temporary raw table; trimmed,
 * non-empty codes are then upserted into the dataset table.
 *
 * @param {string} dataset Lookup dataset name; also the target table name.
 * @param {Array<{absolutePath: string}>} files Sanitized source files.
 * @returns {string[]} Script lines, or an empty array when there are no files.
 */
function rawDomainSql(dataset, files) {
  if (files.length === 0) {
    return [];
  }
  const columns = DATASET_LAYOUTS[dataset].fields.map((entry) => entry.columnName);
  const scratchTable = rawTableName(dataset);
  const createTable = createRawTempTableSql(dataset);
  const copyCommands = files.map(
    (file) => receitaCopyCommand(scratchTable, columns, file.absolutePath)
  );
  return [
    `\\echo 'Loading ${dataset} lookup data directly from sanitized Receita files...'`,
    createTable,
    ...copyCommands,
    `insert into ${dataset} (${columns.join(", ")})`,
    "select distinct on (code)",
    " nullif(btrim(code), '') as code,",
    " nullif(btrim(description), '') as description",
    `from ${scratchTable}`,
    "where nullif(btrim(code), '') is not null",
    "order by code",
    "on conflict (code) do update set description = excluded.description;"
  ];
}
|
|
8700
|
+
/**
 * Builds the statements that load a staging dataset straight from sanitized
 * Receita files.
 *
 * The files are copied into the dataset's temporary raw table and then
 * inserted into the staging table, converting every column with
 * `fieldExpression`.
 *
 * @param {string} dataset Dataset name looked up in `STAGING_TABLE_BY_DATASET3`.
 * @param {Array<{absolutePath: string}>} files Sanitized source files.
 * @returns {string[]} Script lines; empty when there are no files or the
 *   dataset has no staging table.
 */
function rawStagingSql(dataset, files) {
  const stagingTable = files.length === 0 ? void 0 : STAGING_TABLE_BY_DATASET3[dataset];
  if (!stagingTable) {
    return [];
  }
  const { fields } = DATASET_LAYOUTS[dataset];
  const columns = fields.map((entry) => entry.columnName);
  const scratchTable = rawTableName(dataset);
  const alias = "source";
  const selectList = fields.map((entry) => ` ${fieldExpression(dataset, entry, alias)} as ${entry.columnName}`).join(",\n");
  const createTable = createRawTempTableSql(dataset);
  const copyCommands = files.map(
    (file) => receitaCopyCommand(scratchTable, columns, file.absolutePath)
  );
  return [
    `\\echo 'Loading ${dataset} staging data directly from sanitized Receita files...'`,
    createTable,
    ...copyCommands,
    `insert into ${stagingTable} (${columns.join(", ")})`,
    "select",
    selectList,
    `from ${scratchTable} ${alias};`
  ];
}
|
|
8730
|
+
/**
 * Renders the psql script that imports previously exported PostgreSQL-ready
 * CSV files.
 *
 * The script opens a transaction, truncates the four staging tables, loads
 * lookup datasets and staging datasets, and ends with the shared
 * materialization/analyze section.
 *
 * @param {{files: Array<{dataset: string, absolutePath: string}>}} input2
 *   Exported files to load.
 * @returns {string} Full script text.
 */
function generatePostgresDirectImportScript(input2) {
  const filesByDataset = csvFilesByDataset(input2.files);
  const stagingTables = [
    "staging_companies",
    "staging_establishments",
    "staging_partners",
    "staging_simples_options"
  ];
  const script = [
    "-- CNPJ DB Loader hybrid PostgreSQL import script",
    "-- Generated from PostgreSQL-ready CSV files exported by cnpj-db-loader postgres export-csv.",
    "-- Execute with psql, for example:",
    '-- psql "postgres://postgres:postgres@localhost:5432/cnpj" -f import-postgres-direct.sql',
    "",
    "\\set ON_ERROR_STOP on",
    "\\echo 'Starting CNPJ DB Loader hybrid PostgreSQL import...'",
    "",
    "begin;",
    "",
    "-- Keep the final schema and seed data managed by sql/schema.sql.",
    "-- This script only resets staging tables and then upserts final data.",
    ...stagingTables.map((table) => `truncate table ${table} restart identity;`),
    ""
  ];
  // Every dataset section is followed by one blank separator line.
  const appendSection = (statements) => {
    script.push(...statements, "");
  };
  for (const dataset of DOMAIN_DATASETS) {
    appendSection(copyDomainSql(dataset, filesByDataset[dataset] ?? []));
  }
  for (const dataset of STAGING_DATASETS) {
    appendSection(copyStagingSql(dataset, filesByDataset[dataset] ?? []));
  }
  script.push(...materializationAndAnalyzeSql());
  return script.join("\n");
}
|
|
8760
|
+
/**
 * Renders the psql script that imports sanitized Receita files directly.
 *
 * The script sets `client_encoding` to the given source encoding, opens a
 * transaction, truncates the four staging tables, loads lookup and staging
 * datasets through temporary raw tables, and ends with the shared
 * materialization/analyze section.
 *
 * @param {{files: Array<{dataset: string, absolutePath: string}>, sourceEncoding: string}} input2
 *   Source files and the client encoding to announce.
 * @returns {string} Full script text.
 */
function generatePostgresSanitizedDirectImportScript(input2) {
  const filesByDataset = directFilesByDataset(input2.files);
  const stagingTables = [
    "staging_companies",
    "staging_establishments",
    "staging_partners",
    "staging_simples_options"
  ];
  const script = [
    "-- CNPJ DB Loader direct PostgreSQL import script",
    "-- Generated from sanitized Receita files by cnpj-db-loader postgres generate-script.",
    "-- This path avoids rewriting the dataset into a second CSV tree.",
    "-- Execute with psql, for example:",
    '-- psql "postgres://postgres:postgres@localhost:5432/cnpj" -f import-postgres-direct.sql',
    "",
    "\\set ON_ERROR_STOP on",
    `\\echo 'Using source file encoding ${input2.sourceEncoding} for psql copy operations...'`,
    `set client_encoding to ${quoteSqlLiteral(input2.sourceEncoding)};`,
    "\\echo 'Starting CNPJ DB Loader direct PostgreSQL import from sanitized files...'",
    "",
    "begin;",
    "",
    "-- Keep the final schema and seed data managed by sql/schema.sql.",
    "-- This script copies sanitized Receita files into temporary raw tables,",
    "-- transforms values inside PostgreSQL, resets staging tables and upserts final data.",
    ...stagingTables.map((table) => `truncate table ${table} restart identity;`),
    ""
  ];
  // Every dataset section is followed by one blank separator line.
  const appendSection = (statements) => {
    script.push(...statements, "");
  };
  for (const dataset of DOMAIN_DATASETS) {
    appendSection(rawDomainSql(dataset, filesByDataset[dataset] ?? []));
  }
  for (const dataset of STAGING_DATASETS) {
    appendSection(rawStagingSql(dataset, filesByDataset[dataset] ?? []));
  }
  script.push(...materializationAndAnalyzeSql());
  return script.join("\n");
}
|
|
8794
|
+
/**
 * Returns the closing section shared by both import scripts: the four
 * materialization statements, `analyze` for every final and lookup table,
 * the `commit`, and a completion message.
 *
 * @returns {string[]} Script lines.
 */
function materializationAndAnalyzeSql() {
  const analyzedTables = [
    "companies",
    "establishments",
    "establishment_secondary_cnaes",
    "partners",
    "simples_options",
    "cnaes",
    "cities",
    "countries",
    "legal_natures",
    "partner_qualifications",
    "reasons"
  ];
  return [
    materializeCompaniesSql(),
    "",
    materializeEstablishmentsSql(),
    "",
    materializePartnersSql(),
    "",
    materializeSimplesSql(),
    "",
    "\\echo 'Refreshing planner statistics...'",
    ...analyzedTables.map((table) => `analyze ${table};`),
    "",
    "commit;",
    "",
    "\\echo 'CNPJ DB Loader hybrid PostgreSQL import completed.'",
    ""
  ];
}
|
|
8823
|
+
|
|
8824
|
+
// src/services/postgres-direct/exporter.ts
|
|
8825
|
+
// Schema capability flags handed to `transformRecord` when exporting
// PostgreSQL-ready CSV rows (see `writeCsvFile`).
var POSTGRES_DIRECT_SCHEMA_CAPABILITIES = {
  includeEstablishmentCnpjFullInInsert: true,
  includeEstablishmentSecondaryCnaesTable: true,
  includePartnerDedupeKeyInInsert: true,
  requiresLookupReconciliation: false
};
|
|
8831
|
+
/**
 * Derives the default CSV export directory: a sibling of `inputPath` named
 * `<basename>-postgres-csv`.
 *
 * @param {string} inputPath Input directory path.
 * @returns {string} Default output directory path.
 */
function defaultPostgresCsvOutputPath(inputPath) {
  const folderName = `${path16.basename(inputPath)}-postgres-csv`;
  const parentDir = path16.dirname(inputPath);
  return path16.join(parentDir, folderName);
}
|
|
8835
|
+
/**
 * Replaces the extension of `relativePath` with `.csv`, keeping its directory.
 * Falls back to the base name, then to `dataset`, when the parsed name is empty.
 *
 * @param {string} relativePath Source file path relative to the input root.
 * @returns {string} Relative output path ending in `.csv`.
 */
function normalizeOutputFileName(relativePath) {
  const { dir, name, base } = path16.parse(relativePath);
  const stem = name || base || "dataset";
  return path16.join(dir, `${stem}.csv`);
}
|
|
8840
|
+
/**
 * Builds the CSV output path of one source file:
 * `<outputPath>/<dataset>/<relative path with .csv extension>`.
 *
 * @param {string} outputPath Export root directory.
 * @param {string} dataset Dataset name used as sub-directory.
 * @param {string} relativePath Source file path relative to the input root.
 * @returns {string} Output file path.
 */
function resolveDatasetOutputPath(outputPath, dataset, relativePath) {
  const csvRelativePath = normalizeOutputFileName(relativePath);
  return path16.join(outputPath, dataset, csvRelativePath);
}
|
|
8843
|
+
/**
 * Builds the example psql command shown to the user after a CSV export.
 *
 * Backslashes in the script path are converted to forward slashes. A path
 * that contains anything other than plain path characters (for example a
 * space) is wrapped in double quotes so the suggested command can be pasted
 * into a shell as is; simple paths are emitted unquoted, exactly as before.
 *
 * @param {string} scriptPath Path of the generated SQL script.
 * @returns {string} Example command line.
 */
function inferNextStep4(scriptPath) {
  const normalizedPath = scriptPath.replace(/\\/g, "/");
  const needsQuoting = /[^A-Za-z0-9_\-./:@%+=,]/.test(normalizedPath);
  // Inside double quotes a POSIX shell still interprets ", $ and `.
  const escapedPath = normalizedPath.replace(/(["$`])/g, "\\$1");
  const scriptArgument = needsQuoting ? `"${escapedPath}"` : normalizedPath;
  return `psql "postgres://postgres:postgres@localhost:5432/cnpj" -f ${scriptArgument}`;
}
|
|
8846
|
+
/**
 * Converts one source file into a PostgreSQL-ready CSV file.
 *
 * Writes a header row with the dataset's column names, then one row per
 * non-blank source line after field-count normalization and
 * `transformRecord`. The output directory is created when missing.
 *
 * Stream handling:
 * - the `error` listener is attached as soon as the stream is created, so an
 *   open/write failure rejects this function instead of surfacing as an
 *   unhandled `error` event;
 * - writes honour backpressure (`write()` returning false) by waiting for
 *   `drain`, so large inputs are not buffered in memory;
 * - completion waits on a promise that settles on either `finish` or `error`,
 *   so it cannot hang when the stream has already failed.
 *
 * @param {{dataset: string, inputFile: string, outputFile: string}} input2
 *   Dataset name plus source and destination paths.
 * @returns {Promise<number>} Number of data rows written (header excluded).
 */
async function writeCsvFile(input2) {
  const layout = DATASET_LAYOUTS[input2.dataset];
  const columns = layout.fields.map((field) => field.columnName);
  await mkdir8(path16.dirname(input2.outputFile), { recursive: true });
  const output2 = createWriteStream3(input2.outputFile, { encoding: "utf8" });

  let streamError;
  const settled = new Promise((resolve2) => {
    output2.on("error", (error) => {
      // Keep the first failure; later ones are only absorbed.
      if (!streamError) {
        streamError = error;
      }
      resolve2();
    });
    output2.once("finish", () => resolve2());
  });

  const writeRow = async (cells) => {
    if (streamError) {
      throw streamError;
    }
    const accepted = output2.write(`${formatCsvRow(cells)}
`);
    if (!accepted) {
      // Wait for the buffer to drain, or for the stream to fail while waiting.
      await Promise.race([
        new Promise((resolve2) => output2.once("drain", resolve2)),
        settled
      ]);
    }
  };

  let rows = 0;
  try {
    await writeRow(columns);
    for await (const sourceLine of readImportSourceLines(input2.inputFile)) {
      if (sourceLine.rawLine.trim() === "") {
        continue;
      }
      const parsed = parseImportSourceLine(sourceLine);
      const normalizedFields = normalizeFieldCount(
        parsed.fields,
        layout.fields.length,
        input2.inputFile,
        parsed.lineNumber
      );
      const values = transformRecord(
        input2.dataset,
        layout,
        normalizedFields,
        POSTGRES_DIRECT_SCHEMA_CAPABILITIES,
        "staging"
      );
      await writeRow(values);
      rows += 1;
    }
  } finally {
    output2.end();
    await settled;
  }
  if (streamError) {
    throw streamError;
  }
  return rows;
}
|
|
8886
|
+
/**
 * Exports every recognized dataset file under `inputPath` as a
 * PostgreSQL-ready CSV file, then writes the import script and a
 * `manifest.json` next to the exported files.
 *
 * @param {string} inputPath Directory holding the source dataset files.
 * @param {object} [options]
 * @param {string} [options.dataset] Restrict the export to one dataset type.
 * @param {string} [options.outputPath] Output directory; defaults to
 *   `defaultPostgresCsvOutputPath(validatedPath)`.
 * @param {string} [options.scriptName] Script file name; defaults to
 *   `import-postgres-direct.sql`.
 * @param {Function} [options.onProgress] Receives `start`, `file_start`,
 *   `file_finish` and `finish` events.
 * @returns {Promise<object>} Summary with paths, totals, per-dataset
 *   summaries, warnings and a suggested next command.
 * @throws {ValidationError} When the dataset type is unsupported, the input
 *   directory fails validation, or no recognized files are found.
 */
async function exportPostgresCsvDataset(inputPath, options = {}) {
  if (options.dataset && !isImportDatasetType(options.dataset)) {
    throw new ValidationError(`Unsupported dataset type: ${options.dataset}.`);
  }
  const validation = await validateInputDirectory(inputPath);
  if (!validation.ok) {
    throw new ValidationError(
      `The input directory is not ready for PostgreSQL CSV export. ${validation.errors.join(" ")}`
    );
  }
  const { validatedPath } = validation;
  const outputPath = path16.resolve(
    options.outputPath ?? defaultPostgresCsvOutputPath(validatedPath)
  );
  const inspected = await inspectFiles(validatedPath);

  // Keep regular files of a known dataset type, optionally limited to one type.
  const isExportable = (entry) => entry.entryKind === "file" && isImportDatasetType(entry.inferredType) && (!options.dataset || entry.inferredType === options.dataset);
  const recognizedFiles = inspected.entries.filter(isExportable).map((entry) => ({ ...entry, inferredType: entry.inferredType })).sort(sortEntries);
  if (recognizedFiles.length === 0) {
    throw new ValidationError(
      "No recognized dataset files were found for PostgreSQL CSV export."
    );
  }

  const byImportOrder = (left, right) => IMPORT_ORDER.indexOf(left) - IMPORT_ORDER.indexOf(right);
  const datasets = Array.from(
    new Set(recognizedFiles.map((entry) => entry.inferredType))
  ).sort(byImportOrder);
  const totalFiles = recognizedFiles.length;
  options.onProgress?.({
    kind: "start",
    inputPath: path16.resolve(inputPath),
    validatedPath,
    outputPath,
    totalFiles,
    datasets
  });

  const exportedFiles = [];
  const summariesByDataset = /* @__PURE__ */ new Map();
  for (let position = 0; position < totalFiles; position += 1) {
    const entry = recognizedFiles[position];
    const fileIndex = position + 1;
    const dataset = entry.inferredType;
    const inputFile = path16.join(validatedPath, entry.relativePath);
    const outputFile = resolveDatasetOutputPath(
      outputPath,
      dataset,
      entry.relativePath
    );
    options.onProgress?.({
      kind: "file_start",
      dataset,
      fileIndex,
      totalFiles,
      inputFile: buildDisplayPath(inputFile),
      outputFile
    });
    const rowCount = await writeCsvFile({ dataset, inputFile, outputFile });
    exportedFiles.push({
      dataset,
      absolutePath: outputFile,
      relativePath: path16.relative(outputPath, outputFile),
      rowCount
    });

    let summary = summariesByDataset.get(dataset);
    if (summary === void 0) {
      summary = { dataset, files: 0, rows: 0, outputFiles: [] };
      summariesByDataset.set(dataset, summary);
    }
    summary.files += 1;
    summary.rows += rowCount;
    summary.outputFiles.push(outputFile);

    options.onProgress?.({
      kind: "file_finish",
      dataset,
      fileIndex,
      totalFiles,
      inputFile: buildDisplayPath(inputFile),
      outputFile,
      rows: rowCount
    });
  }

  const scriptPath = path16.join(
    outputPath,
    options.scriptName ?? "import-postgres-direct.sql"
  );
  await writeFile5(
    scriptPath,
    generatePostgresDirectImportScript({ files: exportedFiles }),
    "utf8"
  );

  const manifestPath = path16.join(outputPath, "manifest.json");
  const summaryDatasets = Array.from(summariesByDataset.values()).sort(
    (left, right) => byImportOrder(left.dataset, right.dataset)
  );
  let totalRows = 0;
  for (const item of summaryDatasets) {
    totalRows += item.rows;
  }
  const resolvedInputPath = path16.resolve(inputPath);
  const manifest = {
    generatedAt: (/* @__PURE__ */ new Date()).toISOString(),
    inputPath: resolvedInputPath,
    validatedPath,
    outputPath,
    scriptPath,
    totalFiles: exportedFiles.length,
    totalRows,
    datasets: summaryDatasets
  };
  await writeFile5(
    manifestPath,
    `${JSON.stringify(manifest, null, 2)}
`,
    "utf8"
  );

  options.onProgress?.({
    kind: "finish",
    outputPath,
    scriptPath,
    totalFiles: exportedFiles.length,
    totalRows
  });
  return {
    inputPath: resolvedInputPath,
    validatedPath,
    outputPath,
    scriptPath,
    manifestPath,
    totalFiles: exportedFiles.length,
    totalRows,
    datasets: summaryDatasets,
    warnings: [
      "PostgreSQL-ready CSV export is intended for hybrid bulk imports after extraction, validation and sanitization.",
      "The generated SQL script resets staging tables and then upserts final tables. Review it before running against production databases."
    ],
    nextStep: inferNextStep4(scriptPath)
  };
}
|
|
9021
|
+
|
|
9022
|
+
// src/services/postgres-direct/generator.ts
|
|
9023
|
+
import { mkdir as mkdir9, stat as stat7, writeFile as writeFile6 } from "fs/promises";
|
|
9024
|
+
import path17 from "path";
|
|
9025
|
+
var DEFAULT_SOURCE_ENCODING = "UTF8";
|
|
9026
|
+
/**
 * Derives the default output directory for the direct import script.
 *
 * An input directory named `sanitized` (any letter case) maps to a sibling
 * `postgres-direct`; any other name maps to `<basename>-postgres-direct`.
 *
 * @param {string} inputPath Input directory path.
 * @returns {string} Default output directory path.
 */
function defaultPostgresDirectOutputPath(inputPath) {
  const folderName = path17.basename(inputPath);
  const outputName = folderName.toLowerCase() === "sanitized" ? "postgres-direct" : `${folderName}-postgres-direct`;
  return path17.join(path17.dirname(inputPath), outputName);
}
|
|
9033
|
+
/**
 * Builds the example psql command shown to the user after generating the
 * direct import script.
 *
 * Backslashes in the script path are converted to forward slashes. A path
 * that contains anything other than plain path characters (for example a
 * space) is wrapped in double quotes so the suggested command can be pasted
 * into a shell as is; simple paths are emitted unquoted, exactly as before.
 *
 * @param {string} scriptPath Path of the generated SQL script.
 * @returns {string} Example command line.
 */
function inferNextStep5(scriptPath) {
  const normalizedPath = scriptPath.replace(/\\/g, "/");
  const needsQuoting = /[^A-Za-z0-9_\-./:@%+=,]/.test(normalizedPath);
  // Inside double quotes a POSIX shell still interprets ", $ and `.
  const escapedPath = normalizedPath.replace(/(["$`])/g, "\\$1");
  const scriptArgument = needsQuoting ? `"${escapedPath}"` : normalizedPath;
  return `psql "postgres://postgres:postgres@localhost:5432/cnpj" -f ${scriptArgument}`;
}
|
|
9036
|
+
/**
 * Validates and upper-cases a client encoding name.
 *
 * A nullish value falls back to `DEFAULT_SOURCE_ENCODING`. After trimming,
 * the name may contain only letters, digits, `_` and `-`.
 *
 * @param {string | null | undefined} value Requested encoding name.
 * @returns {string} Upper-cased encoding name.
 * @throws {ValidationError} When the trimmed name has other characters or is empty.
 */
function normalizeSourceEncoding(value) {
  const candidate = (value ?? DEFAULT_SOURCE_ENCODING).trim();
  const isValidName = /^[A-Za-z0-9_-]+$/.test(candidate);
  if (isValidName) {
    return candidate.toUpperCase();
  }
  throw new ValidationError(
    `Invalid source encoding: ${value}. Use a PostgreSQL client encoding name such as UTF8, WIN1252 or LATIN1.`
  );
}
|
|
9045
|
+
/**
 * Generates the psql script (plus `manifest.json`) that imports sanitized
 * Receita files directly, without exporting an intermediate CSV tree.
 *
 * When `options.dataset` is given, a failed directory validation does not
 * abort: the resolved `inputPath` is used instead and the validation errors
 * are returned as warnings.
 *
 * @param {string} inputPath Directory holding the sanitized files.
 * @param {object} [options]
 * @param {string} [options.dataset] Restrict the script to one dataset type.
 * @param {string} [options.outputPath] Output directory; defaults to
 *   `defaultPostgresDirectOutputPath(validatedPath)`.
 * @param {string} [options.sourceEncoding] Client encoding name; validated by
 *   `normalizeSourceEncoding`.
 * @param {string} [options.scriptName] Script file name; defaults to
 *   `import-postgres-direct.sql`.
 * @param {Function} [options.onProgress] Receives `start`, `file_registered`
 *   and `finish` events.
 * @returns {Promise<object>} Summary with paths, encoding, totals,
 *   per-dataset summaries, warnings and a suggested next command.
 * @throws {ValidationError} When the dataset type is unsupported, validation
 *   fails without a dataset filter, the encoding name is invalid, or no
 *   recognized files are found.
 */
async function generatePostgresDirectScript(inputPath, options = {}) {
  if (options.dataset && !isImportDatasetType(options.dataset)) {
    throw new ValidationError(`Unsupported dataset type: ${options.dataset}.`);
  }
  const validation = await validateInputDirectory(inputPath);
  const validationFailed = !validation.ok;
  if (validationFailed && !options.dataset) {
    throw new ValidationError(
      `The input directory is not ready for PostgreSQL direct script generation. ${validation.errors.join(" ")}`
    );
  }
  const validatedPath = validationFailed ? path17.resolve(inputPath) : validation.validatedPath;
  const outputPath = path17.resolve(
    options.outputPath ?? defaultPostgresDirectOutputPath(validatedPath)
  );
  const sourceEncoding = normalizeSourceEncoding(options.sourceEncoding);
  const inspected = await inspectFiles(validatedPath);

  // Keep regular files of a known dataset type, optionally limited to one type.
  const isSelected = (entry) => entry.entryKind === "file" && isImportDatasetType(entry.inferredType) && (!options.dataset || entry.inferredType === options.dataset);
  const recognizedFiles = inspected.entries.filter(isSelected).map((entry) => ({ ...entry, inferredType: entry.inferredType })).sort(sortEntries);
  if (recognizedFiles.length === 0) {
    throw new ValidationError(
      "No recognized dataset files were found for PostgreSQL direct script generation."
    );
  }

  const byImportOrder = (left, right) => IMPORT_ORDER.indexOf(left) - IMPORT_ORDER.indexOf(right);
  const datasets = Array.from(
    new Set(recognizedFiles.map((entry) => entry.inferredType))
  ).sort(byImportOrder);
  const totalFiles = recognizedFiles.length;
  options.onProgress?.({
    kind: "start",
    inputPath: path17.resolve(inputPath),
    validatedPath,
    outputPath,
    totalFiles,
    datasets,
    sourceEncoding
  });
  await mkdir9(outputPath, { recursive: true });

  const sourceFiles = [];
  const summariesByDataset = /* @__PURE__ */ new Map();
  for (let position = 0; position < totalFiles; position += 1) {
    const entry = recognizedFiles[position];
    const dataset = entry.inferredType;
    const absolutePath = path17.join(validatedPath, entry.relativePath);
    const { size: fileSize } = await stat7(absolutePath);
    sourceFiles.push({
      dataset,
      absolutePath,
      relativePath: entry.relativePath,
      fileSize
    });

    let summary = summariesByDataset.get(dataset);
    if (summary === void 0) {
      summary = { dataset, files: 0, totalBytes: 0, sourceFiles: [] };
      summariesByDataset.set(dataset, summary);
    }
    summary.files += 1;
    summary.totalBytes += fileSize;
    summary.sourceFiles.push(absolutePath);

    options.onProgress?.({
      kind: "file_registered",
      dataset,
      fileIndex: position + 1,
      totalFiles,
      inputFile: buildDisplayPath(absolutePath),
      fileSize
    });
  }

  const scriptPath = path17.join(
    outputPath,
    options.scriptName ?? "import-postgres-direct.sql"
  );
  await writeFile6(
    scriptPath,
    generatePostgresSanitizedDirectImportScript({
      files: sourceFiles,
      sourceEncoding
    }),
    "utf8"
  );

  const manifestPath = path17.join(outputPath, "manifest.json");
  const summaryDatasets = Array.from(summariesByDataset.values()).sort(
    (left, right) => byImportOrder(left.dataset, right.dataset)
  );
  let totalBytes = 0;
  for (const item of summaryDatasets) {
    totalBytes += item.totalBytes;
  }
  const resolvedInputPath = path17.resolve(inputPath);
  const manifest = {
    generatedAt: (/* @__PURE__ */ new Date()).toISOString(),
    mode: "direct-sanitized-script",
    inputPath: resolvedInputPath,
    validatedPath,
    outputPath,
    scriptPath,
    sourceEncoding,
    totalFiles: sourceFiles.length,
    totalBytes,
    datasets: summaryDatasets
  };
  await writeFile6(
    manifestPath,
    `${JSON.stringify(manifest, null, 2)}
`,
    "utf8"
  );

  options.onProgress?.({
    kind: "finish",
    outputPath,
    scriptPath,
    totalFiles: sourceFiles.length,
    totalBytes
  });
  const validationWarnings = validationFailed ? validation.errors : [];
  return {
    inputPath: resolvedInputPath,
    validatedPath,
    outputPath,
    scriptPath,
    manifestPath,
    sourceEncoding,
    totalFiles: sourceFiles.length,
    totalBytes,
    datasets: summaryDatasets,
    warnings: [
      ...validationWarnings,
      "This script imports sanitized Receita files directly with psql \\copy. It avoids rewriting the full dataset into a second CSV tree.",
      "The generated script expects the database schema generated by cnpj-db-loader to be applied before execution.",
      "The direct PostgreSQL script now defaults to UTF8 because the sanitize command writes clean UTF-8 files.",
      "Use --source-encoding WIN1252 or LATIN1 only when generating scripts for legacy sanitized files produced by older loader versions."
    ],
    nextStep: inferNextStep5(scriptPath)
  };
}
|
|
9181
|
+
|
|
9182
|
+
// src/cli/ui/theme.ts
|
|
9183
|
+
// Colors are used only when stdout is a TTY and neither the `--no-color`
// argument nor the NO_COLOR environment variable is present.
var colorsEnabled = process.stdout.isTTY && !(process.argv.includes("--no-color") || process.env.NO_COLOR !== void 0);

/**
 * Wraps `value` in the ANSI SGR sequence for `code`, followed by a reset.
 * Returns `value` untouched when colors are disabled.
 *
 * @param {number} code ANSI SGR code.
 * @param {string} value Text to colorize.
 * @returns {string} Possibly colorized text.
 */
function paint(code, value) {
  return colorsEnabled ? `\x1B[${code}m${value}\x1B[0m` : value;
}
|
|
9190
|
+
// Named text styles for CLI output. Each entry maps a string to the same
// string painted with the listed ANSI codes; with two codes the second is
// applied first and the first wraps the result.
var theme = (() => {
  const styled = (...codes) => (value) => codes.reduceRight((text, code) => paint(code, text), value);
  return {
    muted: styled(90),
    red: styled(31),
    green: styled(32),
    yellow: styled(33),
    blue: styled(34),
    command: styled(36),
    flag: styled(33),
    section: styled(36, 1),
    successLabel: styled(32, 1),
    warningLabel: styled(33, 1),
    infoLabel: styled(36, 1),
    errorLabel: styled(31, 1)
  };
})();
|
|
9204
|
+
|
|
9205
|
+
// src/cli/ui/output/program-ui.ts
|
|
9206
|
+
/**
 * Installs the themed output and help hooks on the CLI program: error text
 * goes to stderr in red, sub-command names use the command style and option
 * flags use the flag style.
 *
 * @param {object} program Object exposing `configureOutput` and
 *   `configureHelp` (presumably a commander `Command`; confirm against caller).
 * @returns {void}
 */
function configureProgramUi(program) {
  const outputHooks = {
    writeErr: (text) => process.stderr.write(text),
    outputError: (text, write) => {
      write(theme.red(text));
    }
  };
  const helpHooks = {
    subcommandTerm: (cmd) => theme.command(cmd.name()),
    optionTerm: (option) => theme.flag(option.flags)
  };
  program.configureOutput(outputHooks);
  program.configureHelp(helpHooks);
}
|
|
9218
|
+
|
|
9219
|
+
// src/cli/ui/output/errors.ts
|
|
9220
|
+
/**
 * Reports a CLI failure on stderr and exits with status 1.
 *
 * `AppError` instances are labelled with their own `code`; anything else is
 * labelled `UNEXPECTED_ERROR`. The error is reported exactly once: the
 * previous version had no `return` after the `AppError` branch and would
 * print a second, mislabelled line whenever `process.exit` returned (for
 * example when stubbed in tests).
 *
 * @param {unknown} error Value thrown by a command.
 * @returns {void} Does not return when `process.exit` terminates the process.
 */
function handleCliError(error) {
  let label;
  let message;
  if (error instanceof AppError) {
    label = error.code;
    message = error.message;
  } else {
    label = "UNEXPECTED_ERROR";
    message = error instanceof Error ? error.message : String(error);
  }
  console.error(`${theme.errorLabel(label)} ${message}`);
  process.exit(1);
}
|
|
9229
|
+
|
|
9230
|
+
// src/cli/ui/output/shared.ts
|
|
9231
|
+
import path18 from "path";
|
|
9232
|
+
/**
 * Converts a log file path to an absolute path.
 *
 * @param {string} logFilePath - Relative or absolute path to a log file.
 * @returns {string} The path resolved against the current working directory.
 */
function resolveLogFilePath(logFilePath) {
  const absolutePath = path18.resolve(logFilePath);
  return absolutePath;
}
|
|
9235
|
+
/**
 * Renders one "- label: value" summary line with a muted label.
 *
 * @param {string} label - Text shown before the colon.
 * @param {*} value - Value interpolated after the label.
 * @returns {string} The formatted line.
 */
function formatKeyValue(label, value) {
  const mutedLabel = theme.muted(`- ${label}:`);
  return `${mutedLabel} ${value}`;
}
|
|
9238
|
+
/**
 * Prints a "NOTES" heading followed by one bulleted line per note.
 * Prints nothing when the list is empty.
 *
 * @param {Array<*>} notes - Notes to display.
 */
function printNotes(notes) {
  if (notes.length !== 0) {
    console.log(theme.infoLabel("NOTES"));
    const bullet = theme.blue("\u2022");
    for (const entry of notes) {
      console.log(` ${bullet} ${entry}`);
    }
  }
}
|
|
9247
|
+
/**
 * Prints a "WARNINGS" heading followed by one bulleted line per warning.
 * Prints nothing when the list is empty.
 *
 * @param {Array<*>} warnings - Warnings to display.
 */
function printWarnings(warnings) {
  if (warnings.length !== 0) {
    console.log(theme.warningLabel("WARNINGS"));
    const bullet = theme.yellow("\u2022");
    for (const entry of warnings) {
      console.log(` ${bullet} ${entry}`);
    }
  }
}
|
|
9256
|
+
/**
 * Prints an "ERRORS" heading followed by one bulleted line per error.
 * Prints nothing when the list is empty.
 *
 * @param {Array<*>} errors - Errors to display.
 */
function printErrors(errors) {
  if (errors.length !== 0) {
    console.log(theme.errorLabel("ERRORS"));
    const bullet = theme.red("\u2022");
    for (const entry of errors) {
      console.log(` ${bullet} ${entry}`);
    }
  }
}
|
|
9265
|
+
/**
 * Formats a byte count as a short human-readable string.
 *
 * Values below 1024 are shown verbatim in bytes. Larger values are scaled by
 * powers of 1024 up to TB and printed with 2, 1 or 0 decimals depending on
 * magnitude (< 10, < 100, >= 100).
 *
 * @param {number} value - Size in bytes.
 * @returns {string} For example "512 B", "1.50 KB", "100 KB", "1.00 GB".
 */
function formatBytes(value) {
  if (value < 1024) {
    return `${value} B`;
  }
  const units = ["KB", "MB", "GB", "TB"];
  const lastUnitIndex = units.length - 1;
  let currentValue = value / 1024;
  let unitIndex = 0;
  // Promote on the ROUNDED value: values >= 100 are printed with zero
  // decimals, so anything from 1023.5 upward would otherwise be displayed as
  // "1024 <unit>" instead of "1.00 <next unit>".
  while (unitIndex < lastUnitIndex && Math.round(currentValue) >= 1024) {
    currentValue /= 1024;
    unitIndex += 1;
  }
  let fractionDigits = 2;
  if (currentValue >= 100) {
    fractionDigits = 0;
  } else if (currentValue >= 10) {
    fractionDigits = 1;
  }
  return `${currentValue.toFixed(fractionDigits)} ${units[unitIndex]}`;
}
|
|
9278
|
+
/**
 * Formats a number with en-US grouping separators.
 *
 * @param {number} value - Number to format.
 * @returns {string} For example "1,234,567".
 */
function formatCount(value) {
  const formatter = new Intl.NumberFormat("en-US");
  return formatter.format(value);
}
|
|
9281
|
+
/**
 * Formats a number in en-US style with a bounded number of decimals.
 * Trailing zero decimals are dropped (minimum fraction digits is 0).
 *
 * @param {number} value - Number to format.
 * @param {number} [fractionDigits=2] - Maximum decimals to keep.
 * @returns {string} For example "1.23" or "1,234.5".
 */
function formatDecimal(value, fractionDigits = 2) {
  const formatter = new Intl.NumberFormat("en-US", {
    minimumFractionDigits: 0,
    maximumFractionDigits: fractionDigits
  });
  return formatter.format(value);
}
|
|
9287
|
+
function formatDuration(valueMs) {
|
|
9288
|
+
if (valueMs < 1e3) {
|
|
9289
|
+
return `${Math.round(valueMs)} ms`;
|
|
9290
|
+
}
|
|
9291
|
+
const seconds = valueMs / 1e3;
|
|
9292
|
+
if (seconds < 60) {
|
|
8179
9293
|
return `${formatDecimal(seconds)} s`;
|
|
8180
9294
|
}
|
|
8181
9295
|
const minutes = Math.floor(seconds / 60);
|
|
@@ -8363,9 +9477,23 @@ function printSanitizeSummary(summary, logFilePath) {
|
|
|
8363
9477
|
console.log(
|
|
8364
9478
|
formatKeyValue("Processed bytes", formatBytes(summary.totalBytes))
|
|
8365
9479
|
);
|
|
9480
|
+
console.log(formatKeyValue("Source encoding", summary.sourceEncoding));
|
|
9481
|
+
console.log(formatKeyValue("Output encoding", "UTF8"));
|
|
8366
9482
|
console.log(
|
|
8367
9483
|
formatKeyValue("Removed NUL bytes", formatCount(summary.nulBytesRemoved))
|
|
8368
9484
|
);
|
|
9485
|
+
console.log(
|
|
9486
|
+
formatKeyValue(
|
|
9487
|
+
"Removed invalid bytes",
|
|
9488
|
+
formatCount(summary.invalidBytesRemoved)
|
|
9489
|
+
)
|
|
9490
|
+
);
|
|
9491
|
+
console.log(
|
|
9492
|
+
formatKeyValue(
|
|
9493
|
+
"Removed control chars",
|
|
9494
|
+
formatCount(summary.controlCharsRemoved)
|
|
9495
|
+
)
|
|
9496
|
+
);
|
|
8369
9497
|
console.log(formatKeyValue("Changed files", summary.changedFiles));
|
|
8370
9498
|
console.log(formatKeyValue("Unchanged files", summary.unchangedFiles));
|
|
8371
9499
|
if (summary.datasets.length > 0) {
|
|
@@ -8656,6 +9784,59 @@ function printFederalRevenueSyncSummary(summary, logFilePath) {
|
|
|
8656
9784
|
`${theme.muted("Import progress log:")} ${resolveLogFilePath(summary.import.progressLogPath)}`
|
|
8657
9785
|
);
|
|
8658
9786
|
}
|
|
9787
|
+
/**
 * Prints the final report of the PostgreSQL-ready CSV export command:
 * paths, totals, a per-dataset breakdown, warnings, the optional next step
 * and the location of the log file.
 *
 * @param {object} summary - Export summary; the fields read here are
 *   inputPath, validatedPath, outputPath, scriptPath, manifestPath,
 *   totalFiles, totalRows, datasets, warnings and nextStep.
 * @param {string} logFilePath - Path of the command log file.
 */
function printPostgresCsvExportSummary(summary, logFilePath) {
  console.log(
    theme.successLabel("POSTGRES"),
    "PostgreSQL-ready CSV export completed."
  );
  const overview = [
    ["Input path", summary.inputPath],
    ["Validated path", summary.validatedPath],
    ["Output path", summary.outputPath],
    ["Generated script", summary.scriptPath],
    ["Manifest", summary.manifestPath],
    ["Exported files", summary.totalFiles],
    ["Exported rows", formatCount(summary.totalRows)]
  ];
  for (const [label, value] of overview) {
    console.log(formatKeyValue(label, value));
  }
  if (summary.datasets.length > 0) {
    console.log(theme.infoLabel("DATASETS"));
    for (const entry of summary.datasets) {
      console.log(
        ` ${theme.blue("\u2022")} ${entry.dataset}: ${entry.files} file(s), ${formatCount(entry.rows)} row(s)`
      );
    }
  }
  printWarnings(summary.warnings);
  if (summary.nextStep) {
    console.log(`${theme.infoLabel("NEXT")} ${summary.nextStep}`);
  }
  console.log(`${theme.muted("Log file:")} ${resolveLogFilePath(logFilePath)}`);
}
|
|
9813
|
+
/**
 * Prints the final report of the direct PostgreSQL script generation
 * command: paths, source encoding, totals, a per-dataset breakdown,
 * warnings, the optional next step and the location of the log file.
 *
 * @param {object} summary - Generation summary; the fields read here are
 *   inputPath, validatedPath, outputPath, scriptPath, manifestPath,
 *   sourceEncoding, totalFiles, totalBytes, datasets, warnings and nextStep.
 * @param {string} logFilePath - Path of the command log file.
 */
function printPostgresDirectScriptSummary(summary, logFilePath) {
  console.log(
    theme.successLabel("POSTGRES"),
    "Direct PostgreSQL import script generated."
  );
  const overview = [
    ["Input path", summary.inputPath],
    ["Validated path", summary.validatedPath],
    ["Output path", summary.outputPath],
    ["Generated script", summary.scriptPath],
    ["Manifest", summary.manifestPath],
    ["Source encoding", summary.sourceEncoding],
    ["Source files", summary.totalFiles],
    ["Source bytes", formatBytes(summary.totalBytes)]
  ];
  for (const [label, value] of overview) {
    console.log(formatKeyValue(label, value));
  }
  if (summary.datasets.length > 0) {
    console.log(theme.infoLabel("DATASETS"));
    for (const entry of summary.datasets) {
      console.log(
        ` ${theme.blue("\u2022")} ${entry.dataset}: ${entry.files} file(s), ${formatBytes(entry.totalBytes)}`
      );
    }
  }
  printWarnings(summary.warnings);
  if (summary.nextStep) {
    console.log(`${theme.infoLabel("NEXT")} ${summary.nextStep}`);
  }
  console.log(`${theme.muted("Log file:")} ${resolveLogFilePath(logFilePath)}`);
}
|
|
8659
9840
|
|
|
8660
9841
|
// src/cli/ui/output/progress.ts
|
|
8661
9842
|
function createExtractionProgressReporter() {
|
|
@@ -9092,8 +10273,9 @@ function createSanitizeProgressReporter() {
|
|
|
9092
10273
|
`Validated: ${shortPath(event.validatedPath)}`,
|
|
9093
10274
|
`Output: ${shortPath(event.outputPath)}`,
|
|
9094
10275
|
`Datasets: ${event.datasets.join(" > ")}`,
|
|
10276
|
+
`Source encoding: ${event.sourceEncoding} > UTF8`,
|
|
9095
10277
|
`Files: 0/${formatCount(event.totalFiles)} | Bytes: ${formatBytes(0)} / ${formatBytes(event.totalBytes)}`,
|
|
9096
|
-
`Rows
|
|
10278
|
+
`Rows: ${formatCount(0)} | NUL: ${formatCount(0)} | Invalid bytes: ${formatCount(0)} | Controls: ${formatCount(0)}`,
|
|
9097
10279
|
`Current: waiting...`
|
|
9098
10280
|
];
|
|
9099
10281
|
renderBlock([
|
|
@@ -9109,8 +10291,9 @@ function createSanitizeProgressReporter() {
|
|
|
9109
10291
|
currentLines[1] ?? "",
|
|
9110
10292
|
currentLines[2] ?? "",
|
|
9111
10293
|
currentLines[3] ?? "",
|
|
10294
|
+
currentLines[4] ?? "",
|
|
9112
10295
|
`Files: ${formatCount(event.fileIndex)}/${formatCount(event.totalFiles)} | Bytes: ${formatBytes(event.bytesProcessed)} / ${formatBytes(event.totalBytes)}`,
|
|
9113
|
-
`Rows
|
|
10296
|
+
`Rows: ${formatCount(event.processedRows)} | NUL: ${formatCount(event.nulBytesRemoved)} | Invalid bytes: ${formatCount(event.invalidBytesRemoved)} | Controls: ${formatCount(event.controlCharsRemoved)} | Changed: ${formatCount(event.changedFiles)}`,
|
|
9114
10297
|
`Current: ${shortPath(event.currentFileDisplayPath)}`
|
|
9115
10298
|
];
|
|
9116
10299
|
renderBlock([
|
|
@@ -9130,6 +10313,18 @@ function createSanitizeProgressReporter() {
|
|
|
9130
10313
|
console.log(
|
|
9131
10314
|
formatKeyValue("Removed NUL bytes", formatCount(event.nulBytesRemoved))
|
|
9132
10315
|
);
|
|
10316
|
+
console.log(
|
|
10317
|
+
formatKeyValue(
|
|
10318
|
+
"Removed invalid bytes",
|
|
10319
|
+
formatCount(event.invalidBytesRemoved)
|
|
10320
|
+
)
|
|
10321
|
+
);
|
|
10322
|
+
console.log(
|
|
10323
|
+
formatKeyValue(
|
|
10324
|
+
"Removed control chars",
|
|
10325
|
+
formatCount(event.controlCharsRemoved)
|
|
10326
|
+
)
|
|
10327
|
+
);
|
|
9133
10328
|
console.log(
|
|
9134
10329
|
formatKeyValue("Changed files", formatCount(event.changedFiles))
|
|
9135
10330
|
);
|
|
@@ -9276,6 +10471,65 @@ function createFederalRevenueDownloadProgressReporter() {
|
|
|
9276
10471
|
);
|
|
9277
10472
|
};
|
|
9278
10473
|
}
|
|
10474
|
+
/**
 * Builds the progress callback used by the PostgreSQL CSV export command.
 *
 * The returned function reacts to three event kinds ("start",
 * "file_finish", "finish") by printing a line or a short block to stdout;
 * any other kind is ignored.
 *
 * @returns {(event: object) => void} Progress event handler.
 */
function createPostgresCsvExportProgressReporter() {
  return (event) => {
    switch (event.kind) {
      case "start":
        console.log(
          theme.infoLabel("POSTGRES"),
          "Starting PostgreSQL-ready CSV export..."
        );
        console.log(formatKeyValue("Input path", event.inputPath));
        console.log(formatKeyValue("Validated path", event.validatedPath));
        console.log(formatKeyValue("Output path", event.outputPath));
        console.log(formatKeyValue("Files queued", event.totalFiles));
        break;
      case "file_finish":
        console.log(
          `${theme.infoLabel("POSTGRES")} ${event.fileIndex}/${event.totalFiles} ${event.dataset} exported with ${formatCount(event.rows)} row(s).`
        );
        break;
      case "finish":
        console.log(
          theme.successLabel("POSTGRES"),
          `Exported ${event.totalFiles} file(s) with ${formatCount(event.totalRows)} row(s).`
        );
        console.log(formatKeyValue("Output path", event.outputPath));
        console.log(formatKeyValue("Script path", event.scriptPath));
        break;
      default:
        break;
    }
  };
}
|
|
10503
|
+
/**
 * Builds the progress callback used by the direct PostgreSQL script
 * generation command.
 *
 * The returned function reacts to three event kinds ("start",
 * "file_registered", "finish") by printing a line or a short block to
 * stdout; any other kind is ignored.
 *
 * @returns {(event: object) => void} Progress event handler.
 */
function createPostgresDirectScriptProgressReporter() {
  return (event) => {
    switch (event.kind) {
      case "start":
        console.log(
          theme.infoLabel("POSTGRES"),
          "Starting direct PostgreSQL script generation..."
        );
        console.log(formatKeyValue("Input path", event.inputPath));
        console.log(formatKeyValue("Validated path", event.validatedPath));
        console.log(formatKeyValue("Output path", event.outputPath));
        console.log(formatKeyValue("Source encoding", event.sourceEncoding));
        console.log(formatKeyValue("Files queued", event.totalFiles));
        break;
      case "file_registered":
        console.log(
          `${theme.infoLabel("POSTGRES")} ${event.fileIndex}/${event.totalFiles} ${event.dataset} registered (${formatBytes(event.fileSize)}).`
        );
        break;
      case "finish":
        console.log(
          theme.successLabel("POSTGRES"),
          `Generated direct import script for ${event.totalFiles} file(s) (${formatBytes(event.totalBytes)}).`
        );
        console.log(formatKeyValue("Output path", event.outputPath));
        console.log(formatKeyValue("Script path", event.scriptPath));
        break;
      default:
        break;
    }
  };
}
|
|
9279
10533
|
|
|
9280
10534
|
// src/cli/ui/output/quarantine.ts
|
|
9281
10535
|
function printAppliedFilters(summaryFilters) {
|
|
@@ -10164,8 +11418,116 @@ function registerQuarantineCommands(program) {
|
|
|
10164
11418
|
});
|
|
10165
11419
|
}
|
|
10166
11420
|
|
|
11421
|
+
// src/cli/commands/register-postgres.ts
|
|
11422
|
+
/**
 * Registers the `postgres` command group and its two subcommands,
 * `generate-script` and `export-csv`, on the CLI program.
 *
 * Both subcommands ask for confirmation unless `--force` is given, forward
 * only the options the user actually supplied, write a command log and then
 * print a summary.
 *
 * @param {object} program - Root CLI program the group is attached to.
 */
function registerPostgresCommands(program) {
  // Copies each truthy CLI option onto the target under its API name.
  const copyProvidedOptions = (options, target, mapping) => {
    for (const [cliKey, apiKey] of mapping) {
      if (options[cliKey]) {
        target[apiKey] = options[cliKey];
      }
    }
    return target;
  };

  // Action for `postgres generate-script`.
  const runGenerateScript = async (input2, options) => {
    if (!options.force) {
      const accepted = await confirm(
        `Generate a direct PostgreSQL psql import script from ${input2}? This command does not rewrite the source files; it creates only a SQL script and manifest.`
      );
      if (!accepted) {
        console.log("PostgreSQL direct script generation cancelled.");
        return;
      }
    }
    const generateOptions = copyProvidedOptions(
      options,
      { onProgress: createPostgresDirectScriptProgressReporter() },
      [
        ["output", "outputPath"],
        ["dataset", "dataset"],
        ["scriptName", "scriptName"],
        ["sourceEncoding", "sourceEncoding"]
      ]
    );
    const summary = await generatePostgresDirectScript(input2, generateOptions);
    const logFilePath = await writeCommandLog(
      "postgres-generate-script",
      summary
    );
    printPostgresDirectScriptSummary(summary, logFilePath);
  };

  // Action for `postgres export-csv`.
  const runExportCsv = async (input2, options) => {
    if (!options.force) {
      const accepted = await confirm(
        `Export PostgreSQL-ready CSV files from ${input2}? This command creates normalized CSV files and a generated psql import script.`
      );
      if (!accepted) {
        console.log("PostgreSQL CSV export cancelled.");
        return;
      }
    }
    const exportOptions = copyProvidedOptions(
      options,
      { onProgress: createPostgresCsvExportProgressReporter() },
      [
        ["output", "outputPath"],
        ["dataset", "dataset"],
        ["scriptName", "scriptName"]
      ]
    );
    const summary = await exportPostgresCsvDataset(input2, exportOptions);
    const logFilePath = await writeCommandLog("postgres-export-csv", summary);
    printPostgresCsvExportSummary(summary, logFilePath);
  };

  const postgres = program
    .command("postgres")
    .description(
      "PostgreSQL-oriented helpers for hybrid loading and database operations."
    );

  postgres
    .command("generate-script")
    .argument(
      "<input>",
      "Path to the sanitized dataset directory generated by cnpj-db-loader sanitize."
    )
    .option(
      "--output <path>",
      "Custom output directory for the generated psql script and manifest."
    )
    .option(
      "--dataset <dataset>",
      "Generate a script only for one dataset block, for example establishments or companies."
    )
    .option(
      "--script-name <name>",
      "Generated psql script file name. Defaults to import-postgres-direct.sql."
    )
    .option(
      "--source-encoding <encoding>",
      "PostgreSQL client encoding used while reading sanitized Receita files. Defaults to UTF8."
    )
    .option("-f, --force", "Skip the confirmation prompt.")
    .description(
      "Generate a direct psql import script that loads sanitized Receita files without rewriting them into new CSV files."
    )
    .action(runGenerateScript);

  postgres
    .command("export-csv")
    .argument(
      "<input>",
      "Path to the sanitized or validated extracted dataset directory."
    )
    .option(
      "--output <path>",
      "Custom output directory for PostgreSQL-ready CSV files."
    )
    .option(
      "--dataset <dataset>",
      "Export only one dataset block, for example establishments or companies."
    )
    .option(
      "--script-name <name>",
      "Generated psql script file name. Defaults to import-postgres-direct.sql."
    )
    .option("-f, --force", "Skip the confirmation prompt.")
    .description(
      "Convert sanitized Receita files into real PostgreSQL-ready CSV files and generate a direct psql import script."
    )
    .action(runExportCsv);
}
|
|
11528
|
+
|
|
10167
11529
|
// src/cli/commands/register-schema.ts
|
|
10168
|
-
import
|
|
11530
|
+
import path19 from "path";
|
|
10169
11531
|
/**
 * Guarantees that a file name ends with ".sql".
 *
 * The check is case-insensitive, so an existing ".SQL" suffix is kept as is.
 *
 * @param {string} fileName - Candidate file name.
 * @returns {string} `fileName`, with ".sql" appended when it was missing.
 */
function ensureSqlExtension(fileName) {
  const alreadySql = fileName.toLowerCase().endsWith(".sql");
  if (alreadySql) {
    return fileName;
  }
  return `${fileName}.sql`;
}
|
|
@@ -10183,7 +11545,7 @@ function resolveSchemaOutputPath(profile, name, output2) {
|
|
|
10183
11545
|
const fileName = ensureSqlExtension(
|
|
10184
11546
|
name?.trim() || getDefaultSchemaBaseName(profile)
|
|
10185
11547
|
);
|
|
10186
|
-
return
|
|
11548
|
+
return path19.resolve(output2 ?? process.cwd(), fileName);
|
|
10187
11549
|
}
|
|
10188
11550
|
function registerSchemaCommands(program) {
|
|
10189
11551
|
const schema = program.command("schema").description(
|
|
@@ -10225,6 +11587,9 @@ function registerSanitizeCommands(program) {
|
|
|
10225
11587
|
).option(
|
|
10226
11588
|
"--dataset <dataset>",
|
|
10227
11589
|
"Sanitize only one validated dataset block (for example: establishments or companies)."
|
|
11590
|
+
).option(
|
|
11591
|
+
"--source-encoding <encoding>",
|
|
11592
|
+
"Source file encoding used while reading Receita files. Defaults to WIN1252 and writes clean UTF-8 output."
|
|
10228
11593
|
).option("-f, --force", "Skip the confirmation prompt.").description(
|
|
10229
11594
|
"Prepare a sanitized dataset tree before import by removing known low-level byte issues such as NUL bytes."
|
|
10230
11595
|
).action(
|
|
@@ -10248,6 +11613,9 @@ function registerSanitizeCommands(program) {
|
|
|
10248
11613
|
if (options.dataset) {
|
|
10249
11614
|
sanitizeOptions.dataset = options.dataset;
|
|
10250
11615
|
}
|
|
11616
|
+
if (options.sourceEncoding) {
|
|
11617
|
+
sanitizeOptions.sourceEncoding = options.sourceEncoding;
|
|
11618
|
+
}
|
|
10251
11619
|
const summary = await sanitizeInputDirectory(input2, sanitizeOptions);
|
|
10252
11620
|
const logFilePath = await writeCommandLog("sanitize", summary);
|
|
10253
11621
|
printSanitizeSummary(summary, logFilePath);
|
|
@@ -10270,12 +11638,25 @@ function registerValidateCommands(program) {
|
|
|
10270
11638
|
}
|
|
10271
11639
|
|
|
10272
11640
|
// src/cli/shared/app-config.ts
|
|
10273
|
-
import { readFileSync } from "fs";
|
|
11641
|
+
import { existsSync, readFileSync } from "fs";
|
|
10274
11642
|
import { dirname as dirname2, resolve } from "path";
|
|
10275
11643
|
import { fileURLToPath } from "url";
|
|
11644
|
+
/**
 * Looks for a package.json next to this module or in one of its ancestors.
 *
 * Six directories are probed in order: the module's own directory followed
 * by its five closest parents. The first existing file wins.
 *
 * @returns {string|null} Absolute path of the package.json found, or null.
 */
function findPackageJsonPath() {
  const candidates = [];
  let directory = dirname2(fileURLToPath(import.meta.url));
  while (candidates.length < 6) {
    candidates.push(resolve(directory, "package.json"));
    directory = resolve(directory, "..");
  }
  return candidates.find((candidate) => existsSync(candidate)) ?? null;
}
|
|
10276
11655
|
/**
 * Reads the package version shown by the CLI.
 *
 * The lookup is best effort: "0.0.0" is returned when no package.json is
 * found, when the file cannot be read or parsed, or when it has no
 * `version` field.
 *
 * @returns {string} The `version` from package.json, or "0.0.0".
 */
function getPackageVersion() {
  const fallbackVersion = "0.0.0";
  const packageJsonPath = findPackageJsonPath();
  if (!packageJsonPath) {
    return fallbackVersion;
  }
  try {
    const packageJson = JSON.parse(readFileSync(packageJsonPath, "utf-8"));
    return packageJson?.version ?? fallbackVersion;
  } catch {
    // The version is cosmetic; an unreadable or malformed package.json must
    // not prevent the CLI from starting, so use the same fallback as the
    // "file not found" case.
    return fallbackVersion;
  }
}
|
|
@@ -10302,9 +11683,14 @@ ${theme.section("Recommended flow")}
|
|
|
10302
11683
|
${theme.command("cnpj-db-loader database cleanup staging --force")}
|
|
10303
11684
|
${theme.command("cnpj-db-loader import ./downloads/<reference>/sanitized")}
|
|
10304
11685
|
|
|
11686
|
+
${theme.section("Hybrid PostgreSQL path")}
|
|
11687
|
+
${theme.command("cnpj-db-loader postgres generate-script ./downloads/<reference>/sanitized --output ./downloads/<reference>/postgres-direct --force")}
|
|
11688
|
+
${theme.command('psql "postgres://user:password@localhost:5432/cnpj" -f ./downloads/<reference>/postgres-direct/import-postgres-direct.sql')}
|
|
11689
|
+
|
|
10305
11690
|
${theme.section("Notes")}
|
|
10306
11691
|
${theme.muted("Use federal-revenue when you want the CLI to check/download the remote monthly dataset first. Use inspect first when you already have local files.")}
|
|
10307
11692
|
${theme.muted("Generate the schema only when you need to create the database structure. Sanitization is the recommended preparation step before recurring imports.")}
|
|
11693
|
+
${theme.muted("Use postgres generate-script when you want PostgreSQL to run the heavy bulk load directly from sanitized Receita files through a generated psql script.")}
|
|
10308
11694
|
${theme.muted("JSON execution logs are written inside the user home directory at ~/.cnpjdbloader/logs with structured level/event metadata.")}
|
|
10309
11695
|
`;
|
|
10310
11696
|
}
|
|
@@ -10321,6 +11707,7 @@ function buildProgram() {
|
|
|
10321
11707
|
registerSchemaCommands(program);
|
|
10322
11708
|
registerFederalRevenueCommands(program);
|
|
10323
11709
|
registerDatabaseCommands(program);
|
|
11710
|
+
registerPostgresCommands(program);
|
|
10324
11711
|
registerImportCommands(program);
|
|
10325
11712
|
registerQuarantineCommands(program);
|
|
10326
11713
|
registerDoctorCommands(program);
|