@danielarndt0/cnpj-db-loader 2.3.1 → 2.4.0-beta.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +22 -2
- package/dist/cli.js +1544 -157
- package/dist/cli.js.map +1 -1
- package/dist/index.d.ts +134 -1
- package/dist/index.js +1174 -58
- package/dist/index.js.map +1 -1
- package/docs/architecture.md +9 -1
- package/docs/cli.md +1 -1
- package/docs/commands.md +23 -0
- package/docs/postgres-direct.md +138 -0
- package/docs/sanitize.md +52 -16
- package/docs/usage.md +14 -0
- package/package.json +3 -3
package/dist/index.js
CHANGED
|
@@ -3269,6 +3269,15 @@ function createFieldValueParser(dataType) {
|
|
|
3269
3269
|
};
|
|
3270
3270
|
}
|
|
3271
3271
|
}
|
|
3272
|
+
/**
 * Converts one raw field string into the value that is written to the database.
 *
 * @param {string} dataType - Layout data type, forwarded to createFieldValueParser.
 * @param {string} rawValue - Raw field text.
 * @returns {*} Whatever the parser built for `dataType` returns for `rawValue`.
 */
function toDatabaseValue(dataType, rawValue) {
  // NOTE(review): a parser is created on every call; if the parsers are
  // stateless they could be cached per data type - confirm before changing.
  const parseFieldValue = createFieldValueParser(dataType);
  return parseFieldValue(rawValue);
}
|
|
3275
|
+
/**
 * Returns the trimmed code, or `fallback` when the value is not a string or
 * is blank after trimming.
 *
 * @param {*} value - Candidate code value.
 * @param {*} fallback - Value returned when `value` is unusable.
 * @returns {*} The trimmed string or `fallback`.
 */
function normalizeCode(value, fallback) {
  if (typeof value !== "string") {
    return fallback;
  }
  const trimmed = value.trim();
  return trimmed === "" ? fallback : trimmed;
}
|
|
3272
3281
|
function createPartnerDedupeKeyBuilder(indices) {
|
|
3273
3282
|
const orderedIndices = [
|
|
3274
3283
|
indices.cnpjRoot,
|
|
@@ -3296,6 +3305,60 @@ function createEstablishmentCnpjFullBuilder(indices) {
|
|
|
3296
3305
|
return `${root}${order}${digits}`;
|
|
3297
3306
|
};
|
|
3298
3307
|
}
|
|
3308
|
+
/**
 * Builds the partner dedupe key from a record keyed by column name.
 *
 * The listed columns are read in a fixed order, null/undefined become empty
 * strings, every other value is stringified and trimmed, and the parts are
 * joined with "|".
 *
 * @param {Object} recordByColumn - Partner record keyed by column name.
 * @returns {string} The "|"-joined key.
 */
function buildPartnerDedupeKey(recordByColumn) {
  // The order is part of the key format; do not reorder.
  const keyColumns = [
    "cnpj_root",
    "partner_type_code",
    "partner_name",
    "partner_document",
    "partner_qualification_code",
    "entry_date",
    "country_code",
    "legal_representative_document",
    "legal_representative_name",
    "legal_representative_qualification_code",
    "age_group_code",
  ];
  return keyColumns
    .map((column) => {
      const value = recordByColumn[column];
      return value == null ? "" : String(value).trim();
    })
    .join("|");
}
|
|
3323
|
+
/**
 * Parses one raw row according to `layout` and returns the values in layout
 * order, with dataset-specific code defaults applied.
 *
 * - companies: blank `company_size_code` becomes "00".
 * - establishments: blank `branch_type_code` becomes "1" and blank
 *   `registration_status_code` becomes "01".
 * - When `writeTarget` is "final", one extra trailing value is appended if the
 *   matching schema capability flag is set: the concatenated CNPJ for
 *   establishments, or the partner dedupe key for partners.
 *
 * @param {string} dataset - Dataset name.
 * @param {{fields: Array<{columnName: string, dataType: string}>}} layout - Field layout.
 * @param {string[]} rawFields - Raw field strings; missing entries are read as "".
 * @param {Object} schemaCapabilities - Flags read only when `writeTarget` is "final".
 * @param {string} writeTarget - Write target; only "final" triggers the extra column.
 * @returns {Array} Values ready for insertion.
 */
function transformRecord(dataset, layout, rawFields, schemaCapabilities, writeTarget) {
  const recordByColumn = Object.fromEntries(
    layout.fields.map((field, index) => [
      field.columnName,
      toDatabaseValue(field.dataType, rawFields[index] ?? ""),
    ])
  );

  if (dataset === "companies") {
    recordByColumn.company_size_code = normalizeCode(
      recordByColumn.company_size_code,
      "00"
    );
  }
  if (dataset === "establishments") {
    recordByColumn.branch_type_code = normalizeCode(
      recordByColumn.branch_type_code,
      "1"
    );
    recordByColumn.registration_status_code = normalizeCode(
      recordByColumn.registration_status_code,
      "01"
    );
  }

  // Re-read through the record so the defaults above are reflected.
  const normalizedValues = layout.fields.map(
    (field) => recordByColumn[field.columnName]
  );

  if (writeTarget !== "final") {
    return normalizedValues;
  }
  if (dataset === "establishments" && schemaCapabilities.includeEstablishmentCnpjFullInInsert) {
    const cnpjFull = `${recordByColumn.cnpj_root ?? ""}${recordByColumn.cnpj_order ?? ""}${recordByColumn.cnpj_check_digits ?? ""}`;
    return [...normalizedValues, cnpjFull];
  }
  if (dataset === "partners" && schemaCapabilities.includePartnerDedupeKeyInInsert) {
    return [...normalizedValues, buildPartnerDedupeKey(recordByColumn)];
  }
  return normalizedValues;
}
|
|
3299
3362
|
function buildParsedPayload(columns, values) {
|
|
3300
3363
|
return Object.fromEntries(
|
|
3301
3364
|
columns.map((column, index) => [column, values[index] ?? null])
|
|
@@ -3435,7 +3498,7 @@ function createImportRowNormalizer(input) {
|
|
|
3435
3498
|
"cnpj_check_digits"
|
|
3436
3499
|
)
|
|
3437
3500
|
}) : null;
|
|
3438
|
-
const
|
|
3501
|
+
const buildPartnerDedupeKey2 = appendPartnerDedupeKey ? createPartnerDedupeKeyBuilder({
|
|
3439
3502
|
cnpjRoot: resolveLayoutColumnIndex(input.layout, "cnpj_root"),
|
|
3440
3503
|
partnerTypeCode: resolveLayoutColumnIndex(
|
|
3441
3504
|
input.layout,
|
|
@@ -3495,8 +3558,8 @@ function createImportRowNormalizer(input) {
|
|
|
3495
3558
|
if (buildEstablishmentCnpjFull) {
|
|
3496
3559
|
values.push(buildEstablishmentCnpjFull(values));
|
|
3497
3560
|
}
|
|
3498
|
-
if (
|
|
3499
|
-
values.push(
|
|
3561
|
+
if (buildPartnerDedupeKey2) {
|
|
3562
|
+
values.push(buildPartnerDedupeKey2(values));
|
|
3500
3563
|
}
|
|
3501
3564
|
return {
|
|
3502
3565
|
values,
|
|
@@ -7790,81 +7853,264 @@ function isRecognizedSanitizeEntry(entry) {
|
|
|
7790
7853
|
return entry.entryKind === "file" && entry.inferredType !== "zip-archive" && entry.inferredType !== "unknown";
|
|
7791
7854
|
}
|
|
7792
7855
|
|
|
7856
|
+
// src/services/sanitize/encoding.ts
|
|
7857
|
+
import { StringDecoder } from "string_decoder";
|
|
7858
|
+
// Characters for the byte values 128-159 that this module maps when the source
// encoding is WIN1252. Byte values without an entry here (129, 141, 143, 144,
// 157) have no mapping.
var WINDOWS_1252_C1_MAP = {
  128: "\u20AC",
  130: "\u201A",
  131: "\u0192",
  132: "\u201E",
  133: "\u2026",
  134: "\u2020",
  135: "\u2021",
  136: "\u02C6",
  137: "\u2030",
  138: "\u0160",
  139: "\u2039",
  140: "\u0152",
  142: "\u017D",
  145: "\u2018",
  146: "\u2019",
  147: "\u201C",
  148: "\u201D",
  149: "\u2022",
  150: "\u2013",
  151: "\u2014",
  152: "\u02DC",
  153: "\u2122",
  154: "\u0161",
  155: "\u203A",
  156: "\u0153",
  158: "\u017E",
  159: "\u0178"
};
|
|
7887
|
+
/**
 * Maps a user-supplied encoding name to one of "WIN1252", "LATIN1" or "UTF8".
 *
 * Matching ignores case and surrounding whitespace and treats "_" as "-".
 * A null/undefined value selects "WIN1252".
 *
 * @param {string|null|undefined} value - Encoding name.
 * @returns {"WIN1252"|"LATIN1"|"UTF8"} The canonical encoding name.
 * @throws {ValidationError} When the value is not a supported encoding name.
 */
function normalizeSanitizeSourceEncoding(value) {
  const unsupported = () => new ValidationError(
    `Unsupported sanitize source encoding: ${String(value)}. Supported values: WIN1252, LATIN1, UTF8.`
  );
  // A non-string (e.g. a number from a config file) used to reach `.trim()`
  // and fail with a bare TypeError; report it as an unsupported value instead.
  if (value != null && typeof value !== "string") {
    throw unsupported();
  }
  const normalized = (value ?? "WIN1252").trim().toUpperCase().replace(/_/g, "-");
  switch (normalized) {
    case "WIN1252":
    case "WINDOWS-1252":
    case "CP1252":
      return "WIN1252";
    case "LATIN1":
    case "LATIN-1":
    case "ISO-8859-1":
    case "ISO8859-1":
      return "LATIN1";
    case "UTF8":
    case "UTF-8":
      return "UTF8";
    default:
      throw unsupported();
  }
}
|
|
7908
|
+
/**
 * Tells whether a code point is one of the control characters that are kept:
 * tab (9), line feed (10) or carriage return (13).
 *
 * @param {number} codePoint - Code point or byte value.
 * @returns {boolean} True for 9, 10 and 13.
 */
function isAllowedControlCodePoint(codePoint) {
  switch (codePoint) {
    case 9:
    case 10:
    case 13:
      return true;
    default:
      return false;
  }
}
|
|
7911
|
+
/**
 * Tells whether a code point should be dropped as a control character.
 *
 * Flags 0-31 (except the allowed tab/LF/CR), 127, 128-159 and 65279 (U+FEFF).
 *
 * @param {number} codePoint - Code point to classify.
 * @returns {boolean} True when the code point is to be removed.
 */
function isProblematicControlCodePoint(codePoint) {
  if (isAllowedControlCodePoint(codePoint)) {
    return false;
  }
  if (codePoint >= 0 && codePoint <= 31) {
    return true;
  }
  if (codePoint >= 128 && codePoint <= 159) {
    return true;
  }
  return codePoint === 127 || codePoint === 65279;
}
|
|
7917
|
+
/**
 * Removes U+FFFD replacement characters and problematic control characters
 * from already-decoded text, counting each kind of removal.
 *
 * @param {string} text - Decoded text.
 * @returns {{text: string, invalidBytesRemoved: number, controlCharsRemoved: number}}
 *   The cleaned text, the number of U+FFFD characters dropped, and the number
 *   of control characters dropped.
 */
function sanitizeDecodedText(text) {
  const kept = [];
  let invalidBytesRemoved = 0;
  let controlCharsRemoved = 0;
  // for...of walks code points, so surrogate pairs stay intact.
  for (const char of text) {
    const codePoint = char.codePointAt(0);
    if (codePoint === 65533) {
      invalidBytesRemoved += 1;
    } else if (isProblematicControlCodePoint(codePoint)) {
      controlCharsRemoved += 1;
    } else {
      kept.push(char);
    }
  }
  return {
    text: kept.join(""),
    invalidBytesRemoved,
    controlCharsRemoved
  };
}
|
|
7939
|
+
/**
 * Converts raw input chunks to cleaned text for one source encoding and
 * reports how many NUL bytes, invalid bytes and control characters were
 * dropped.
 *
 * For "UTF8" a StringDecoder carries incomplete multi-byte sequences across
 * chunk boundaries; call flush() after the last chunk to drain it. Any other
 * encoding is processed byte by byte.
 */
var SanitizeEncodingNormalizer = class {
  /**
   * @param {string} sourceEncoding - "UTF8" enables the streaming UTF-8 decoder;
   *   "WIN1252" enables the 128-159 character mapping.
   */
  constructor(sourceEncoding) {
    this.sourceEncoding = sourceEncoding;
    this.utf8Decoder = sourceEncoding === "UTF8" ? new StringDecoder("utf8") : void 0;
  }
  sourceEncoding;
  utf8Decoder;
  /**
   * Cleans one chunk.
   *
   * @param {Buffer} chunk - Raw bytes.
   * @returns {{text: string, nulBytesRemoved: number, invalidBytesRemoved: number, controlCharsRemoved: number}}
   */
  normalizeChunk(chunk) {
    if (this.sourceEncoding === "UTF8") {
      return this.sanitizeUtf8Text(this.utf8Decoder.write(chunk));
    }
    return this.normalizeSingleByteChunk(chunk);
  }
  /**
   * Drains whatever the UTF-8 decoder still buffers; returns an empty result
   * when no decoder is in use.
   *
   * @returns {{text: string, nulBytesRemoved: number, invalidBytesRemoved: number, controlCharsRemoved: number}}
   */
  flush() {
    if (!this.utf8Decoder) {
      return {
        text: "",
        nulBytesRemoved: 0,
        invalidBytesRemoved: 0,
        controlCharsRemoved: 0
      };
    }
    return this.sanitizeUtf8Text(this.utf8Decoder.end());
  }
  /**
   * Sanitizes decoded UTF-8 text and separates NUL removals from the other
   * control-character removals.
   *
   * @param {string} decoded - Text produced by the UTF-8 decoder.
   * @returns {{text: string, nulBytesRemoved: number, invalidBytesRemoved: number, controlCharsRemoved: number}}
   */
  sanitizeUtf8Text(decoded) {
    const sanitized = sanitizeDecodedText(decoded);
    let nulBytesRemoved = 0;
    for (let at = decoded.indexOf("\0"); at !== -1; at = decoded.indexOf("\0", at + 1)) {
      nulBytesRemoved += 1;
    }
    return {
      ...sanitized,
      // sanitizeDecodedText counts NUL (code point 0) as a control character;
      // subtract so a NUL is reported once, as the single-byte path does.
      controlCharsRemoved: Math.max(0, sanitized.controlCharsRemoved - nulBytesRemoved),
      nulBytesRemoved
    };
  }
  /**
   * Cleans a chunk of a single-byte encoding.
   *
   * NUL bytes are dropped; other bytes below 32 and 127 are dropped unless
   * they are tab/LF/CR; 128-159 are mapped for "WIN1252" (unmapped values are
   * counted as invalid) and dropped as control characters otherwise; every
   * other byte becomes the character with the same code.
   *
   * @param {Iterable<number>} chunk - Raw bytes.
   * @returns {{text: string, nulBytesRemoved: number, invalidBytesRemoved: number, controlCharsRemoved: number}}
   */
  normalizeSingleByteChunk(chunk) {
    const output = [];
    let nulBytesRemoved = 0;
    let invalidBytesRemoved = 0;
    let controlCharsRemoved = 0;
    for (const byte of chunk) {
      if (byte === 0) {
        nulBytesRemoved += 1;
      } else if (byte < 32 || byte === 127) {
        if (isAllowedControlCodePoint(byte)) {
          output.push(String.fromCharCode(byte));
        } else {
          controlCharsRemoved += 1;
        }
      } else if (byte >= 128 && byte <= 159) {
        if (this.sourceEncoding !== "WIN1252") {
          controlCharsRemoved += 1;
        } else {
          const mapped = WINDOWS_1252_C1_MAP[byte];
          if (mapped === void 0) {
            invalidBytesRemoved += 1;
          } else {
            output.push(mapped);
          }
        }
      } else {
        output.push(String.fromCharCode(byte));
      }
    }
    return {
      text: output.join(""),
      nulBytesRemoved,
      invalidBytesRemoved,
      controlCharsRemoved
    };
  }
};
|
|
8018
|
+
|
|
7793
8019
|
// src/services/sanitize/runner.ts
|
|
7794
8020
|
import { createReadStream as createReadStream2, createWriteStream as createWriteStream2 } from "fs";
|
|
7795
8021
|
import { mkdir as mkdir7 } from "fs/promises";
|
|
7796
8022
|
import path13 from "path";
|
|
7797
|
-
function
|
|
7798
|
-
|
|
7799
|
-
|
|
7800
|
-
if (chunk[index] === 0) {
|
|
7801
|
-
removed += 1;
|
|
7802
|
-
}
|
|
8023
|
+
/**
 * Writes text to a writable stream as UTF-8 and, when the stream signals
 * backpressure, waits for it to drain before returning.
 *
 * @param {import("stream").Writable} output - Destination stream.
 * @param {string} value - Text to write; empty text is skipped.
 * @returns {Promise<void>} Resolves once more data may be written.
 * @throws Rejects with the stream's error if it fails while waiting for drain.
 */
async function writeUtf8(output, value) {
  if (value.length === 0) {
    return;
  }
  if (output.write(value, "utf8")) {
    return;
  }
  await new Promise((resolve, reject) => {
    // Each handler removes its counterpart; otherwise one stale listener
    // would be left on the stream for every backpressure wait.
    const onDrain = () => {
      output.removeListener("error", onError);
      resolve();
    };
    const onError = (error) => {
      output.removeListener("drain", onDrain);
      reject(error);
    };
    output.once("drain", onDrain);
    output.once("error", onError);
  });
}
|
|
8034
|
+
/**
 * Counts the "\n" characters in a string.
 *
 * @param {string} value - Text to scan.
 * @returns {number} Number of line feeds found.
 */
function countNewlines(value) {
  let count = 0;
  for (let index = 0; index < value.length; index += 1) {
    if (value[index] === "\n") {
      count += 1;
    }
  }
  return count;
}
|
|
7818
|
-
async function sanitizeDatasetFile(plan, onChunk) {
|
|
8043
|
+
/**
 * Streams one dataset file through the encoding normalizer and writes the
 * cleaned text to `plan.outputPath` as UTF-8.
 *
 * @param {{absolutePath: string, outputPath: string, fileSize: number}} plan -
 *   Input path, output path and input size for the file.
 * @param {Function} [onChunk] - Called after every input chunk with running
 *   byte, row and removal counters.
 * @param {{sourceEncoding?: string}} [options] - Source encoding option,
 *   validated by normalizeSanitizeSourceEncoding.
 * @returns {Promise<Object>} Summary with byte counts, removal counts, the
 *   line count, the resolved source encoding and a `changed` flag.
 * @throws Propagates read errors and output-stream errors.
 */
async function sanitizeDatasetFile(plan, onChunk, options = {}) {
  await mkdir7(path13.dirname(plan.outputPath), { recursive: true });
  const sourceEncoding = normalizeSanitizeSourceEncoding(
    options.sourceEncoding
  );
  const normalizer = new SanitizeEncodingNormalizer(sourceEncoding);
  const input = createReadStream2(plan.absolutePath);
  const output = createWriteStream2(plan.outputPath, { encoding: "utf8" });
  // Keep a listener attached for the stream's whole life: an "error" event
  // with no listener is thrown as an uncaught exception, and the first error
  // must be remembered so it can be reported even when nobody is awaiting it.
  let outputError;
  output.on("error", (error) => {
    outputError ??= error;
  });
  let totalBytesRead = 0;
  let totalBytesWritten = 0;
  let nulBytesRemoved = 0;
  let invalidBytesRemoved = 0;
  let controlCharsRemoved = 0;
  let lineCount = 0;
  let sawAnyCharacter = false;
  let lastCharacterWasNewline = false;
  const processText = async (text) => {
    if (text.length === 0) {
      return;
    }
    if (outputError) {
      throw outputError;
    }
    sawAnyCharacter = true;
    lineCount += countNewlines(text);
    lastCharacterWasNewline = text.endsWith("\n");
    totalBytesWritten += Buffer.byteLength(text, "utf8");
    await writeUtf8(output, text);
  };
  try {
    for await (const chunk of input) {
      const chunkBuffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
      totalBytesRead += chunkBuffer.length;
      const normalized = normalizer.normalizeChunk(chunkBuffer);
      nulBytesRemoved += normalized.nulBytesRemoved;
      invalidBytesRemoved += normalized.invalidBytesRemoved;
      controlCharsRemoved += normalized.controlCharsRemoved;
      await processText(normalized.text);
      onChunk?.({
        bytesProcessed: chunkBuffer.length,
        fileBytesProcessed: totalBytesRead,
        currentFileSize: plan.fileSize,
        processedRows: lineCount,
        nulBytesRemoved,
        invalidBytesRemoved,
        controlCharsRemoved
      });
    }
    const flushed = normalizer.flush();
    nulBytesRemoved += flushed.nulBytesRemoved;
    invalidBytesRemoved += flushed.invalidBytesRemoved;
    controlCharsRemoved += flushed.controlCharsRemoved;
    await processText(flushed.text);
    // A final line without a trailing newline still counts as a line.
    if (sawAnyCharacter && !lastCharacterWasNewline) {
      lineCount += 1;
    }
  } finally {
    input.close();
    await new Promise((resolve) => {
      // An already-destroyed stream (e.g. after a write error) never emits
      // "finish"; waiting for it would hang forever.
      if (output.destroyed) {
        resolve();
        return;
      }
      // "close" follows both a successful finish and a failure.
      output.once("close", () => resolve());
      output.end();
    });
  }
  if (outputError) {
    throw outputError;
  }
  return {
    plan,
    totalBytesRead,
    totalBytesWritten,
    sourceEncoding,
    nulBytesRemoved,
    invalidBytesRemoved,
    controlCharsRemoved,
    lineCount,
    changed: nulBytesRemoved > 0 || invalidBytesRemoved > 0 || controlCharsRemoved > 0 || totalBytesRead !== totalBytesWritten
  };
}
|
|
7870
8116
|
|
|
@@ -7927,40 +8173,54 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
|
|
|
7927
8173
|
"No recognized validated dataset files were found for sanitization."
|
|
7928
8174
|
);
|
|
7929
8175
|
}
|
|
8176
|
+
const sourceEncoding = normalizeSanitizeSourceEncoding(
|
|
8177
|
+
options.sourceEncoding
|
|
8178
|
+
);
|
|
7930
8179
|
options.onProgress?.({
|
|
7931
8180
|
kind: "start",
|
|
7932
8181
|
validatedPath,
|
|
7933
8182
|
outputPath,
|
|
7934
8183
|
totalFiles: plan.totalFiles,
|
|
7935
8184
|
totalBytes: plan.totalBytes,
|
|
7936
|
-
datasets: plan.datasets
|
|
8185
|
+
datasets: plan.datasets,
|
|
8186
|
+
sourceEncoding
|
|
7937
8187
|
});
|
|
7938
8188
|
let processedFiles = 0;
|
|
7939
8189
|
let processedRows = 0;
|
|
7940
8190
|
let processedBytes = 0;
|
|
7941
8191
|
let nulBytesRemoved = 0;
|
|
8192
|
+
let invalidBytesRemoved = 0;
|
|
8193
|
+
let controlCharsRemoved = 0;
|
|
7942
8194
|
let changedFiles = 0;
|
|
7943
8195
|
const fileSummaries = [];
|
|
7944
8196
|
for (const [index, filePlan] of plan.files.entries()) {
|
|
7945
|
-
const fileResult = await sanitizeDatasetFile(
|
|
7946
|
-
|
|
7947
|
-
|
|
7948
|
-
|
|
7949
|
-
|
|
7950
|
-
|
|
7951
|
-
|
|
7952
|
-
|
|
7953
|
-
|
|
7954
|
-
|
|
7955
|
-
|
|
7956
|
-
|
|
7957
|
-
|
|
7958
|
-
|
|
7959
|
-
|
|
8197
|
+
const fileResult = await sanitizeDatasetFile(
|
|
8198
|
+
filePlan,
|
|
8199
|
+
(chunk) => {
|
|
8200
|
+
options.onProgress?.({
|
|
8201
|
+
kind: "progress",
|
|
8202
|
+
currentFileDisplayPath: filePlan.displayPath,
|
|
8203
|
+
fileIndex: index + 1,
|
|
8204
|
+
totalFiles: plan.totalFiles,
|
|
8205
|
+
bytesProcessed: processedBytes + chunk.fileBytesProcessed,
|
|
8206
|
+
totalBytes: plan.totalBytes,
|
|
8207
|
+
fileBytesProcessed: chunk.fileBytesProcessed,
|
|
8208
|
+
currentFileSize: chunk.currentFileSize,
|
|
8209
|
+
processedRows: processedRows + chunk.processedRows,
|
|
8210
|
+
nulBytesRemoved: nulBytesRemoved + chunk.nulBytesRemoved,
|
|
8211
|
+
invalidBytesRemoved: invalidBytesRemoved + chunk.invalidBytesRemoved,
|
|
8212
|
+
controlCharsRemoved: controlCharsRemoved + chunk.controlCharsRemoved,
|
|
8213
|
+
changedFiles
|
|
8214
|
+
});
|
|
8215
|
+
},
|
|
8216
|
+
{ sourceEncoding }
|
|
8217
|
+
);
|
|
7960
8218
|
processedFiles += 1;
|
|
7961
8219
|
processedRows += fileResult.lineCount;
|
|
7962
8220
|
processedBytes += fileResult.totalBytesRead;
|
|
7963
8221
|
nulBytesRemoved += fileResult.nulBytesRemoved;
|
|
8222
|
+
invalidBytesRemoved += fileResult.invalidBytesRemoved;
|
|
8223
|
+
controlCharsRemoved += fileResult.controlCharsRemoved;
|
|
7964
8224
|
changedFiles += fileResult.changed ? 1 : 0;
|
|
7965
8225
|
fileSummaries.push({
|
|
7966
8226
|
dataset: filePlan.dataset,
|
|
@@ -7968,7 +8228,9 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
|
|
|
7968
8228
|
outputPath: filePlan.outputPath,
|
|
7969
8229
|
lineCount: fileResult.lineCount,
|
|
7970
8230
|
changed: fileResult.changed,
|
|
7971
|
-
nulBytesRemoved: fileResult.nulBytesRemoved
|
|
8231
|
+
nulBytesRemoved: fileResult.nulBytesRemoved,
|
|
8232
|
+
invalidBytesRemoved: fileResult.invalidBytesRemoved,
|
|
8233
|
+
controlCharsRemoved: fileResult.controlCharsRemoved
|
|
7972
8234
|
});
|
|
7973
8235
|
}
|
|
7974
8236
|
options.onProgress?.({
|
|
@@ -7976,6 +8238,8 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
|
|
|
7976
8238
|
totalFiles: plan.totalFiles,
|
|
7977
8239
|
processedRows,
|
|
7978
8240
|
nulBytesRemoved,
|
|
8241
|
+
invalidBytesRemoved,
|
|
8242
|
+
controlCharsRemoved,
|
|
7979
8243
|
changedFiles,
|
|
7980
8244
|
totalBytes: plan.totalBytes
|
|
7981
8245
|
});
|
|
@@ -7987,13 +8251,17 @@ async function sanitizeInputDirectory(inputPath, options = {}) {
|
|
|
7987
8251
|
totalBytes: plan.totalBytes,
|
|
7988
8252
|
processedFiles,
|
|
7989
8253
|
processedRows,
|
|
8254
|
+
sourceEncoding,
|
|
7990
8255
|
nulBytesRemoved,
|
|
8256
|
+
invalidBytesRemoved,
|
|
8257
|
+
controlCharsRemoved,
|
|
7991
8258
|
changedFiles,
|
|
7992
8259
|
unchangedFiles: plan.totalFiles - changedFiles,
|
|
7993
8260
|
datasets: plan.datasets,
|
|
7994
8261
|
files: fileSummaries,
|
|
7995
8262
|
warnings: [
|
|
7996
|
-
"Sanitization
|
|
8263
|
+
"Sanitization now writes UTF-8 output and removes invalid bytes plus problematic control characters before PostgreSQL loading begins.",
|
|
8264
|
+
"The PostgreSQL direct import path can use --source-encoding UTF8 when reading files generated by this sanitization command.",
|
|
7997
8265
|
"The import command still keeps quarantine and row-level recovery for unexpected issues, but sanitizing first reduces the amount of slow fallback work during import."
|
|
7998
8266
|
],
|
|
7999
8267
|
nextStep: inferNextStep3(outputPath)
|
|
@@ -8096,6 +8364,852 @@ async function syncFederalRevenueDataset(options = {}) {
|
|
|
8096
8364
|
() => runFederalRevenueSyncPipeline(options, check.selectedReference)
|
|
8097
8365
|
);
|
|
8098
8366
|
}
|
|
8367
|
+
|
|
8368
|
+
// src/services/postgres-direct/exporter.ts
|
|
8369
|
+
import { createWriteStream as createWriteStream3 } from "fs";
|
|
8370
|
+
import { mkdir as mkdir8, writeFile as writeFile5 } from "fs/promises";
|
|
8371
|
+
import path16 from "path";
|
|
8372
|
+
|
|
8373
|
+
// src/services/postgres-direct/csv.ts
|
|
8374
|
+
/**
 * Formats one value as a CSV field.
 *
 * null/undefined become an empty field, Date values are written via
 * toISOString(), everything else is stringified. A field containing a double
 * quote, comma, CR or LF is wrapped in double quotes with embedded quotes
 * doubled.
 *
 * @param {*} value - Value to format.
 * @returns {string} The CSV field text.
 */
function formatCsvValue(value) {
  if (value === null || value === void 0) {
    return "";
  }
  const text = value instanceof Date ? value.toISOString() : String(value);
  if (!/[",\r\n]/.test(text)) {
    return text;
  }
  return `"${text.replace(/"/g, '""')}"`;
}
|
|
8388
|
+
/**
 * Formats a list of values as one comma-separated CSV row (no line ending).
 *
 * @param {Array} values - Field values in column order.
 * @returns {string} The CSV row.
 */
function formatCsvRow(values) {
  // Wrap the call so map's index/array arguments are not passed along.
  return values.map((value) => formatCsvValue(value)).join(",");
}
|
|
8391
|
+
|
|
8392
|
+
// src/services/postgres-direct/script.ts
|
|
8393
|
+
import path15 from "path";
|
|
8394
|
+
// Dataset names listed as staging datasets.
var STAGING_DATASETS = [
  "companies",
  "establishments",
  "partners",
  "simples_options"
];
// Dataset names listed as domain (lookup) datasets.
var DOMAIN_DATASETS = [
  "partner_qualifications",
  "legal_natures",
  "countries",
  "cities",
  "reasons",
  "cnaes"
];
// Staging table name for each staging dataset.
var STAGING_TABLE_BY_DATASET3 = {
  companies: "staging_companies",
  establishments: "staging_establishments",
  partners: "staging_partners",
  simples_options: "staging_simples_options"
};
|
|
8414
|
+
/**
 * Wraps a string in single quotes, doubling any embedded single quote.
 *
 * @param {string} value - Text to quote.
 * @returns {string} The single-quoted literal.
 */
function quoteSqlLiteral(value) {
  const escaped = value.replace(/'/g, "''");
  return `'${escaped}'`;
}
|
|
8417
|
+
/**
 * Wraps a name in double quotes, doubling any embedded double quote.
 *
 * @param {string} value - Identifier to quote.
 * @returns {string} The double-quoted identifier.
 */
function quoteIdentifier(value) {
  const escaped = value.replace(/"/g, '""');
  return `"${escaped}"`;
}
|
|
8420
|
+
/**
 * Resolves a path to an absolute one and replaces every backslash with a
 * forward slash.
 *
 * @param {string} filePath - Path to normalize.
 * @returns {string} Absolute path using "/" only.
 */
function normalizePathForPsql(filePath) {
  // NOTE(review): the replacement also rewrites a backslash that is part of a
  // file name on POSIX systems - confirm such names cannot occur.
  const absolutePath = path15.resolve(filePath);
  return absolutePath.replace(/\\/g, "/");
}
|
|
8423
|
+
/**
 * Builds a psql `\copy ... from` command for a comma-delimited CSV file with
 * a header row, where empty fields are read as NULL.
 *
 * @param {string} tableName - Target table, inserted as-is.
 * @param {string[]} columns - Target columns, inserted as-is.
 * @param {string} filePath - CSV file path; normalized and single-quoted.
 * @returns {string} The `\copy` command line.
 */
function csvCopyCommand(tableName, columns, filePath) {
  const source = quoteSqlLiteral(normalizePathForPsql(filePath));
  const columnList = columns.join(", ");
  return `\\copy ${tableName} (${columnList}) from ${source} with (format csv, header true, delimiter ',', quote '"', escape '"', null '')`;
}
|
|
8427
|
+
/**
 * Builds a psql `\copy ... from` command for a semicolon-delimited file
 * without a header row.
 *
 * @param {string} tableName - Target table, inserted as-is.
 * @param {string[]} columns - Target columns, inserted as-is.
 * @param {string} filePath - Source file path; normalized and single-quoted.
 * @returns {string} The `\copy` command line.
 */
function receitaCopyCommand(tableName, columns, filePath) {
  const source = quoteSqlLiteral(normalizePathForPsql(filePath));
  const columnList = columns.join(", ");
  return `\\copy ${tableName} (${columnList}) from ${source} with (format csv, header false, delimiter ';', quote '"', escape '"')`;
}
|
|
8431
|
+
/**
 * Lists the column names of a dataset's layout, in layout order.
 *
 * @param {string} dataset - Key into DATASET_LAYOUTS.
 * @returns {string[]} Column names.
 */
function datasetColumns(dataset) {
  const { fields } = DATASET_LAYOUTS[dataset];
  return fields.map((field) => field.columnName);
}
|
|
8434
|
+
/**
 * Builds the assignment list of an `on conflict ... do update set` clause:
 * `col = excluded.col` for every column that is not excluded, followed by
 * `updated_at = now()`.
 *
 * @param {string[]} columns - Candidate columns, in output order.
 * @param {string[]} excludedColumns - Columns to leave out.
 * @returns {string} Assignments joined with ",\n ".
 */
function updateAssignments(columns, excludedColumns) {
  const excluded = new Set(excludedColumns);
  const assignments = columns
    .filter((column) => !excluded.has(column))
    .map((column) => `${column} = excluded.${column}`);
  assignments.push("updated_at = now()");
  return assignments.join(",\n ");
}
|
|
8437
|
+
/**
 * Builds the SQL expression for the partner dedupe key: md5 over the listed
 * partner columns, each coalesced to '' and separated by '|'. `entry_date` is
 * turned into text as its difference from date '2000-01-01'.
 *
 * @param {string} alias - Table alias that prefixes every column, inserted as-is.
 * @returns {string} Multi-line SQL expression.
 */
function partnerDedupeExpression(alias) {
  // The operand order is part of the key; do not reorder.
  const operands = [
    `${alias}.cnpj_root`,
    `${alias}.partner_type_code`,
    `${alias}.partner_name`,
    `${alias}.partner_document`,
    `${alias}.partner_qualification_code`,
    `(${alias}.entry_date - date '2000-01-01')::text`,
    `${alias}.country_code`,
    `${alias}.legal_representative_document`,
    `${alias}.legal_representative_name`,
    `${alias}.legal_representative_qualification_code`,
    `${alias}.age_group_code`
  ];
  const coalesced = operands.map((operand) => ` coalesce(${operand}, '')`);
  return ["md5(", coalesced.join(" || '|' ||\n"), ")"].join("\n");
}
|
|
8454
|
+
/**
 * Builds the psql script fragment that copies `staging_companies` into
 * `companies`: the row with the highest `staging_id` per `cnpj_root` is kept
 * and upserted on `cnpj_root`.
 *
 * @returns {string} Script fragment (a `\echo` line plus one SQL statement).
 */
function materializeCompaniesSql() {
  const columns = companiesLayout.fields.map((field) => field.columnName);
  const columnList = columns.join(", ");
  const sourceColumns = columns.map((column) => `source.${column}`).join(",\n ");
  return [
    "\\echo 'Materializing companies...'",
    "with source as (",
    " select",
    ` ${sourceColumns},`,
    " row_number() over (partition by source.cnpj_root order by source.staging_id desc) as dedupe_rank",
    " from staging_companies source",
    "),",
    "deduped as (",
    " select * from source where dedupe_rank = 1",
    ")",
    `insert into companies (${columnList})`,
    `select ${columnList}`,
    "from deduped",
    "on conflict (cnpj_root) do update set",
    ` ${updateAssignments(columns, ["cnpj_root"])};`
  ].join("\n");
}
|
|
8474
|
+
/**
 * Builds the psql script fragment that copies `staging_establishments` into
 * `establishments` and rebuilds `establishment_secondary_cnaes`.
 *
 * In one statement it: computes `cnpj_full` from the three CNPJ parts, keeps
 * the row with the highest `staging_id` per `cnpj_full`, upserts it on
 * `cnpj_full`, deletes the secondary CNAE rows of the affected establishments
 * and inserts the codes split out of `secondary_cnaes_raw`.
 *
 * @returns {string} Script fragment (a `\echo` line plus one SQL statement).
 */
function materializeEstablishmentsSql() {
  const baseColumns = establishmentsLayout.fields.map(
    (field) => field.columnName
  );
  const insertColumns = [...baseColumns, "cnpj_full"];
  const insertColumnList = insertColumns.join(", ");
  const sourceColumns = baseColumns.map((column) => `source.${column}`).join(",\n ");
  // NOTE(review): the delete CTE and the final insert target the same table
  // within one statement. PostgreSQL documents the execution order of
  // data-modifying CTEs as unpredictable - verify that unchanged secondary
  // CNAE rows survive a re-import.
  return [
    "\\echo 'Materializing establishments and secondary CNAEs...'",
    "with source as (",
    " select",
    ` ${sourceColumns},`,
    " source.cnpj_root || source.cnpj_order || source.cnpj_check_digits as cnpj_full,",
    " row_number() over (partition by source.cnpj_root || source.cnpj_order || source.cnpj_check_digits order by source.staging_id desc) as dedupe_rank",
    " from staging_establishments source",
    "),",
    "deduped as (",
    " select * from source where dedupe_rank = 1",
    "),",
    "upserted as (",
    ` insert into establishments (${insertColumnList})`,
    ` select ${insertColumnList}`,
    " from deduped",
    " on conflict (cnpj_full) do update set",
    ` ${updateAssignments(insertColumns, ["cnpj_root", "cnpj_order", "cnpj_check_digits", "cnpj_full"])}`,
    " returning cnpj_full",
    "),",
    "deleted_secondary_cnaes as (",
    " delete from establishment_secondary_cnaes target",
    " using (select cnpj_full from deduped) source_keys",
    " where target.cnpj_full = source_keys.cnpj_full",
    " returning 1",
    "),",
    "secondary_cnaes_source as (",
    " select distinct",
    " deduped.cnpj_full,",
    " btrim(cnae_code) as cnae_code",
    " from deduped",
    " cross join lateral unnest(string_to_array(deduped.secondary_cnaes_raw, ',')) as cnae_code",
    " where deduped.secondary_cnaes_raw is not null",
    " and deduped.secondary_cnaes_raw <> ''",
    " and btrim(cnae_code) <> ''",
    ")",
    "insert into establishment_secondary_cnaes (cnpj_full, cnae_code)",
    "select cnpj_full, cnae_code",
    "from secondary_cnaes_source",
    "on conflict (cnpj_full, cnae_code) do nothing;"
  ].join("\n");
}
|
|
8521
|
+
/**
 * Builds the psql script fragment that copies `staging_partners` into
 * `partners`: each row gets a `partner_dedupe_key`, one row per key is kept
 * and upserted on `partner_dedupe_key`.
 *
 * @returns {string} Script fragment (a `\echo` line plus one SQL statement).
 */
function materializePartnersSql() {
  const baseColumns = partnersLayout.fields.map((field) => field.columnName);
  const insertColumns = [...baseColumns, "partner_dedupe_key"];
  const insertColumnList = insertColumns.join(", ");
  const sourceColumns = baseColumns.map((column) => `source.${column}`).join(",\n ");
  return [
    "\\echo 'Materializing partners...'",
    "with source as (",
    " select",
    ` ${sourceColumns},`,
    ` ${partnerDedupeExpression("source")} as partner_dedupe_key`,
    " from staging_partners source",
    "),",
    "ranked as (",
    " select",
    " source.*,",
    " row_number() over (partition by source.partner_dedupe_key order by source.cnpj_root asc) as dedupe_rank",
    " from source",
    "),",
    "deduped as (",
    " select * from ranked where dedupe_rank = 1",
    ")",
    `insert into partners (${insertColumnList})`,
    `select ${insertColumnList}`,
    "from deduped",
    "on conflict (partner_dedupe_key) do update set",
    ` ${updateAssignments(insertColumns, ["partner_dedupe_key"])};`
  ].join("\n");
}
|
|
8548
|
+
/**
 * Builds the psql script fragment that copies `staging_simples_options` into
 * `simples_options`: the row with the highest `staging_id` per `cnpj_root` is
 * kept and upserted on `cnpj_root`.
 *
 * @returns {string} Script fragment (a `\echo` line plus one SQL statement).
 */
function materializeSimplesSql() {
  const columns = simplesLayout.fields.map((field) => field.columnName);
  const columnList = columns.join(", ");
  const sourceColumns = columns.map((column) => `source.${column}`).join(",\n ");
  return [
    "\\echo 'Materializing simples options...'",
    "with source as (",
    " select",
    ` ${sourceColumns},`,
    " row_number() over (partition by source.cnpj_root order by source.staging_id desc) as dedupe_rank",
    " from staging_simples_options source",
    "),",
    "deduped as (",
    " select * from source where dedupe_rank = 1",
    ")",
    `insert into simples_options (${columnList})`,
    `select ${columnList}`,
    "from deduped",
    "on conflict (cnpj_root) do update set",
    ` ${updateAssignments(columns, ["cnpj_root"])};`
  ].join("\n");
}
|
|
8568
|
+
/**
 * Builds the script lines that load one lookup dataset: the CSV files are
 * copied into a temporary `tmp_hybrid_<dataset>` table (code, description)
 * and then upserted into the `<dataset>` table, one row per non-empty code.
 *
 * @param {string} dataset - Lookup dataset name; also used as the target table name.
 * @param {Array<{absolutePath: string}>} files - CSV files to load.
 * @returns {string[]} Script lines; empty when there are no files.
 */
function copyDomainSql(dataset, files) {
  if (files.length === 0) {
    return [];
  }
  const columns = datasetColumns(dataset);
  const columnList = columns.join(", ");
  const tempTable = `tmp_hybrid_${dataset}`;
  return [
    `\\echo 'Loading ${dataset} lookup data...'`,
    `drop table if exists ${tempTable};`,
    `create temporary table ${tempTable} (code text, description text);`,
    ...files.map((file) => csvCopyCommand(tempTable, columns, file.absolutePath)),
    `insert into ${dataset} (${columnList})`,
    `select distinct on (code) ${columnList}`,
    `from ${tempTable}`,
    "where code is not null and code <> ''",
    "order by code",
    "on conflict (code) do update set description = excluded.description;"
  ];
}
|
|
8592
|
+
/**
 * Builds the script lines that copy a dataset's CSV files into its staging
 * table.
 *
 * @param {string} dataset - Dataset name; must be a key of STAGING_TABLE_BY_DATASET3.
 * @param {Array<{absolutePath: string}>} files - CSV files to load.
 * @returns {string[]} Script lines; empty when there are no files or the
 *   dataset has no staging table.
 */
function copyStagingSql(dataset, files) {
  if (files.length === 0) {
    return [];
  }
  const tableName = STAGING_TABLE_BY_DATASET3[dataset];
  if (!tableName) {
    return [];
  }
  const columns = datasetColumns(dataset);
  const copyCommands = files.map(
    (file) => csvCopyCommand(tableName, columns, file.absolutePath)
  );
  return [`\\echo 'Loading ${dataset} staging data...'`, ...copyCommands];
}
|
|
8608
|
+
/**
 * Groups exported CSV file descriptors by their `dataset` property,
 * preserving the input order inside each group.
 *
 * @param {Array<{dataset: string}>} files - File descriptors to group.
 * @returns {Object<string, Array>} Map of dataset name to its files.
 */
function csvFilesByDataset(files) {
  const byDataset = {};
  for (const file of files) {
    const key = file.dataset;
    if (byDataset[key] == null) {
      byDataset[key] = [];
    }
    byDataset[key].push(file);
  }
  return byDataset;
}
|
|
8617
|
+
/**
 * Groups sanitized source file descriptors by their `dataset` property,
 * preserving the input order inside each group.
 *
 * @param {Array<{dataset: string}>} files - File descriptors to group.
 * @returns {Object<string, Array>} Map of dataset name to its files.
 */
function directFilesByDataset(files) {
  return files.reduce((byDataset, file) => {
    const bucket = byDataset[file.dataset] ?? [];
    bucket.push(file);
    byDataset[file.dataset] = bucket;
    return byDataset;
  }, {});
}
|
|
8626
|
+
/**
 * Returns the name of the temporary raw table used for a dataset.
 *
 * @param {string} dataset - Dataset name.
 * @returns {string} The dataset name prefixed with `tmp_hybrid_raw_`.
 */
function rawTableName(dataset) {
  const prefix = "tmp_hybrid_raw_";
  return `${prefix}${dataset}`;
}
|
|
8629
|
+
/**
 * Builds the SQL that (re)creates the temporary raw table for a dataset,
 * with one `text` column per field of the dataset layout.
 *
 * @param {string} dataset - Key into DATASET_LAYOUTS.
 * @returns {string} A `drop table if exists` statement followed by the
 *   `create temporary table` statement, joined with newlines.
 */
function createRawTempTableSql(dataset) {
  const columnDefinitions = [];
  for (const field of DATASET_LAYOUTS[dataset].fields) {
    columnDefinitions.push(` ${quoteIdentifier(field.columnName)} text`);
  }
  const statements = [];
  statements.push(`drop table if exists ${rawTableName(dataset)};`);
  statements.push(`create temporary table ${rawTableName(dataset)} (`);
  statements.push(columnDefinitions.join(",\n"));
  statements.push(");");
  return statements.join("\n");
}
|
|
8638
|
+
/**
 * Builds a SQL expression that trims a raw text column and turns an
 * empty result into NULL.
 *
 * @param {string} alias - Table alias of the raw source.
 * @param {string} column - Column name (quoted via quoteIdentifier).
 * @returns {string} SQL expression text.
 */
function textExpression(alias, column) {
  const reference = `${alias}.${quoteIdentifier(column)}`;
  return `nullif(btrim(${reference}), '')`;
}
|
|
8641
|
+
/**
 * Builds a SQL CASE expression that converts a raw `YYYYMMDD` text column
 * into a date. Empty values, the `00000000` placeholder and anything that
 * is not exactly eight digits become NULL.
 *
 * @param {string} alias - Table alias of the raw source.
 * @param {string} column - Column name (quoted via quoteIdentifier).
 * @returns {string} SQL expression text.
 */
function dateExpression(alias, column) {
  const trimmed = `btrim(${alias}.${quoteIdentifier(column)})`;
  const branches = [
    ` when ${trimmed} = '' or ${trimmed} = '00000000' then null`,
    ` when ${trimmed} ~ '^\\d{8}$' then to_date(${trimmed}, 'YYYYMMDD')`,
    " else null"
  ];
  return `case ${branches.join(" ")} end`;
}
|
|
8651
|
+
/**
 * Builds a SQL CASE expression that converts a raw text column to numeric.
 * Empty values become NULL. A value containing both `,` and `.` has its
 * dots removed and its comma turned into a decimal point. A value with
 * only `,` has the comma turned into a decimal point. Anything else is
 * cast as-is.
 *
 * @param {string} alias - Table alias of the raw source.
 * @param {string} column - Column name (quoted via quoteIdentifier).
 * @returns {string} SQL expression text.
 */
function numericExpression(alias, column) {
  const trimmed = `btrim(${alias}.${quoteIdentifier(column)})`;
  const branches = [
    ` when ${trimmed} = '' then null`,
    ` when ${trimmed} like '%,%' and ${trimmed} like '%.%' then replace(replace(${trimmed}, '.', ''), ',', '.')::numeric`,
    ` when ${trimmed} like '%,%' then replace(${trimmed}, ',', '.')::numeric`,
    ` else ${trimmed}::numeric`
  ];
  return `case ${branches.join(" ")} end`;
}
|
|
8662
|
+
/**
 * Builds a SQL CASE expression that converts a raw text column to integer.
 * Empty values and values that are not an optionally signed run of digits
 * become NULL.
 *
 * @param {string} alias - Table alias of the raw source.
 * @param {string} column - Column name (quoted via quoteIdentifier).
 * @returns {string} SQL expression text.
 */
function integerExpression(alias, column) {
  const trimmed = `btrim(${alias}.${quoteIdentifier(column)})`;
  const branches = [
    ` when ${trimmed} = '' then null`,
    ` when ${trimmed} ~ '^-?\\d+$' then ${trimmed}::integer`,
    " else null"
  ];
  return `case ${branches.join(" ")} end`;
}
|
|
8672
|
+
/**
 * Builds a SQL CASE expression that maps a raw text column to boolean.
 * The value is trimmed and lower-cased first; recognized truthy tokens
 * yield true, recognized falsy tokens yield false, anything else NULL.
 *
 * @param {string} alias - Table alias of the raw source.
 * @param {string} column - Column name (quoted via quoteIdentifier).
 * @returns {string} SQL expression text.
 */
function booleanExpression(alias, column) {
  const normalized = `lower(btrim(${alias}.${quoteIdentifier(column)}))`;
  const branches = [
    ` when ${normalized} in ('1', 'true', 't', 'y', 'yes', 's') then true`,
    ` when ${normalized} in ('0', 'false', 'f', 'n', 'no') then false`,
    " else null"
  ];
  return `case ${branches.join(" ")} end`;
}
|
|
8682
|
+
/**
 * Chooses the SQL conversion expression for one layout field.
 *
 * Three code columns get a text expression wrapped in `coalesce` with a
 * fixed fallback literal; every other field is dispatched on its
 * `dataType`, defaulting to the text expression.
 *
 * @param {string} dataset - Dataset the field belongs to.
 * @param {{columnName: string, dataType: string}} field - Layout field.
 * @param {string} alias - Table alias of the raw source.
 * @returns {string} SQL expression text.
 */
function fieldExpression(dataset, field, alias) {
  const column = field.columnName;
  // [dataset, column, SQL literal used when the raw value is empty]
  const codeFallbacks = [
    ["companies", "company_size_code", "'00'"],
    ["establishments", "branch_type_code", "'1'"],
    ["establishments", "registration_status_code", "'01'"]
  ];
  const fallbackRule = codeFallbacks.find(
    ([ruleDataset, ruleColumn]) => ruleDataset === dataset && ruleColumn === column
  );
  if (fallbackRule) {
    return `coalesce(${textExpression(alias, column)}, ${fallbackRule[2]})`;
  }
  const typedBuilders = /* @__PURE__ */ new Map([
    ["date", dateExpression],
    ["numeric", numericExpression],
    ["integer", integerExpression],
    ["boolean", booleanExpression]
  ]);
  const buildExpression = typedBuilders.get(field.dataType) ?? textExpression;
  return buildExpression(alias, column);
}
|
|
8706
|
+
/**
 * Builds the psql statements that load one lookup ("domain") dataset
 * straight from sanitized source files: create the temporary raw table,
 * copy every file into it, then upsert trimmed, de-duplicated
 * code/description pairs into the final table.
 *
 * @param {string} dataset - Dataset name; also used as the target table name.
 * @param {Array<{absolutePath: string}>} files - Sanitized source files.
 * @returns {string[]} Script lines, or an empty array when there are no files.
 */
function rawDomainSql(dataset, files) {
  if (files.length === 0) {
    return [];
  }
  const columns = DATASET_LAYOUTS[dataset].fields.map((field) => field.columnName);
  const sourceTable = rawTableName(dataset);
  const script = [];
  script.push(`\\echo 'Loading ${dataset} lookup data directly from sanitized Receita files...'`);
  script.push(createRawTempTableSql(dataset));
  for (const file of files) {
    script.push(receitaCopyCommand(sourceTable, columns, file.absolutePath));
  }
  const upsert = [
    `insert into ${dataset} (${columns.join(", ")})`,
    "select distinct on (code)",
    " nullif(btrim(code), '') as code,",
    " nullif(btrim(description), '') as description",
    `from ${sourceTable}`,
    "where nullif(btrim(code), '') is not null",
    "order by code",
    "on conflict (code) do update set description = excluded.description;"
  ];
  return script.concat(upsert);
}
|
|
8732
|
+
/**
 * Builds the psql statements that load one staging dataset straight from
 * sanitized source files: create the temporary raw table, copy every file
 * into it, then insert converted values into the staging table.
 *
 * @param {string} dataset - Dataset name used to look up layout and staging table.
 * @param {Array<{absolutePath: string}>} files - Sanitized source files.
 * @returns {string[]} Script lines; empty when there are no files or the
 *   dataset has no staging table mapping.
 */
function rawStagingSql(dataset, files) {
  // Only consult the staging-table mapping when there is something to load.
  const stagingTable = files.length === 0 ? void 0 : STAGING_TABLE_BY_DATASET3[dataset];
  if (!stagingTable) {
    return [];
  }
  const { fields } = DATASET_LAYOUTS[dataset];
  const columns = fields.map((field) => field.columnName);
  const sourceTable = rawTableName(dataset);
  const sourceAlias = "source";
  const projections = [];
  for (const field of fields) {
    projections.push(` ${fieldExpression(dataset, field, sourceAlias)} as ${field.columnName}`);
  }
  const copyCommands = files.map(
    (file) => receitaCopyCommand(sourceTable, columns, file.absolutePath)
  );
  return [
    `\\echo 'Loading ${dataset} staging data directly from sanitized Receita files...'`,
    createRawTempTableSql(dataset),
    ...copyCommands,
    `insert into ${stagingTable} (${columns.join(", ")})`,
    "select",
    projections.join(",\n"),
    `from ${sourceTable} ${sourceAlias};`
  ];
}
|
|
8762
|
+
/**
 * Renders the psql script for the hybrid import path, which loads
 * previously exported CSV files.
 *
 * The script opens a transaction, truncates the four staging tables,
 * loads lookup datasets, loads staging datasets, and then appends the
 * shared materialization/analyze/commit tail.
 *
 * @param {{files: Array<{dataset: string, absolutePath: string}>}} input
 * @returns {string} The full script, lines joined with "\n".
 */
function generatePostgresDirectImportScript(input) {
  const filesByDataset = csvFilesByDataset(input.files);
  const stagingTables = [
    "staging_companies",
    "staging_establishments",
    "staging_partners",
    "staging_simples_options"
  ];
  const script = [
    "-- CNPJ DB Loader hybrid PostgreSQL import script",
    "-- Generated from PostgreSQL-ready CSV files exported by cnpj-db-loader postgres export-csv.",
    "-- Execute with psql, for example:",
    '-- psql "postgres://postgres:postgres@localhost:5432/cnpj" -f import-postgres-direct.sql',
    "",
    "\\set ON_ERROR_STOP on",
    "\\echo 'Starting CNPJ DB Loader hybrid PostgreSQL import...'",
    "",
    "begin;",
    "",
    "-- Keep the final schema and seed data managed by sql/schema.sql.",
    "-- This script only resets staging tables and then upserts final data.",
    ...stagingTables.map((table) => `truncate table ${table} restart identity;`),
    ""
  ];
  // Every dataset section is followed by a blank separator line, even when empty.
  const appendSection = (statements) => {
    script.push(...statements, "");
  };
  for (const dataset of DOMAIN_DATASETS) {
    appendSection(copyDomainSql(dataset, filesByDataset[dataset] ?? []));
  }
  for (const dataset of STAGING_DATASETS) {
    appendSection(copyStagingSql(dataset, filesByDataset[dataset] ?? []));
  }
  script.push(...materializationAndAnalyzeSql());
  return script.join("\n");
}
|
|
8792
|
+
/**
 * Renders the psql script for the direct import path, which loads
 * sanitized source files without an intermediate CSV export.
 *
 * The script sets the client encoding, opens a transaction, truncates the
 * four staging tables, loads lookup datasets and staging datasets through
 * temporary raw tables, and then appends the shared
 * materialization/analyze/commit tail.
 *
 * @param {{files: Array<{dataset: string, absolutePath: string}>, sourceEncoding: string}} input
 * @returns {string} The full script, lines joined with "\n".
 */
function generatePostgresSanitizedDirectImportScript(input) {
  const filesByDataset = directFilesByDataset(input.files);
  const stagingTables = [
    "staging_companies",
    "staging_establishments",
    "staging_partners",
    "staging_simples_options"
  ];
  const script = [
    "-- CNPJ DB Loader direct PostgreSQL import script",
    "-- Generated from sanitized Receita files by cnpj-db-loader postgres generate-script.",
    "-- This path avoids rewriting the dataset into a second CSV tree.",
    "-- Execute with psql, for example:",
    '-- psql "postgres://postgres:postgres@localhost:5432/cnpj" -f import-postgres-direct.sql',
    "",
    "\\set ON_ERROR_STOP on",
    `\\echo 'Using source file encoding ${input.sourceEncoding} for psql copy operations...'`,
    `set client_encoding to ${quoteSqlLiteral(input.sourceEncoding)};`,
    "\\echo 'Starting CNPJ DB Loader direct PostgreSQL import from sanitized files...'",
    "",
    "begin;",
    "",
    "-- Keep the final schema and seed data managed by sql/schema.sql.",
    "-- This script copies sanitized Receita files into temporary raw tables,",
    "-- transforms values inside PostgreSQL, resets staging tables and upserts final data.",
    ...stagingTables.map((table) => `truncate table ${table} restart identity;`),
    ""
  ];
  // Every dataset section is followed by a blank separator line, even when empty.
  const appendSection = (statements) => {
    script.push(...statements, "");
  };
  for (const dataset of DOMAIN_DATASETS) {
    appendSection(rawDomainSql(dataset, filesByDataset[dataset] ?? []));
  }
  for (const dataset of STAGING_DATASETS) {
    appendSection(rawStagingSql(dataset, filesByDataset[dataset] ?? []));
  }
  script.push(...materializationAndAnalyzeSql());
  return script.join("\n");
}
|
|
8826
|
+
/**
 * Returns the shared tail of both generated import scripts: the four
 * materialization statements (each followed by a blank line), `analyze`
 * statements for the final and lookup tables, the `commit`, and a
 * completion message.
 *
 * @returns {string[]} Script lines.
 */
function materializationAndAnalyzeSql() {
  const materializers = [
    materializeCompaniesSql,
    materializeEstablishmentsSql,
    materializePartnersSql,
    materializeSimplesSql
  ];
  const analyzedTables = [
    "companies",
    "establishments",
    "establishment_secondary_cnaes",
    "partners",
    "simples_options",
    "cnaes",
    "cities",
    "countries",
    "legal_natures",
    "partner_qualifications",
    "reasons"
  ];
  const statements = [];
  for (const materialize of materializers) {
    statements.push(materialize(), "");
  }
  statements.push("\\echo 'Refreshing planner statistics...'");
  for (const table of analyzedTables) {
    statements.push(`analyze ${table};`);
  }
  statements.push(
    "",
    "commit;",
    "",
    "\\echo 'CNPJ DB Loader hybrid PostgreSQL import completed.'",
    ""
  );
  return statements;
}
|
|
8855
|
+
|
|
8856
|
+
// src/services/postgres-direct/exporter.ts
|
|
8857
|
+
// Schema capability flags handed to transformRecord by writeCsvFile when
// producing PostgreSQL-ready CSV rows.
// NOTE(review): the flag names suggest the export always assumes the full
// current schema (cnpj_full column, secondary CNAEs table, partner dedupe
// key) and skips lookup reconciliation — confirm against transformRecord.
var POSTGRES_DIRECT_SCHEMA_CAPABILITIES = {
  includeEstablishmentCnpjFullInInsert: true,
  includeEstablishmentSecondaryCnaesTable: true,
  includePartnerDedupeKeyInInsert: true,
  requiresLookupReconciliation: false
};
|
|
8863
|
+
/**
 * Derives the default CSV export directory: a sibling of the input
 * directory named `<input-name>-postgres-csv`.
 *
 * @param {string} inputPath - Input directory path.
 * @returns {string} Default output directory path.
 */
function defaultPostgresCsvOutputPath(inputPath) {
  const parentDirectory = path16.dirname(inputPath);
  const outputName = `${path16.basename(inputPath)}-postgres-csv`;
  return path16.join(parentDirectory, outputName);
}
|
|
8867
|
+
/**
 * Maps a source file's relative path to its CSV output path: same
 * directory, last extension replaced by `.csv`. Falls back to the full
 * base name, then to `dataset`, when the stem is empty.
 *
 * @param {string} relativePath - Source path relative to the input root.
 * @returns {string} Relative output path ending in `.csv`.
 */
function normalizeOutputFileName(relativePath) {
  const { dir, name, base } = path16.parse(relativePath);
  let stem = name;
  if (!stem) {
    stem = base;
  }
  if (!stem) {
    stem = "dataset";
  }
  return path16.join(dir, `${stem}.csv`);
}
|
|
8872
|
+
/**
 * Resolves where the CSV for one source file is written:
 * `<outputPath>/<dataset>/<normalized relative path>`.
 *
 * @param {string} outputPath - Export root directory.
 * @param {string} dataset - Dataset name, used as a sub-directory.
 * @param {string} relativePath - Source path relative to the input root.
 * @returns {string} Output file path.
 */
function resolveDatasetOutputPath(outputPath, dataset, relativePath) {
  const csvRelativePath = normalizeOutputFileName(relativePath);
  const segments = [outputPath, dataset, csvRelativePath];
  return path16.join(...segments);
}
|
|
8875
|
+
/**
 * Builds the suggested psql command for running the generated script.
 *
 * Backslashes in the path are converted to forward slashes. A path that
 * contains whitespace or a double quote is wrapped in double quotes (with
 * embedded quotes escaped) so the suggested command still works when
 * pasted; other paths are emitted unchanged.
 *
 * @param {string} scriptPath - Path of the generated SQL script.
 * @returns {string} Command line hint.
 */
function inferNextStep4(scriptPath) {
  const normalizedPath = scriptPath.replace(/\\/g, "/");
  const needsQuoting = /[\s"]/.test(normalizedPath);
  const scriptArgument = needsQuoting ? `"${normalizedPath.replace(/"/g, '\\"')}"` : normalizedPath;
  return `psql "postgres://postgres:postgres@localhost:5432/cnpj" -f ${scriptArgument}`;
}
|
|
8878
|
+
/**
 * Converts one source file into a PostgreSQL-ready CSV file.
 *
 * Writes a header row with the layout's column names, then one row per
 * non-blank source line after parsing, field-count normalization and
 * record transformation.
 *
 * Stream handling:
 * - `finish`/`error` listeners are attached as soon as the stream exists,
 *   so a write error is never an unhandled `error` event and never leaves
 *   the final wait pending.
 * - `write()` backpressure is honored by waiting for `drain`, keeping
 *   memory bounded on very large inputs.
 *
 * @param {{dataset: string, inputFile: string, outputFile: string}} input
 * @returns {Promise<number>} Number of data rows written (header excluded).
 * @throws Rejects with the stream error when writing fails, or with any
 *   error raised while reading or transforming the source.
 */
async function writeCsvFile(input) {
  const layout = DATASET_LAYOUTS[input.dataset];
  const columns = layout.fields.map((field) => field.columnName);
  await mkdir8(path16.dirname(input.outputFile), { recursive: true });
  const output = createWriteStream3(input.outputFile, { encoding: "utf8" });
  let streamError = null;
  const completed = new Promise((resolve, reject) => {
    output.once("finish", () => resolve());
    output.once("error", (error) => {
      streamError = error;
      reject(error);
    });
  });
  // The promise is awaited in `finally`; this keeps an early rejection from
  // being reported as unhandled while rows are still being produced.
  completed.catch(() => {
  });
  // Resolves on the next `drain`, or on `error` so the writer never stalls.
  const waitForDrain = () => new Promise((resolve) => {
    const settle = () => {
      output.off("drain", settle);
      output.off("error", settle);
      resolve();
    };
    output.on("drain", settle);
    output.on("error", settle);
  });
  const writeLine = async (line) => {
    if (streamError !== null) {
      throw streamError;
    }
    if (!output.write(line)) {
      await waitForDrain();
      if (streamError !== null) {
        throw streamError;
      }
    }
  };
  let rows = 0;
  try {
    await writeLine(`${formatCsvRow(columns)}\n`);
    for await (const sourceLine of readImportSourceLines(input.inputFile)) {
      if (sourceLine.rawLine.trim() === "") {
        continue;
      }
      const parsed = parseImportSourceLine(sourceLine);
      const normalizedFields = normalizeFieldCount(
        parsed.fields,
        layout.fields.length,
        input.inputFile,
        parsed.lineNumber
      );
      const values = transformRecord(
        input.dataset,
        layout,
        normalizedFields,
        POSTGRES_DIRECT_SCHEMA_CAPABILITIES,
        "staging"
      );
      await writeLine(`${formatCsvRow(values)}\n`);
      rows += 1;
    }
  } finally {
    // A failed stream never emits `finish`; only end a healthy stream.
    if (streamError === null) {
      output.end();
    }
    await completed;
  }
  return rows;
}
|
|
8918
|
+
/**
 * Exports a validated input directory into PostgreSQL-ready CSV files and
 * writes the matching psql import script plus a `manifest.json`.
 *
 * @param {string} inputPath - Directory containing the dataset files.
 * @param {object} [options]
 * @param {string} [options.dataset] - Restrict the export to one dataset type.
 * @param {string} [options.outputPath] - Output directory; defaults to
 *   `defaultPostgresCsvOutputPath(validatedPath)`.
 * @param {string} [options.scriptName] - Script file name; defaults to
 *   `import-postgres-direct.sql`.
 * @param {Function} [options.onProgress] - Receives `start`, `file_start`,
 *   `file_finish` and `finish` events.
 * @returns {Promise<object>} Summary with resolved paths, totals, per-dataset
 *   summaries, warnings and a suggested next command.
 * @throws {ValidationError} When the dataset option is unsupported, the
 *   input directory fails validation, or no recognized files are found.
 */
async function exportPostgresCsvDataset(inputPath, options = {}) {
  if (options.dataset && !isImportDatasetType(options.dataset)) {
    throw new ValidationError(`Unsupported dataset type: ${options.dataset}.`);
  }
  const validation = await validateInputDirectory(inputPath);
  if (!validation.ok) {
    throw new ValidationError(
      `The input directory is not ready for PostgreSQL CSV export. ${validation.errors.join(" ")}`
    );
  }
  const validatedPath = validation.validatedPath;
  const outputPath = path16.resolve(
    options.outputPath ?? defaultPostgresCsvOutputPath(validatedPath)
  );
  const inspected = await inspectFiles(validatedPath);
  // Keep only plain files whose inferred type is an importable dataset
  // (and matches the optional dataset filter).
  const recognizedFiles = inspected.entries.filter((entry) => entry.entryKind === "file").flatMap((entry) => {
    if (!isImportDatasetType(entry.inferredType)) {
      return [];
    }
    if (options.dataset && entry.inferredType !== options.dataset) {
      return [];
    }
    return [{ ...entry, inferredType: entry.inferredType }];
  }).sort(sortEntries);
  if (recognizedFiles.length === 0) {
    throw new ValidationError(
      "No recognized dataset files were found for PostgreSQL CSV export."
    );
  }
  // Distinct dataset types, ordered by their position in IMPORT_ORDER.
  const datasets = [
    ...new Set(recognizedFiles.map((entry) => entry.inferredType))
  ].sort(
    (left, right) => IMPORT_ORDER.indexOf(left) - IMPORT_ORDER.indexOf(right)
  );
  options.onProgress?.({
    kind: "start",
    inputPath: path16.resolve(inputPath),
    validatedPath,
    outputPath,
    totalFiles: recognizedFiles.length,
    datasets
  });
  const exportedFiles = [];
  const summariesByDataset = /* @__PURE__ */ new Map();
  // Files are converted one at a time so progress events stay ordered.
  for (const [index, entry] of recognizedFiles.entries()) {
    const dataset = entry.inferredType;
    const inputFile = path16.join(validatedPath, entry.relativePath);
    const outputFile = resolveDatasetOutputPath(
      outputPath,
      dataset,
      entry.relativePath
    );
    options.onProgress?.({
      kind: "file_start",
      dataset,
      fileIndex: index + 1,
      totalFiles: recognizedFiles.length,
      inputFile: buildDisplayPath(inputFile),
      outputFile
    });
    const rowCount = await writeCsvFile({ dataset, inputFile, outputFile });
    exportedFiles.push({
      dataset,
      absolutePath: outputFile,
      relativePath: path16.relative(outputPath, outputFile),
      rowCount
    });
    // Accumulate per-dataset totals for the manifest and the return value.
    const currentSummary = summariesByDataset.get(dataset) ?? {
      dataset,
      files: 0,
      rows: 0,
      outputFiles: []
    };
    currentSummary.files += 1;
    currentSummary.rows += rowCount;
    currentSummary.outputFiles.push(outputFile);
    summariesByDataset.set(dataset, currentSummary);
    options.onProgress?.({
      kind: "file_finish",
      dataset,
      fileIndex: index + 1,
      totalFiles: recognizedFiles.length,
      inputFile: buildDisplayPath(inputFile),
      outputFile,
      rows: rowCount
    });
  }
  // Write the import script that references the exported CSV files.
  const scriptName = options.scriptName ?? "import-postgres-direct.sql";
  const scriptPath = path16.join(outputPath, scriptName);
  const script = generatePostgresDirectImportScript({ files: exportedFiles });
  await writeFile5(scriptPath, script, "utf8");
  const manifestPath = path16.join(outputPath, "manifest.json");
  const summaryDatasets = [...summariesByDataset.values()].sort(
    (left, right) => IMPORT_ORDER.indexOf(left.dataset) - IMPORT_ORDER.indexOf(right.dataset)
  );
  const totalRows = summaryDatasets.reduce((sum, item) => sum + item.rows, 0);
  const manifest = {
    generatedAt: (/* @__PURE__ */ new Date()).toISOString(),
    inputPath: path16.resolve(inputPath),
    validatedPath,
    outputPath,
    scriptPath,
    totalFiles: exportedFiles.length,
    totalRows,
    datasets: summaryDatasets
  };
  // The manifest is pretty-printed JSON terminated by a newline.
  await writeFile5(
    manifestPath,
    `${JSON.stringify(manifest, null, 2)}
`,
    "utf8"
  );
  options.onProgress?.({
    kind: "finish",
    outputPath,
    scriptPath,
    totalFiles: exportedFiles.length,
    totalRows
  });
  return {
    inputPath: path16.resolve(inputPath),
    validatedPath,
    outputPath,
    scriptPath,
    manifestPath,
    totalFiles: exportedFiles.length,
    totalRows,
    datasets: summaryDatasets,
    warnings: [
      "PostgreSQL-ready CSV export is intended for hybrid bulk imports after extraction, validation and sanitization.",
      "The generated SQL script resets staging tables and then upserts final tables. Review it before running against production databases."
    ],
    nextStep: inferNextStep4(scriptPath)
  };
}
|
|
9053
|
+
|
|
9054
|
+
// src/services/postgres-direct/generator.ts
|
|
9055
|
+
import { mkdir as mkdir9, stat as stat7, writeFile as writeFile6 } from "fs/promises";
|
|
9056
|
+
import path17 from "path";
|
|
9057
|
+
// Encoding name used by normalizeSourceEncoding when the caller supplies none.
var DEFAULT_SOURCE_ENCODING = "UTF8";
|
|
9058
|
+
/**
 * Derives the default output directory for direct script generation.
 * An input directory named `sanitized` (any letter case) maps to a sibling
 * `postgres-direct`; any other name maps to `<input-name>-postgres-direct`.
 *
 * @param {string} inputPath - Input directory path.
 * @returns {string} Default output directory path.
 */
function defaultPostgresDirectOutputPath(inputPath) {
  const parentDirectory = path17.dirname(inputPath);
  const folderName = path17.basename(inputPath);
  const outputName = folderName.toLowerCase() === "sanitized" ? "postgres-direct" : `${folderName}-postgres-direct`;
  return path17.join(parentDirectory, outputName);
}
|
|
9065
|
+
/**
 * Builds the suggested psql command for running the generated script.
 *
 * Backslashes in the path are converted to forward slashes. A path that
 * contains whitespace or a double quote is wrapped in double quotes (with
 * embedded quotes escaped) so the suggested command still works when
 * pasted; other paths are emitted unchanged.
 *
 * @param {string} scriptPath - Path of the generated SQL script.
 * @returns {string} Command line hint.
 */
function inferNextStep5(scriptPath) {
  const normalizedPath = scriptPath.replace(/\\/g, "/");
  const needsQuoting = /[\s"]/.test(normalizedPath);
  const scriptArgument = needsQuoting ? `"${normalizedPath.replace(/"/g, '\\"')}"` : normalizedPath;
  return `psql "postgres://postgres:postgres@localhost:5432/cnpj" -f ${scriptArgument}`;
}
|
|
9068
|
+
/**
 * Validates and normalizes a source encoding name.
 *
 * A missing value falls back to DEFAULT_SOURCE_ENCODING. The trimmed name
 * may contain only letters, digits, `_` and `-`, and is returned in upper
 * case.
 *
 * @param {string} [value] - Requested encoding name.
 * @returns {string} Upper-cased encoding name.
 * @throws {ValidationError} When the name contains any other character or is empty.
 */
function normalizeSourceEncoding(value) {
  const candidate = (value ?? DEFAULT_SOURCE_ENCODING).trim();
  const isValidName = /^[A-Za-z0-9_-]+$/.test(candidate);
  if (isValidName) {
    return candidate.toUpperCase();
  }
  throw new ValidationError(
    `Invalid source encoding: ${value}. Use a PostgreSQL client encoding name such as UTF8, WIN1252 or LATIN1.`
  );
}
|
|
9077
|
+
/**
 * Generates a psql script that imports sanitized source files directly,
 * and writes it together with a `manifest.json` into the output directory.
 *
 * Unlike the CSV export, a failed directory validation is tolerated when
 * a single dataset is requested; the validation errors are then returned
 * as warnings.
 *
 * @param {string} inputPath - Directory containing the sanitized files.
 * @param {object} [options]
 * @param {string} [options.dataset] - Restrict the script to one dataset type.
 * @param {string} [options.outputPath] - Output directory; defaults to
 *   `defaultPostgresDirectOutputPath(validatedPath)`.
 * @param {string} [options.sourceEncoding] - Encoding name, validated by
 *   normalizeSourceEncoding.
 * @param {string} [options.scriptName] - Script file name; defaults to
 *   `import-postgres-direct.sql`.
 * @param {Function} [options.onProgress] - Receives `start`,
 *   `file_registered` and `finish` events.
 * @returns {Promise<object>} Summary with resolved paths, encoding, totals,
 *   per-dataset summaries, warnings and a suggested next command.
 * @throws {ValidationError} When the dataset option is unsupported, the
 *   directory fails validation without a dataset filter, the encoding is
 *   invalid, or no recognized files are found.
 */
async function generatePostgresDirectScript(inputPath, options = {}) {
  if (options.dataset && !isImportDatasetType(options.dataset)) {
    throw new ValidationError(`Unsupported dataset type: ${options.dataset}.`);
  }
  const validation = await validateInputDirectory(inputPath);
  // Validation failures are fatal only when no dataset filter was given.
  if (!validation.ok && !options.dataset) {
    throw new ValidationError(
      `The input directory is not ready for PostgreSQL direct script generation. ${validation.errors.join(" ")}`
    );
  }
  const validatedPath = validation.ok ? validation.validatedPath : path17.resolve(inputPath);
  const outputPath = path17.resolve(
    options.outputPath ?? defaultPostgresDirectOutputPath(validatedPath)
  );
  const sourceEncoding = normalizeSourceEncoding(options.sourceEncoding);
  const inspected = await inspectFiles(validatedPath);
  // Keep only plain files whose inferred type is an importable dataset
  // (and matches the optional dataset filter).
  const recognizedFiles = inspected.entries.filter((entry) => entry.entryKind === "file").flatMap((entry) => {
    if (!isImportDatasetType(entry.inferredType)) {
      return [];
    }
    if (options.dataset && entry.inferredType !== options.dataset) {
      return [];
    }
    return [{ ...entry, inferredType: entry.inferredType }];
  }).sort(sortEntries);
  if (recognizedFiles.length === 0) {
    throw new ValidationError(
      "No recognized dataset files were found for PostgreSQL direct script generation."
    );
  }
  // Distinct dataset types, ordered by their position in IMPORT_ORDER.
  const datasets = [
    ...new Set(recognizedFiles.map((entry) => entry.inferredType))
  ].sort(
    (left, right) => IMPORT_ORDER.indexOf(left) - IMPORT_ORDER.indexOf(right)
  );
  options.onProgress?.({
    kind: "start",
    inputPath: path17.resolve(inputPath),
    validatedPath,
    outputPath,
    totalFiles: recognizedFiles.length,
    datasets,
    sourceEncoding
  });
  await mkdir9(outputPath, { recursive: true });
  const sourceFiles = [];
  const summariesByDataset = /* @__PURE__ */ new Map();
  // Register each source file with its size; no file content is read here.
  for (const [index, entry] of recognizedFiles.entries()) {
    const dataset = entry.inferredType;
    const absolutePath = path17.join(validatedPath, entry.relativePath);
    const fileStats = await stat7(absolutePath);
    sourceFiles.push({
      dataset,
      absolutePath,
      relativePath: entry.relativePath,
      fileSize: fileStats.size
    });
    // Accumulate per-dataset totals for the manifest and the return value.
    const currentSummary = summariesByDataset.get(dataset) ?? {
      dataset,
      files: 0,
      totalBytes: 0,
      sourceFiles: []
    };
    currentSummary.files += 1;
    currentSummary.totalBytes += fileStats.size;
    currentSummary.sourceFiles.push(absolutePath);
    summariesByDataset.set(dataset, currentSummary);
    options.onProgress?.({
      kind: "file_registered",
      dataset,
      fileIndex: index + 1,
      totalFiles: recognizedFiles.length,
      inputFile: buildDisplayPath(absolutePath),
      fileSize: fileStats.size
    });
  }
  // Write the import script that references the sanitized files in place.
  const scriptName = options.scriptName ?? "import-postgres-direct.sql";
  const scriptPath = path17.join(outputPath, scriptName);
  const script = generatePostgresSanitizedDirectImportScript({
    files: sourceFiles,
    sourceEncoding
  });
  await writeFile6(scriptPath, script, "utf8");
  const manifestPath = path17.join(outputPath, "manifest.json");
  const summaryDatasets = [...summariesByDataset.values()].sort(
    (left, right) => IMPORT_ORDER.indexOf(left.dataset) - IMPORT_ORDER.indexOf(right.dataset)
  );
  const totalBytes = summaryDatasets.reduce(
    (sum, item) => sum + item.totalBytes,
    0
  );
  const manifest = {
    generatedAt: (/* @__PURE__ */ new Date()).toISOString(),
    mode: "direct-sanitized-script",
    inputPath: path17.resolve(inputPath),
    validatedPath,
    outputPath,
    scriptPath,
    sourceEncoding,
    totalFiles: sourceFiles.length,
    totalBytes,
    datasets: summaryDatasets
  };
  // The manifest is pretty-printed JSON terminated by a newline.
  await writeFile6(
    manifestPath,
    `${JSON.stringify(manifest, null, 2)}
`,
    "utf8"
  );
  options.onProgress?.({
    kind: "finish",
    outputPath,
    scriptPath,
    totalFiles: sourceFiles.length,
    totalBytes
  });
  return {
    inputPath: path17.resolve(inputPath),
    validatedPath,
    outputPath,
    scriptPath,
    manifestPath,
    sourceEncoding,
    totalFiles: sourceFiles.length,
    totalBytes,
    datasets: summaryDatasets,
    warnings: [
      // Surface tolerated validation errors (dataset-filtered runs only).
      ...validation.ok ? [] : validation.errors,
      "This script imports sanitized Receita files directly with psql \\copy. It avoids rewriting the full dataset into a second CSV tree.",
      "The generated script expects the database schema generated by cnpj-db-loader to be applied before execution.",
      "The direct PostgreSQL script now defaults to UTF8 because the sanitize command writes clean UTF-8 files.",
      "Use --source-encoding WIN1252 or LATIN1 only when generating scripts for legacy sanitized files produced by older loader versions."
    ],
    nextStep: inferNextStep5(scriptPath)
  };
}
|
|
8099
9213
|
export {
|
|
8100
9214
|
AppError,
|
|
8101
9215
|
DEFAULT_FEDERAL_REVENUE_DOWNLOAD_ROOT,
|
|
@@ -8125,8 +9239,10 @@ export {
|
|
|
8125
9239
|
ensureDirectory,
|
|
8126
9240
|
evaluateFederalRevenueManifestFile,
|
|
8127
9241
|
evaluateFederalRevenueManifestFiles,
|
|
9242
|
+
exportPostgresCsvDataset,
|
|
8128
9243
|
extractArchives,
|
|
8129
9244
|
finalizeFederalRevenueManifest,
|
|
9245
|
+
generatePostgresDirectScript,
|
|
8130
9246
|
generateSchemaSql,
|
|
8131
9247
|
getAllLayouts,
|
|
8132
9248
|
getCurrentFederalRevenueReference,
|