@danielarndt0/cnpj-db-loader 2.4.0-beta.2 → 2.4.0-beta.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -6
- package/dist/cli.js +1037 -296
- package/dist/cli.js.map +1 -1
- package/dist/index.d.ts +337 -297
- package/dist/index.js +879 -290
- package/dist/index.js.map +1 -1
- package/docs/commands.md +11 -1
- package/docs/federal-revenue.md +36 -2
- package/docs/postgres-direct.md +235 -41
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -87,6 +87,222 @@ function getConfigFilePath() {
|
|
|
87
87
|
return path2.join(os2.homedir(), ".config", "cnpj-db-loader", "config.json");
|
|
88
88
|
}
|
|
89
89
|
|
|
90
|
+
// src/services/federal-revenue/client.ts
|
|
91
|
+
// Default public WebDAV endpoint used when no `webdav-url` is configured.
var DEFAULT_FEDERAL_REVENUE_WEBDAV_URL = "https://arquivos.receitafederal.gov.br/public.php/webdav";
// User-Agent header sent when no `user-agent` is configured.
var DEFAULT_FEDERAL_REVENUE_USER_AGENT = "cnpj-db-loader federal-revenue-client";
// Monthly reference in YYYY-MM form. The month is restricted to 01-12 so that
// impossible values such as "2024-00" or "2024-13" are rejected as malformed.
var REFERENCE_PATTERN = /^\d{4}-(?:0[1-9]|1[0-2])$/;
|
|
94
|
+
/**
 * Removes every trailing "/" from a string.
 * @param {string} value - Text (typically a URL) to trim.
 * @returns {string} The input without trailing slashes.
 */
function trimTrailingSlash(value) {
  const trailingSlashes = /\/+$/g;
  return value.replace(trailingSlashes, "");
}
|
|
97
|
+
/**
 * Picks the WebDAV base URL (falling back to the default when the value is
 * null/undefined) and strips trailing slashes from it.
 * @param {string | null | undefined} value - Configured base URL, if any.
 * @returns {string} Base URL without trailing slashes.
 */
function normalizeBaseUrl(value) {
  const baseUrl = value == null ? DEFAULT_FEDERAL_REVENUE_WEBDAV_URL : value;
  return trimTrailingSlash(baseUrl);
}
|
|
100
|
+
/**
 * Returns the trimmed public share token.
 * @param {string | null | undefined} value - Token from config or CLI flag.
 * @returns {string} The trimmed, non-empty token.
 * @throws {ValidationError} When the token is missing or blank.
 */
function getShareToken(value) {
  const shareToken = value?.trim();
  if (shareToken) {
    return shareToken;
  }
  throw new ValidationError(
    "Federal Revenue public share token is not configured. Run `cnpj-db-loader federal-revenue config set share-token <token>` or pass --share-token."
  );
}
|
|
109
|
+
/**
 * Percent-encodes a path segment while keeping "/" separators literal.
 * @param {string} value - Raw path segment (may itself contain slashes).
 * @returns {string} URL-safe segment.
 */
function encodePathSegment(value) {
  const encoded = encodeURIComponent(value);
  // encodeURIComponent escapes "/" as %2F; turn those back into separators.
  return encoded.replace(/%2F/gi, "/");
}
|
|
112
|
+
/**
 * Decodes the five predefined XML entities in a text node.
 *
 * `&amp;` is decoded last so that an escaped entity such as `&amp;lt;`
 * yields the literal text `&lt;` instead of being decoded twice.
 * @param {string} value - XML-escaped text.
 * @returns {string} Text with entities replaced by their characters.
 */
function decodeXml(value) {
  return value
    .replace(/&lt;/g, "<")
    .replace(/&gt;/g, ">")
    .replace(/&quot;/g, '"')
    .replace(/&apos;/g, "'")
    .replace(/&amp;/g, "&");
}
|
|
115
|
+
/**
 * Percent-decodes an href segment, returning the input unchanged when it is
 * not a valid percent-encoded sequence.
 * @param {string} value - Possibly percent-encoded segment.
 * @returns {string} Decoded segment, or the original on malformed input.
 */
function decodeHrefSegment(value) {
  let decoded = value;
  try {
    decoded = decodeURIComponent(value);
  } catch {
    // Malformed escape sequence: deliberately keep the raw segment.
  }
  return decoded;
}
|
|
122
|
+
/**
 * Builds an HTTP Basic Authorization header value with the share token as
 * the user name and an empty password.
 * @param {string} shareToken - Public share token.
 * @returns {string} Value for the `Authorization` header.
 */
function getAuthHeader(shareToken) {
  const credentials = Buffer.from(`${shareToken}:`);
  return `Basic ${credentials.toString("base64")}`;
}
|
|
125
|
+
/**
 * Joins the base URL with encoded path segments.
 * With no segments the result is the base URL followed by a single "/".
 * @param {string} baseUrl - Base URL without trailing slash.
 * @param {string[]} [segments=[]] - Path segments to append.
 * @returns {string} The assembled URL.
 */
function buildUrl(baseUrl, segments = []) {
  const encodedPath = segments.length === 0 ? "" : segments.map(encodePathSegment).join("/");
  return `${baseUrl}/${encodedPath}`;
}
|
|
131
|
+
/**
 * Extracts the trimmed, entity-decoded text of the first `<tagName>` element
 * in an XML fragment, ignoring any namespace prefix and letter case.
 * @param {string} block - XML fragment to search.
 * @param {string} tagName - Local tag name (inserted verbatim into a regex).
 * @returns {string | undefined} Element text, or undefined when the element
 *   is absent or has empty content.
 */
function extractFirst(block, tagName) {
  const optionalPrefix = "(?:[a-zA-Z0-9_-]+:)?";
  const pattern = new RegExp(
    `<${optionalPrefix}${tagName}\\b[^>]*>([\\s\\S]*?)<\\/${optionalPrefix}${tagName}>`,
    "i"
  );
  const content = block.match(pattern)?.[1];
  if (!content) {
    return void 0;
  }
  return decodeXml(content.trim());
}
|
|
139
|
+
/**
 * Tells whether a PROPFIND response block contains a `<collection>` tag
 * (with or without a namespace prefix), i.e. describes a directory.
 * @param {string} block - One response block.
 * @returns {boolean} True when a collection tag is present.
 */
function isCollectionResponse(block) {
  const collectionTag = /<(?:[a-zA-Z0-9_-]+:)?collection\b/i;
  return collectionTag.test(block);
}
|
|
142
|
+
/**
 * Derives the entry name from a WebDAV href: drops the query string and any
 * trailing slashes, takes the last path segment and percent-decodes it.
 * @param {string} href - Href reported by the server.
 * @returns {string} Decoded file or folder name.
 */
function getNameFromHref(href) {
  const [pathOnly = href] = href.split("?");
  const trimmedPath = pathOnly.replace(/\/+$/g, "");
  // lastIndexOf returns -1 when there is no "/", so slice(0) keeps it whole.
  const lastSegment = trimmedPath.slice(trimmedPath.lastIndexOf("/") + 1);
  return decodeHrefSegment(lastSegment);
}
|
|
148
|
+
/**
 * Parses a WebDAV multistatus body into a flat list of entries.
 *
 * Each `<response>` block yields `{ href, name, isCollection }` plus
 * `sizeInBytes`, `lastModified` and `etag` when those are present; blocks
 * without an href are skipped.
 * @param {string} xml - Raw PROPFIND response body.
 * @returns {object[]} Parsed entries (empty when no response block matches).
 */
function parsePropfindXml(xml) {
  const responsePattern = /<(?:[a-zA-Z0-9_-]+:)?response\b[\s\S]*?<\/(?:[a-zA-Z0-9_-]+:)?response>/gi;
  const entries = [];
  for (const block of xml.match(responsePattern) ?? []) {
    const href = extractFirst(block, "href");
    if (!href) {
      continue;
    }
    const rawSize = extractFirst(block, "getcontentlength");
    const sizeInBytes = rawSize ? Number.parseInt(rawSize, 10) : void 0;
    const lastModified = extractFirst(block, "getlastmodified");
    const etag = extractFirst(block, "getetag");
    const entry = {
      href,
      name: getNameFromHref(href),
      isCollection: isCollectionResponse(block)
    };
    // Optional properties are added only when the server supplied a usable value.
    if (Number.isFinite(sizeInBytes)) {
      entry.sizeInBytes = sizeInBytes;
    }
    if (lastModified) {
      entry.lastModified = lastModified;
    }
    if (etag) {
      entry.etag = etag;
    }
    entries.push(entry);
  }
  return entries;
}
|
|
174
|
+
/**
 * Issues a Depth-1 WebDAV PROPFIND against the share and parses the result.
 * @param {string[]} pathSegments - Path below the base URL to list.
 * @param {{ baseUrl?: string, shareToken?: string, userAgent?: string }} [options={}]
 *   Client options; base URL and user agent fall back to the defaults.
 * @returns {Promise<{ entries: object[], baseUrl: string, shareToken: string }>}
 *   Parsed entries plus the base URL and token that were actually used.
 * @throws {ValidationError} When the request cannot be sent or the response
 *   status is not OK.
 */
async function propfind(pathSegments, options = {}) {
  const baseUrl = normalizeBaseUrl(options.baseUrl);
  const shareToken = getShareToken(options.shareToken);
  let response;
  try {
    const targetUrl = buildUrl(baseUrl, pathSegments);
    const headers = {
      Accept: "application/xml,text/xml,*/*",
      Authorization: getAuthHeader(shareToken),
      Depth: "1",
      "User-Agent": options.userAgent ?? DEFAULT_FEDERAL_REVENUE_USER_AGENT
    };
    response = await fetch(targetUrl, { method: "PROPFIND", headers });
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new ValidationError(
      `Federal Revenue WebDAV request failed before receiving a response: ${reason}.`,
      { baseUrl, pathSegments }
    );
  }
  const { ok, status, statusText } = response;
  if (!ok) {
    throw new ValidationError(
      `Federal Revenue WebDAV request failed with status ${status} ${statusText}.`,
      { status, statusText }
    );
  }
  const body = await response.text();
  return { entries: parsePropfindXml(body), baseUrl, shareToken };
}
|
|
207
|
+
/**
 * Ensures a monthly reference matches REFERENCE_PATTERN.
 * @param {string} reference - Candidate reference.
 * @returns {void}
 * @throws {ValidationError} When the reference does not match the pattern.
 */
function validateFederalRevenueReference(reference) {
  const isWellFormed = REFERENCE_PATTERN.test(reference);
  if (isWellFormed) {
    return;
  }
  throw new ValidationError(
    `Federal Revenue reference is invalid: ${reference}. Expected YYYY-MM.`
  );
}
|
|
214
|
+
/**
 * Formats a date as a YYYY-MM reference using its local-time year and month.
 * @param {Date} [date] - Date to format; defaults to the current time.
 * @returns {string} Reference such as "2024-05".
 */
function getCurrentFederalRevenueReference(date = /* @__PURE__ */ new Date()) {
  // getMonth() is zero-based, hence the +1.
  const monthNumber = date.getMonth() + 1;
  const paddedMonth = `${monthNumber}`.padStart(2, "0");
  return `${date.getFullYear()}-${paddedMonth}`;
}
|
|
219
|
+
/**
 * Lists the monthly reference folders found at the root of the share.
 * Only collections whose name matches REFERENCE_PATTERN are kept, sorted
 * ascending by reference.
 * @param {object} [options={}] - Client options forwarded to `propfind`.
 * @returns {Promise<{ references: { reference: string, href: string }[], remoteBaseUrl: string }>}
 */
async function listFederalRevenueReferences(options = {}) {
  const { entries, baseUrl } = await propfind([], options);
  const references = [];
  for (const entry of entries) {
    if (entry.isCollection && REFERENCE_PATTERN.test(entry.name)) {
      references.push({ reference: entry.name, href: entry.href });
    }
  }
  references.sort((a, b) => a.reference.localeCompare(b.reference));
  return { references, remoteBaseUrl: baseUrl };
}
|
|
230
|
+
/**
 * Chooses which monthly reference to use.
 *
 * Modes, in priority order:
 *  - "explicit": `input.reference` was given and exists on the share;
 *  - "current":  `input.current` is truthy and the current month exists;
 *  - "latest":   otherwise, the last reference in the sorted listing.
 *
 * @param {object} [input={}] - Client options plus optional `reference`
 *   (YYYY-MM) and `current` flag. The whole object is forwarded to
 *   `listFederalRevenueReferences`.
 * @returns {Promise<{ mode: string, selectedReference: string, availableReferences: string[] }>}
 * @throws {ValidationError} When the explicit reference is malformed, no
 *   references are found, or the requested reference is not available.
 */
async function resolveFederalRevenueReference(input = {}) {
  // Fail fast: reject a malformed explicit reference before any network request.
  if (input.reference) {
    validateFederalRevenueReference(input.reference);
  }
  const { references } = await listFederalRevenueReferences(input);
  const availableReferences = references.map((item) => item.reference);
  const latest = availableReferences.at(-1);
  if (!latest) {
    throw new ValidationError(
      "Federal Revenue reference discovery failed: no monthly references were found in the public share."
    );
  }
  if (input.reference) {
    if (!availableReferences.includes(input.reference)) {
      throw new ValidationError(
        `Federal Revenue reference not found: ${input.reference}. Latest available reference is ${latest}.`,
        {
          requestedReference: input.reference,
          latestAvailableReference: latest,
          availableReferences
        }
      );
    }
    return {
      mode: "explicit",
      selectedReference: input.reference,
      availableReferences
    };
  }
  if (input.current) {
    const currentReference = getCurrentFederalRevenueReference();
    if (!availableReferences.includes(currentReference)) {
      throw new ValidationError(
        `Federal Revenue current reference is not available yet: ${currentReference}. Latest available reference is ${latest}.`,
        {
          requestedReference: currentReference,
          latestAvailableReference: latest,
          availableReferences
        }
      );
    }
    return {
      mode: "current",
      selectedReference: currentReference,
      availableReferences
    };
  }
  return {
    mode: "latest",
    selectedReference: latest,
    availableReferences
  };
}
|
|
281
|
+
/**
 * Lists the `.zip` files published under one monthly reference.
 * Collections and non-zip entries are ignored; results are sorted by name.
 * @param {string} reference - Monthly reference (validated first).
 * @param {object} [options={}] - Client options forwarded to `propfind`.
 * @returns {Promise<{ files: object[], remoteBaseUrl: string }>} Each file has
 *   `name`, `href`, `downloadUrl` and, when known, `sizeInBytes`,
 *   `lastModified` and `etag`.
 * @throws {ValidationError} When the reference is malformed.
 */
async function listFederalRevenueFiles(reference, options = {}) {
  validateFederalRevenueReference(reference);
  const { entries, baseUrl } = await propfind([reference], options);
  const files = [];
  for (const entry of entries) {
    if (entry.isCollection || !entry.name.toLowerCase().endsWith(".zip")) {
      continue;
    }
    const file = {
      name: entry.name,
      href: entry.href,
      downloadUrl: buildUrl(baseUrl, [reference, entry.name])
    };
    if (entry.sizeInBytes !== void 0) {
      file.sizeInBytes = entry.sizeInBytes;
    }
    if (entry.lastModified) {
      file.lastModified = entry.lastModified;
    }
    if (entry.etag) {
      file.etag = entry.etag;
    }
    files.push(file);
  }
  files.sort((a, b) => a.name.localeCompare(b.name));
  return { files, remoteBaseUrl: baseUrl };
}
|
|
299
|
+
/**
 * Builds the HTTP headers needed to download a file from the share.
 * @param {{ shareToken?: string, userAgent?: string }} [options={}] - Client options.
 * @returns {{ Authorization: string, "User-Agent": string }} Request headers.
 * @throws {ValidationError} When no share token is configured.
 */
function buildFederalRevenueDownloadHeaders(options = {}) {
  const shareToken = getShareToken(options.shareToken);
  const headers = { Authorization: getAuthHeader(shareToken) };
  headers["User-Agent"] = options.userAgent ?? DEFAULT_FEDERAL_REVENUE_USER_AGENT;
  return headers;
}
|
|
305
|
+
|
|
90
306
|
// src/services/config.service.ts
|
|
91
307
|
async function readDatabaseConfig() {
|
|
92
308
|
const raw = await safeReadText(getConfigFilePath());
|
|
@@ -114,12 +330,149 @@ function assertPostgresUrl(url) {
|
|
|
114
330
|
);
|
|
115
331
|
}
|
|
116
332
|
}
|
|
333
|
+
/**
 * Asserts that a string is a parseable URL using the http or https scheme.
 * @param {string} url - Candidate URL.
 * @param {string} label - Human-readable name used in error messages.
 * @returns {void}
 * @throws {ValidationError} When the URL cannot be parsed or uses another scheme.
 */
function assertHttpUrl(url, label) {
  const allowedProtocols = ["http:", "https:"];
  let protocol;
  try {
    ({ protocol } = new URL(url));
  } catch {
    throw new ValidationError(`${label} is not a valid URL.`, { url });
  }
  if (allowedProtocols.includes(protocol)) {
    return;
  }
  throw new ValidationError(`${label} must use the http or https protocol.`, {
    url
  });
}
|
|
346
|
+
/**
 * Trims a value and asserts that something is left.
 *
 * Non-string input (for example `undefined` from a missing CLI argument) is
 * treated as empty so the caller gets a ValidationError rather than a
 * TypeError from calling `.trim()` on a non-string.
 * @param {string} value - Raw user-supplied value.
 * @param {string} label - Human-readable name used in the error message.
 * @returns {string} The trimmed value.
 * @throws {ValidationError} When the value is missing or blank.
 */
function assertNonEmpty(value, label) {
  const trimmed = typeof value === "string" ? value.trim() : "";
  if (!trimmed) {
    throw new ValidationError(`${label} cannot be empty.`);
  }
  return trimmed;
}
|
|
353
|
+
/**
 * Maps a user-supplied config key (case-insensitive, surrounding whitespace
 * ignored) to its canonical name.
 * @param {string} key - Key or alias typed by the user.
 * @returns {"share-token" | "webdav-url" | "user-agent"} Canonical key.
 * @throws {ValidationError} When the key is not a known alias.
 */
function normalizeFederalRevenueConfigKey(key) {
  const canonicalByAlias = new Map([
    ["share-token", "share-token"],
    ["share_token", "share-token"],
    ["token", "share-token"],
    ["webdav-url", "webdav-url"],
    ["webdav_url", "webdav-url"],
    ["base-url", "webdav-url"],
    ["base_url", "webdav-url"],
    ["url", "webdav-url"],
    ["user-agent", "user-agent"],
    ["user_agent", "user-agent"]
  ]);
  const canonical = canonicalByAlias.get(key.trim().toLowerCase());
  if (canonical === void 0) {
    throw new ValidationError(
      `Unknown Federal Revenue config key: ${key}. Expected share-token, webdav-url, or user-agent.`
    );
  }
  return canonical;
}
|
|
370
|
+
/**
 * Returns a copy of the Federal Revenue config with one value set.
 * The input config is not mutated. Any key other than "share-token" or
 * "webdav-url" is stored as the user agent.
 * @param {object} config - Current Federal Revenue config.
 * @param {string} key - Canonical config key.
 * @param {string} value - New value; must be non-empty after trimming.
 * @returns {object} Updated config copy.
 * @throws {ValidationError} When the value is empty or the URL is invalid.
 */
function assignFederalRevenueConfigValue(config, key, value) {
  switch (key) {
    case "share-token": {
      const shareToken = assertNonEmpty(value, "Federal Revenue share token");
      return { ...config, shareToken };
    }
    case "webdav-url": {
      const webdavUrl = assertNonEmpty(value, "Federal Revenue WebDAV URL");
      assertHttpUrl(webdavUrl, "Federal Revenue WebDAV URL");
      return { ...config, webdavUrl };
    }
    default: {
      const userAgent = assertNonEmpty(value, "Federal Revenue user agent");
      return { ...config, userAgent };
    }
  }
}
|
|
387
|
+
/**
 * Returns a copy of the Federal Revenue config with one value removed.
 * Unknown keys leave the copy unchanged; the input is not mutated.
 * @param {object} config - Current Federal Revenue config.
 * @param {string} key - Canonical config key to remove.
 * @returns {object} Updated config copy.
 */
function deleteFederalRevenueConfigValue(config, key) {
  const nextConfig = Object.assign({}, config);
  switch (key) {
    case "share-token":
      delete nextConfig.shareToken;
      break;
    case "webdav-url":
      delete nextConfig.webdavUrl;
      break;
    case "user-agent":
      delete nextConfig.userAgent;
      break;
    default:
      break;
  }
  return nextConfig;
}
|
|
400
|
+
/**
 * Tells whether none of the Federal Revenue settings holds a truthy value.
 * @param {object} config - Federal Revenue config.
 * @returns {boolean} True when share token, WebDAV URL and user agent are all unset.
 */
function isFederalRevenueConfigEmpty(config) {
  const hasAnyValue = config.shareToken || config.webdavUrl || config.userAgent;
  return !hasAnyValue;
}
|
|
117
403
|
/**
 * Stores the default PostgreSQL URL, keeping every other saved setting.
 * @param {string} url - PostgreSQL connection URL (validated first).
 * @returns {Promise<void>}
 */
async function setDefaultDbUrl(url) {
  assertPostgresUrl(url);
  const existingConfig = await readDatabaseConfig();
  const updatedConfig = { ...existingConfig, defaultDbUrl: url };
  await writeDatabaseConfig(updatedConfig);
}
|
|
121
408
|
/**
 * Removes the stored default database URL, keeping every other saved setting.
 * @returns {Promise<void>}
 */
async function resetDefaultDbUrl() {
  const nextConfig = Object.assign({}, await readDatabaseConfig());
  delete nextConfig.defaultDbUrl;
  await writeDatabaseConfig(nextConfig);
}
|
|
414
|
+
/**
 * Sets one Federal Revenue setting in the saved config.
 * @param {string} key - Config key or alias.
 * @param {string} value - New value.
 * @returns {Promise<object>} Effective Federal Revenue config after the change.
 * @throws {ValidationError} When the key is unknown or the value is invalid.
 */
async function setFederalRevenueConfigValue(key, value) {
  const canonicalKey = normalizeFederalRevenueConfigKey(key);
  const savedConfig = await readDatabaseConfig();
  const previousFederalRevenue = savedConfig.federalRevenue ?? {};
  const federalRevenue = assignFederalRevenueConfigValue(previousFederalRevenue, canonicalKey, value);
  await writeDatabaseConfig({ ...savedConfig, federalRevenue });
  return getFederalRevenueEffectiveConfig(federalRevenue);
}
|
|
428
|
+
/**
 * Clears Federal Revenue settings from the saved config.
 * Without a key the whole section is removed; with a key only that setting
 * is removed, and the section is dropped if nothing remains.
 * @param {string} [key] - Config key or alias to clear; omit to clear all.
 * @returns {Promise<object>} Effective Federal Revenue config after the change.
 * @throws {ValidationError} When the key is unknown.
 */
async function resetFederalRevenueConfig(key) {
  const savedConfig = await readDatabaseConfig();
  let federalRevenue = {};
  if (key) {
    const canonicalKey = normalizeFederalRevenueConfigKey(key);
    federalRevenue = deleteFederalRevenueConfigValue(
      savedConfig.federalRevenue ?? {},
      canonicalKey
    );
  }
  const nextConfig = { ...savedConfig };
  if (!key || isFederalRevenueConfigEmpty(federalRevenue)) {
    delete nextConfig.federalRevenue;
  } else {
    nextConfig.federalRevenue = federalRevenue;
  }
  await writeDatabaseConfig(nextConfig);
  return getFederalRevenueEffectiveConfig(federalRevenue);
}
|
|
450
|
+
/**
 * Combines stored Federal Revenue settings with the built-in defaults.
 * @param {{ webdavUrl?: string, userAgent?: string, shareToken?: string }} [config={}]
 *   Stored settings.
 * @returns {object} `webdavUrl` and `userAgent` (defaulted), `shareToken` when
 *   set, and a `configured` map telling which values were explicitly stored.
 */
function getFederalRevenueEffectiveConfig(config = {}) {
  const { webdavUrl, userAgent, shareToken } = config;
  const effective = {
    webdavUrl: webdavUrl ?? DEFAULT_FEDERAL_REVENUE_WEBDAV_URL,
    userAgent: userAgent ?? DEFAULT_FEDERAL_REVENUE_USER_AGENT
  };
  // The token has no default, so the key is present only when one is stored.
  if (shareToken) {
    effective.shareToken = shareToken;
  }
  effective.configured = {
    webdavUrl: Boolean(webdavUrl),
    userAgent: Boolean(userAgent),
    shareToken: Boolean(shareToken)
  };
  return effective;
}
|
|
462
|
+
/**
 * Reads the saved config and returns the effective Federal Revenue settings.
 * @returns {Promise<object>} Effective Federal Revenue config.
 */
async function readFederalRevenueEffectiveConfig() {
  const { federalRevenue } = await readDatabaseConfig();
  return getFederalRevenueEffectiveConfig(federalRevenue ?? {});
}
|
|
466
|
+
/**
 * Builds the client options for a Federal Revenue request. Explicit overrides
 * win over saved settings; null/undefined overrides fall back to them.
 * @param {{ baseUrl?: string, shareToken?: string, userAgent?: string }} [overrides={}]
 *   Per-call overrides (typically CLI flags).
 * @returns {Promise<{ baseUrl: string, shareToken: string | undefined, userAgent: string }>}
 */
async function resolveFederalRevenueClientOptions(overrides = {}) {
  const { federalRevenue } = await readDatabaseConfig();
  const saved = getFederalRevenueEffectiveConfig(federalRevenue ?? {});
  const baseUrl = overrides.baseUrl ?? saved.webdavUrl;
  const shareToken = overrides.shareToken ?? saved.shareToken;
  const userAgent = overrides.userAgent ?? saved.userAgent;
  return { baseUrl, shareToken, userAgent };
}
|
|
124
477
|
|
|
125
478
|
// src/services/database.service.ts
|
|
@@ -6717,217 +7070,6 @@ async function showQuarantineRow(id, options) {
|
|
|
6717
7070
|
return record;
|
|
6718
7071
|
}
|
|
6719
7072
|
|
|
6720
|
-
// src/services/federal-revenue/client.ts
|
|
6721
|
-
var DEFAULT_FEDERAL_REVENUE_SHARE_TOKEN = "YggdBLfdninEJX9";
|
|
6722
|
-
var DEFAULT_FEDERAL_REVENUE_WEBDAV_URL = "https://arquivos.receitafederal.gov.br/public.php/webdav";
|
|
6723
|
-
var DEFAULT_FEDERAL_REVENUE_USER_AGENT = "cnpj-db-loader federal-revenue-client";
|
|
6724
|
-
var REFERENCE_PATTERN = /^\d{4}-\d{2}$/;
|
|
6725
|
-
function trimTrailingSlash(value) {
|
|
6726
|
-
return value.replace(/\/+$/g, "");
|
|
6727
|
-
}
|
|
6728
|
-
function normalizeBaseUrl(value) {
|
|
6729
|
-
return trimTrailingSlash(value ?? DEFAULT_FEDERAL_REVENUE_WEBDAV_URL);
|
|
6730
|
-
}
|
|
6731
|
-
function getShareToken(value) {
|
|
6732
|
-
return value ?? DEFAULT_FEDERAL_REVENUE_SHARE_TOKEN;
|
|
6733
|
-
}
|
|
6734
|
-
function encodePathSegment(value) {
|
|
6735
|
-
return encodeURIComponent(value).replace(/%2F/gi, "/");
|
|
6736
|
-
}
|
|
6737
|
-
function decodeXml(value) {
|
|
6738
|
-
return value.replace(/&/g, "&").replace(/</g, "<").replace(/>/g, ">").replace(/"/g, '"').replace(/'/g, "'");
|
|
6739
|
-
}
|
|
6740
|
-
function decodeHrefSegment(value) {
|
|
6741
|
-
try {
|
|
6742
|
-
return decodeURIComponent(value);
|
|
6743
|
-
} catch {
|
|
6744
|
-
return value;
|
|
6745
|
-
}
|
|
6746
|
-
}
|
|
6747
|
-
function getAuthHeader(shareToken) {
|
|
6748
|
-
return `Basic ${Buffer.from(`${shareToken}:`).toString("base64")}`;
|
|
6749
|
-
}
|
|
6750
|
-
function buildUrl(baseUrl, segments = []) {
|
|
6751
|
-
if (segments.length === 0) {
|
|
6752
|
-
return `${baseUrl}/`;
|
|
6753
|
-
}
|
|
6754
|
-
return `${baseUrl}/${segments.map(encodePathSegment).join("/")}`;
|
|
6755
|
-
}
|
|
6756
|
-
function extractFirst(block, tagName) {
|
|
6757
|
-
const pattern = new RegExp(
|
|
6758
|
-
`<(?:[a-zA-Z0-9_-]+:)?${tagName}\\b[^>]*>([\\s\\S]*?)<\\/(?:[a-zA-Z0-9_-]+:)?${tagName}>`,
|
|
6759
|
-
"i"
|
|
6760
|
-
);
|
|
6761
|
-
const match = block.match(pattern);
|
|
6762
|
-
return match?.[1] ? decodeXml(match[1].trim()) : void 0;
|
|
6763
|
-
}
|
|
6764
|
-
function isCollectionResponse(block) {
|
|
6765
|
-
return /<(?:[a-zA-Z0-9_-]+:)?collection\b/i.test(block);
|
|
6766
|
-
}
|
|
6767
|
-
function getNameFromHref(href) {
|
|
6768
|
-
const cleanHref = href.split("?")[0] ?? href;
|
|
6769
|
-
const withoutTrailingSlash = cleanHref.replace(/\/+$/g, "");
|
|
6770
|
-
const rawName = withoutTrailingSlash.split("/").pop() ?? withoutTrailingSlash;
|
|
6771
|
-
return decodeHrefSegment(rawName);
|
|
6772
|
-
}
|
|
6773
|
-
function parsePropfindXml(xml) {
|
|
6774
|
-
const responseBlocks = xml.match(
|
|
6775
|
-
/<(?:[a-zA-Z0-9_-]+:)?response\b[\s\S]*?<\/(?:[a-zA-Z0-9_-]+:)?response>/gi
|
|
6776
|
-
);
|
|
6777
|
-
if (!responseBlocks) {
|
|
6778
|
-
return [];
|
|
6779
|
-
}
|
|
6780
|
-
return responseBlocks.map((block) => {
|
|
6781
|
-
const href = extractFirst(block, "href");
|
|
6782
|
-
if (!href) {
|
|
6783
|
-
return void 0;
|
|
6784
|
-
}
|
|
6785
|
-
const size = extractFirst(block, "getcontentlength");
|
|
6786
|
-
const parsedSize = size ? Number.parseInt(size, 10) : void 0;
|
|
6787
|
-
const lastModified = extractFirst(block, "getlastmodified");
|
|
6788
|
-
const etag = extractFirst(block, "getetag");
|
|
6789
|
-
return {
|
|
6790
|
-
href,
|
|
6791
|
-
name: getNameFromHref(href),
|
|
6792
|
-
isCollection: isCollectionResponse(block),
|
|
6793
|
-
...Number.isFinite(parsedSize) ? { sizeInBytes: parsedSize } : {},
|
|
6794
|
-
...lastModified ? { lastModified } : {},
|
|
6795
|
-
...etag ? { etag } : {}
|
|
6796
|
-
};
|
|
6797
|
-
}).filter((entry) => entry !== void 0);
|
|
6798
|
-
}
|
|
6799
|
-
async function propfind(pathSegments, options = {}) {
|
|
6800
|
-
const baseUrl = normalizeBaseUrl(options.baseUrl);
|
|
6801
|
-
const shareToken = getShareToken(options.shareToken);
|
|
6802
|
-
let response;
|
|
6803
|
-
try {
|
|
6804
|
-
response = await fetch(buildUrl(baseUrl, pathSegments), {
|
|
6805
|
-
method: "PROPFIND",
|
|
6806
|
-
headers: {
|
|
6807
|
-
Accept: "application/xml,text/xml,*/*",
|
|
6808
|
-
Authorization: getAuthHeader(shareToken),
|
|
6809
|
-
Depth: "1",
|
|
6810
|
-
"User-Agent": options.userAgent ?? DEFAULT_FEDERAL_REVENUE_USER_AGENT
|
|
6811
|
-
}
|
|
6812
|
-
});
|
|
6813
|
-
} catch (error) {
|
|
6814
|
-
throw new ValidationError(
|
|
6815
|
-
`Federal Revenue WebDAV request failed before receiving a response: ${error instanceof Error ? error.message : String(error)}.`,
|
|
6816
|
-
{ baseUrl, pathSegments }
|
|
6817
|
-
);
|
|
6818
|
-
}
|
|
6819
|
-
if (!response.ok) {
|
|
6820
|
-
throw new ValidationError(
|
|
6821
|
-
`Federal Revenue WebDAV request failed with status ${response.status} ${response.statusText}.`,
|
|
6822
|
-
{ status: response.status, statusText: response.statusText }
|
|
6823
|
-
);
|
|
6824
|
-
}
|
|
6825
|
-
const xml = await response.text();
|
|
6826
|
-
return {
|
|
6827
|
-
entries: parsePropfindXml(xml),
|
|
6828
|
-
baseUrl,
|
|
6829
|
-
shareToken
|
|
6830
|
-
};
|
|
6831
|
-
}
|
|
6832
|
-
function validateFederalRevenueReference(reference) {
|
|
6833
|
-
if (!REFERENCE_PATTERN.test(reference)) {
|
|
6834
|
-
throw new ValidationError(
|
|
6835
|
-
`Federal Revenue reference is invalid: ${reference}. Expected YYYY-MM.`
|
|
6836
|
-
);
|
|
6837
|
-
}
|
|
6838
|
-
}
|
|
6839
|
-
function getCurrentFederalRevenueReference(date = /* @__PURE__ */ new Date()) {
|
|
6840
|
-
const year = date.getFullYear();
|
|
6841
|
-
const month = String(date.getMonth() + 1).padStart(2, "0");
|
|
6842
|
-
return `${year}-${month}`;
|
|
6843
|
-
}
|
|
6844
|
-
async function listFederalRevenueReferences(options = {}) {
|
|
6845
|
-
const result = await propfind([], options);
|
|
6846
|
-
const references = result.entries.filter((entry) => entry.isCollection && REFERENCE_PATTERN.test(entry.name)).map((entry) => ({
|
|
6847
|
-
reference: entry.name,
|
|
6848
|
-
href: entry.href
|
|
6849
|
-
})).sort((left, right) => left.reference.localeCompare(right.reference));
|
|
6850
|
-
return {
|
|
6851
|
-
references,
|
|
6852
|
-
remoteBaseUrl: result.baseUrl
|
|
6853
|
-
};
|
|
6854
|
-
}
|
|
6855
|
-
async function resolveFederalRevenueReference(input = {}) {
|
|
6856
|
-
const { references } = await listFederalRevenueReferences(input);
|
|
6857
|
-
const availableReferences = references.map((item) => item.reference);
|
|
6858
|
-
const latest = availableReferences.at(-1);
|
|
6859
|
-
if (!latest) {
|
|
6860
|
-
throw new ValidationError(
|
|
6861
|
-
"Federal Revenue reference discovery failed: no monthly references were found in the public share."
|
|
6862
|
-
);
|
|
6863
|
-
}
|
|
6864
|
-
if (input.reference) {
|
|
6865
|
-
validateFederalRevenueReference(input.reference);
|
|
6866
|
-
if (!availableReferences.includes(input.reference)) {
|
|
6867
|
-
throw new ValidationError(
|
|
6868
|
-
`Federal Revenue reference not found: ${input.reference}. Latest available reference is ${latest}.`,
|
|
6869
|
-
{
|
|
6870
|
-
requestedReference: input.reference,
|
|
6871
|
-
latestAvailableReference: latest,
|
|
6872
|
-
availableReferences
|
|
6873
|
-
}
|
|
6874
|
-
);
|
|
6875
|
-
}
|
|
6876
|
-
return {
|
|
6877
|
-
mode: "explicit",
|
|
6878
|
-
selectedReference: input.reference,
|
|
6879
|
-
availableReferences
|
|
6880
|
-
};
|
|
6881
|
-
}
|
|
6882
|
-
if (input.current) {
|
|
6883
|
-
const currentReference = getCurrentFederalRevenueReference();
|
|
6884
|
-
if (!availableReferences.includes(currentReference)) {
|
|
6885
|
-
throw new ValidationError(
|
|
6886
|
-
`Federal Revenue current reference is not available yet: ${currentReference}. Latest available reference is ${latest}.`,
|
|
6887
|
-
{
|
|
6888
|
-
requestedReference: currentReference,
|
|
6889
|
-
latestAvailableReference: latest,
|
|
6890
|
-
availableReferences
|
|
6891
|
-
}
|
|
6892
|
-
);
|
|
6893
|
-
}
|
|
6894
|
-
return {
|
|
6895
|
-
mode: "current",
|
|
6896
|
-
selectedReference: currentReference,
|
|
6897
|
-
availableReferences
|
|
6898
|
-
};
|
|
6899
|
-
}
|
|
6900
|
-
return {
|
|
6901
|
-
mode: "latest",
|
|
6902
|
-
selectedReference: latest,
|
|
6903
|
-
availableReferences
|
|
6904
|
-
};
|
|
6905
|
-
}
|
|
6906
|
-
async function listFederalRevenueFiles(reference, options = {}) {
|
|
6907
|
-
validateFederalRevenueReference(reference);
|
|
6908
|
-
const result = await propfind([reference], options);
|
|
6909
|
-
const files = result.entries.filter(
|
|
6910
|
-
(entry) => !entry.isCollection && entry.name.toLowerCase().endsWith(".zip")
|
|
6911
|
-
).map((entry) => ({
|
|
6912
|
-
name: entry.name,
|
|
6913
|
-
href: entry.href,
|
|
6914
|
-
downloadUrl: buildUrl(result.baseUrl, [reference, entry.name]),
|
|
6915
|
-
...entry.sizeInBytes !== void 0 ? { sizeInBytes: entry.sizeInBytes } : {},
|
|
6916
|
-
...entry.lastModified ? { lastModified: entry.lastModified } : {},
|
|
6917
|
-
...entry.etag ? { etag: entry.etag } : {}
|
|
6918
|
-
})).sort((left, right) => left.name.localeCompare(right.name));
|
|
6919
|
-
return {
|
|
6920
|
-
files,
|
|
6921
|
-
remoteBaseUrl: result.baseUrl
|
|
6922
|
-
};
|
|
6923
|
-
}
|
|
6924
|
-
function buildFederalRevenueDownloadHeaders(options = {}) {
|
|
6925
|
-
return {
|
|
6926
|
-
Authorization: getAuthHeader(getShareToken(options.shareToken)),
|
|
6927
|
-
"User-Agent": options.userAgent ?? DEFAULT_FEDERAL_REVENUE_USER_AGENT
|
|
6928
|
-
};
|
|
6929
|
-
}
|
|
6930
|
-
|
|
6931
7073
|
// src/services/federal-revenue/download.ts
|
|
6932
7074
|
import { createWriteStream } from "fs";
|
|
6933
7075
|
import { mkdir as mkdir5, rename, stat as stat5, unlink } from "fs/promises";
|
|
@@ -8411,6 +8553,18 @@ var STAGING_TABLE_BY_DATASET3 = {
|
|
|
8411
8553
|
partners: "staging_partners",
|
|
8412
8554
|
simples_options: "staging_simples_options"
|
|
8413
8555
|
};
|
|
8556
|
+
// Step identifiers listed in the order stored in STEP_ORDER.
var STEP_ORDER = [
  "setup",
  // Staging loads, one step per dataset group.
  "load-domains",
  "load-companies",
  "load-establishments",
  "load-partners",
  "load-simples",
  // Materialization steps.
  "materialize",
  "materialize-secondary-cnaes",
  // Post-load steps.
  "indexes",
  "analyze"
];
|
|
8414
8568
|
function quoteSqlLiteral(value) {
|
|
8415
8569
|
return `'${value.replace(/'/g, "''")}'`;
|
|
8416
8570
|
}
|
|
@@ -8428,6 +8582,9 @@ function receitaCopyCommand(tableName, columns, filePath) {
|
|
|
8428
8582
|
const normalizedFilePath = normalizePathForPsql(filePath);
|
|
8429
8583
|
return `\\copy ${tableName} (${columns.join(", ")}) from ${quoteSqlLiteral(normalizedFilePath)} with (format csv, header false, delimiter ';', quote '"', escape '"')`;
|
|
8430
8584
|
}
|
|
8585
|
+
/**
 * Builds a psql `\echo` meta-command that prints the given message.
 * @param {string} message - Text to print; quoted via `quoteSqlLiteral`.
 * @returns {string} One script line of the form `\echo '<message>'`.
 */
function echo(message) {
  const quotedMessage = quoteSqlLiteral(message);
  return `\\echo ${quotedMessage}`;
}
|
|
8431
8588
|
function datasetColumns(dataset) {
|
|
8432
8589
|
return DATASET_LAYOUTS[dataset].fields.map((field) => field.columnName);
|
|
8433
8590
|
}
|
|
@@ -8454,7 +8611,7 @@ function partnerDedupeExpression(alias) {
|
|
|
8454
8611
|
function materializeCompaniesSql() {
|
|
8455
8612
|
const columns = companiesLayout.fields.map((field) => field.columnName);
|
|
8456
8613
|
return [
|
|
8457
|
-
"
|
|
8614
|
+
echo("[materialize] Materializing companies..."),
|
|
8458
8615
|
"with source as (",
|
|
8459
8616
|
" select",
|
|
8460
8617
|
` ${columns.map((column) => `source.${column}`).join(",\n ")},`,
|
|
@@ -8468,7 +8625,8 @@ function materializeCompaniesSql() {
|
|
|
8468
8625
|
`select ${columns.join(", ")}`,
|
|
8469
8626
|
"from deduped",
|
|
8470
8627
|
"on conflict (cnpj_root) do update set",
|
|
8471
|
-
` ${updateAssignments(columns, ["cnpj_root"])}
|
|
8628
|
+
` ${updateAssignments(columns, ["cnpj_root"])};`,
|
|
8629
|
+
echo("[materialize] Companies materialization completed.")
|
|
8472
8630
|
].join("\n");
|
|
8473
8631
|
}
|
|
8474
8632
|
function materializeEstablishmentsSql() {
|
|
@@ -8477,7 +8635,7 @@ function materializeEstablishmentsSql() {
|
|
|
8477
8635
|
);
|
|
8478
8636
|
const insertColumns = [...baseColumns, "cnpj_full"];
|
|
8479
8637
|
return [
|
|
8480
|
-
"
|
|
8638
|
+
echo("[materialize] Materializing establishments..."),
|
|
8481
8639
|
"with source as (",
|
|
8482
8640
|
" select",
|
|
8483
8641
|
` ${baseColumns.map((column) => `source.${column}`).join(",\n ")},`,
|
|
@@ -8487,14 +8645,29 @@ function materializeEstablishmentsSql() {
|
|
|
8487
8645
|
"),",
|
|
8488
8646
|
"deduped as (",
|
|
8489
8647
|
" select * from source where dedupe_rank = 1",
|
|
8648
|
+
")",
|
|
8649
|
+
`insert into establishments (${insertColumns.join(", ")})`,
|
|
8650
|
+
`select ${insertColumns.join(", ")}`,
|
|
8651
|
+
"from deduped",
|
|
8652
|
+
"on conflict (cnpj_full) do update set",
|
|
8653
|
+
` ${updateAssignments(insertColumns, ["cnpj_root", "cnpj_order", "cnpj_check_digits", "cnpj_full"])};`,
|
|
8654
|
+
echo("[materialize] Establishments materialization completed.")
|
|
8655
|
+
].join("\n");
|
|
8656
|
+
}
|
|
8657
|
+
function materializeSecondaryCnaesSql() {
|
|
8658
|
+
return [
|
|
8659
|
+
echo(
|
|
8660
|
+
"[materialize-secondary-cnaes] Materializing establishment secondary CNAEs..."
|
|
8661
|
+
),
|
|
8662
|
+
"with source as (",
|
|
8663
|
+
" select",
|
|
8664
|
+
" staging.cnpj_root || staging.cnpj_order || staging.cnpj_check_digits as cnpj_full,",
|
|
8665
|
+
" staging.secondary_cnaes_raw,",
|
|
8666
|
+
" row_number() over (partition by staging.cnpj_root || staging.cnpj_order || staging.cnpj_check_digits order by staging.staging_id desc) as dedupe_rank",
|
|
8667
|
+
" from staging_establishments staging",
|
|
8490
8668
|
"),",
|
|
8491
|
-
"
|
|
8492
|
-
|
|
8493
|
-
` select ${insertColumns.join(", ")}`,
|
|
8494
|
-
" from deduped",
|
|
8495
|
-
" on conflict (cnpj_full) do update set",
|
|
8496
|
-
` ${updateAssignments(insertColumns, ["cnpj_root", "cnpj_order", "cnpj_check_digits", "cnpj_full"])}`,
|
|
8497
|
-
" returning cnpj_full",
|
|
8669
|
+
"deduped as (",
|
|
8670
|
+
" select * from source where dedupe_rank = 1",
|
|
8498
8671
|
"),",
|
|
8499
8672
|
"deleted_secondary_cnaes as (",
|
|
8500
8673
|
" delete from establishment_secondary_cnaes target",
|
|
@@ -8515,14 +8688,17 @@ function materializeEstablishmentsSql() {
|
|
|
8515
8688
|
"insert into establishment_secondary_cnaes (cnpj_full, cnae_code)",
|
|
8516
8689
|
"select cnpj_full, cnae_code",
|
|
8517
8690
|
"from secondary_cnaes_source",
|
|
8518
|
-
"on conflict (cnpj_full, cnae_code) do nothing;"
|
|
8691
|
+
"on conflict (cnpj_full, cnae_code) do nothing;",
|
|
8692
|
+
echo(
|
|
8693
|
+
"[materialize-secondary-cnaes] Secondary CNAEs materialization completed."
|
|
8694
|
+
)
|
|
8519
8695
|
].join("\n");
|
|
8520
8696
|
}
|
|
8521
8697
|
function materializePartnersSql() {
|
|
8522
8698
|
const baseColumns = partnersLayout.fields.map((field) => field.columnName);
|
|
8523
8699
|
const insertColumns = [...baseColumns, "partner_dedupe_key"];
|
|
8524
8700
|
return [
|
|
8525
|
-
"
|
|
8701
|
+
echo("[materialize] Materializing partners..."),
|
|
8526
8702
|
"with source as (",
|
|
8527
8703
|
" select",
|
|
8528
8704
|
` ${baseColumns.map((column) => `source.${column}`).join(",\n ")},`,
|
|
@@ -8542,13 +8718,14 @@ function materializePartnersSql() {
|
|
|
8542
8718
|
`select ${insertColumns.join(", ")}`,
|
|
8543
8719
|
"from deduped",
|
|
8544
8720
|
"on conflict (partner_dedupe_key) do update set",
|
|
8545
|
-
` ${updateAssignments(insertColumns, ["partner_dedupe_key"])}
|
|
8721
|
+
` ${updateAssignments(insertColumns, ["partner_dedupe_key"])};`,
|
|
8722
|
+
echo("[materialize] Partners materialization completed.")
|
|
8546
8723
|
].join("\n");
|
|
8547
8724
|
}
|
|
8548
8725
|
function materializeSimplesSql() {
|
|
8549
8726
|
const columns = simplesLayout.fields.map((field) => field.columnName);
|
|
8550
8727
|
return [
|
|
8551
|
-
"
|
|
8728
|
+
echo("[materialize] Materializing simples options..."),
|
|
8552
8729
|
"with source as (",
|
|
8553
8730
|
" select",
|
|
8554
8731
|
` ${columns.map((column) => `source.${column}`).join(",\n ")},`,
|
|
@@ -8562,7 +8739,8 @@ function materializeSimplesSql() {
|
|
|
8562
8739
|
`select ${columns.join(", ")}`,
|
|
8563
8740
|
"from deduped",
|
|
8564
8741
|
"on conflict (cnpj_root) do update set",
|
|
8565
|
-
` ${updateAssignments(columns, ["cnpj_root"])}
|
|
8742
|
+
` ${updateAssignments(columns, ["cnpj_root"])};`,
|
|
8743
|
+
echo("[materialize] Simples options materialization completed.")
|
|
8566
8744
|
].join("\n");
|
|
8567
8745
|
}
|
|
8568
8746
|
function copyDomainSql(dataset, files) {
|
|
@@ -8572,12 +8750,20 @@ function copyDomainSql(dataset, files) {
|
|
|
8572
8750
|
const columns = datasetColumns(dataset);
|
|
8573
8751
|
const tempTable = `tmp_hybrid_${dataset}`;
|
|
8574
8752
|
const lines = [
|
|
8575
|
-
|
|
8753
|
+
echo(`[load-domains] Loading ${dataset} lookup data...`),
|
|
8576
8754
|
`drop table if exists ${tempTable};`,
|
|
8577
8755
|
`create temporary table ${tempTable} (code text, description text);`
|
|
8578
8756
|
];
|
|
8579
|
-
for (const file of files) {
|
|
8580
|
-
lines.push(
|
|
8757
|
+
for (const [index, file] of files.entries()) {
|
|
8758
|
+
lines.push(
|
|
8759
|
+
echo(
|
|
8760
|
+
`[load-domains] Loading ${dataset} file ${index + 1} of ${files.length}: ${file.relativePath}`
|
|
8761
|
+
),
|
|
8762
|
+
csvCopyCommand(tempTable, columns, file.absolutePath),
|
|
8763
|
+
echo(
|
|
8764
|
+
`[load-domains] Loaded ${dataset} file ${index + 1} of ${files.length}.`
|
|
8765
|
+
)
|
|
8766
|
+
);
|
|
8581
8767
|
}
|
|
8582
8768
|
lines.push(
|
|
8583
8769
|
`insert into ${dataset} (${columns.join(", ")})`,
|
|
@@ -8598,12 +8784,17 @@ function copyStagingSql(dataset, files) {
|
|
|
8598
8784
|
return [];
|
|
8599
8785
|
}
|
|
8600
8786
|
const columns = datasetColumns(dataset);
|
|
8601
|
-
|
|
8602
|
-
|
|
8603
|
-
|
|
8604
|
-
(
|
|
8605
|
-
|
|
8606
|
-
|
|
8787
|
+
const lines = [echo(`[load-${dataset}] Loading ${dataset} staging data...`)];
|
|
8788
|
+
for (const [index, file] of files.entries()) {
|
|
8789
|
+
lines.push(
|
|
8790
|
+
echo(
|
|
8791
|
+
`[load-${dataset}] Loading file ${index + 1} of ${files.length}: ${file.relativePath}`
|
|
8792
|
+
),
|
|
8793
|
+
csvCopyCommand(tableName, columns, file.absolutePath),
|
|
8794
|
+
echo(`[load-${dataset}] Loaded file ${index + 1} of ${files.length}.`)
|
|
8795
|
+
);
|
|
8796
|
+
}
|
|
8797
|
+
return lines;
|
|
8607
8798
|
}
|
|
8608
8799
|
function csvFilesByDataset(files) {
|
|
8609
8800
|
const grouped = {};
|
|
@@ -8629,7 +8820,9 @@ function rawTableName(dataset) {
|
|
|
8629
8820
|
function createRawTempTableSql(dataset) {
|
|
8630
8821
|
const columns = DATASET_LAYOUTS[dataset].fields.map((field) => ` ${quoteIdentifier(field.columnName)} text`).join(",\n");
|
|
8631
8822
|
return [
|
|
8823
|
+
"set client_min_messages to warning;",
|
|
8632
8824
|
`drop table if exists ${rawTableName(dataset)};`,
|
|
8825
|
+
"reset client_min_messages;",
|
|
8633
8826
|
`create temporary table ${rawTableName(dataset)} (`,
|
|
8634
8827
|
columns,
|
|
8635
8828
|
");"
|
|
@@ -8711,11 +8904,21 @@ function rawDomainSql(dataset, files) {
|
|
|
8711
8904
|
const columns = layout.fields.map((field) => field.columnName);
|
|
8712
8905
|
const tableName = rawTableName(dataset);
|
|
8713
8906
|
const lines = [
|
|
8714
|
-
|
|
8907
|
+
echo(
|
|
8908
|
+
`[load-domains] Loading ${dataset} lookup data directly from sanitized Receita files...`
|
|
8909
|
+
),
|
|
8715
8910
|
createRawTempTableSql(dataset)
|
|
8716
8911
|
];
|
|
8717
|
-
for (const file of files) {
|
|
8718
|
-
lines.push(
|
|
8912
|
+
for (const [index, file] of files.entries()) {
|
|
8913
|
+
lines.push(
|
|
8914
|
+
echo(
|
|
8915
|
+
`[load-domains] Loading ${dataset} file ${index + 1} of ${files.length}: ${file.relativePath}`
|
|
8916
|
+
),
|
|
8917
|
+
receitaCopyCommand(tableName, columns, file.absolutePath),
|
|
8918
|
+
echo(
|
|
8919
|
+
`[load-domains] Loaded ${dataset} file ${index + 1} of ${files.length}.`
|
|
8920
|
+
)
|
|
8921
|
+
);
|
|
8719
8922
|
}
|
|
8720
8923
|
lines.push(
|
|
8721
8924
|
`insert into ${dataset} (${columns.join(", ")})`,
|
|
@@ -8725,7 +8928,8 @@ function rawDomainSql(dataset, files) {
|
|
|
8725
8928
|
`from ${tableName}`,
|
|
8726
8929
|
"where nullif(btrim(code), '') is not null",
|
|
8727
8930
|
"order by code",
|
|
8728
|
-
"on conflict (code) do update set description = excluded.description;"
|
|
8931
|
+
"on conflict (code) do update set description = excluded.description;",
|
|
8932
|
+
echo(`[load-domains] ${dataset} lookup data completed.`)
|
|
8729
8933
|
);
|
|
8730
8934
|
return lines;
|
|
8731
8935
|
}
|
|
@@ -8744,70 +8948,363 @@ function rawStagingSql(dataset, files) {
|
|
|
8744
8948
|
const expressions = layout.fields.map(
|
|
8745
8949
|
(field) => ` ${fieldExpression(dataset, field, alias)} as ${field.columnName}`
|
|
8746
8950
|
);
|
|
8951
|
+
const stepName = loadStepName(dataset);
|
|
8747
8952
|
const lines = [
|
|
8748
|
-
|
|
8953
|
+
echo(
|
|
8954
|
+
`[${stepName}] Loading ${dataset} staging data directly from sanitized Receita files...`
|
|
8955
|
+
),
|
|
8956
|
+
`truncate table ${targetTable} restart identity;`,
|
|
8749
8957
|
createRawTempTableSql(dataset)
|
|
8750
8958
|
];
|
|
8751
|
-
for (const file of files) {
|
|
8752
|
-
lines.push(
|
|
8959
|
+
for (const [index, file] of files.entries()) {
|
|
8960
|
+
lines.push(
|
|
8961
|
+
echo(
|
|
8962
|
+
`[${stepName}] Loading file ${index + 1} of ${files.length}: ${file.relativePath}`
|
|
8963
|
+
),
|
|
8964
|
+
receitaCopyCommand(tableName, columns, file.absolutePath),
|
|
8965
|
+
echo(`[${stepName}] Loaded file ${index + 1} of ${files.length}.`)
|
|
8966
|
+
);
|
|
8753
8967
|
}
|
|
8754
8968
|
lines.push(
|
|
8969
|
+
echo(
|
|
8970
|
+
`[${stepName}] Transforming ${dataset} raw rows into ${targetTable}...`
|
|
8971
|
+
),
|
|
8755
8972
|
`insert into ${targetTable} (${columns.join(", ")})`,
|
|
8756
8973
|
"select",
|
|
8757
8974
|
expressions.join(",\n"),
|
|
8758
|
-
`from ${tableName} ${alias}
|
|
8975
|
+
`from ${tableName} ${alias};`,
|
|
8976
|
+
echo(`[${stepName}] ${dataset} staging load completed.`)
|
|
8759
8977
|
);
|
|
8760
8978
|
return lines;
|
|
8761
8979
|
}
|
|
8762
|
-
function
|
|
8763
|
-
|
|
8764
|
-
|
|
8765
|
-
|
|
8766
|
-
|
|
8767
|
-
|
|
8768
|
-
|
|
8769
|
-
|
|
8980
|
+
function loadStepName(dataset) {
|
|
8981
|
+
switch (dataset) {
|
|
8982
|
+
case "companies":
|
|
8983
|
+
return "load-companies";
|
|
8984
|
+
case "establishments":
|
|
8985
|
+
return "load-establishments";
|
|
8986
|
+
case "partners":
|
|
8987
|
+
return "load-partners";
|
|
8988
|
+
case "simples_options":
|
|
8989
|
+
return "load-simples";
|
|
8990
|
+
default:
|
|
8991
|
+
return `load-${dataset}`;
|
|
8992
|
+
}
|
|
8993
|
+
}
|
|
8994
|
+
function scriptHeader(title, sourceEncoding) {
|
|
8995
|
+
return [
|
|
8996
|
+
`-- ${title}`,
|
|
8997
|
+
"-- Generated by cnpj-db-loader postgres generate-script.",
|
|
8770
8998
|
"\\set ON_ERROR_STOP on",
|
|
8771
|
-
|
|
8772
|
-
|
|
8773
|
-
|
|
8774
|
-
|
|
8775
|
-
|
|
8776
|
-
|
|
8777
|
-
"truncate table staging_companies restart identity;",
|
|
8778
|
-
"truncate table staging_establishments restart identity;",
|
|
8779
|
-
"truncate table staging_partners restart identity;",
|
|
8780
|
-
"truncate table staging_simples_options restart identity;",
|
|
8999
|
+
...sourceEncoding ? [
|
|
9000
|
+
echo(
|
|
9001
|
+
`Using source file encoding ${sourceEncoding} for psql copy operations...`
|
|
9002
|
+
),
|
|
9003
|
+
`set client_encoding to ${quoteSqlLiteral(sourceEncoding)};`
|
|
9004
|
+
] : [],
|
|
8781
9005
|
""
|
|
8782
9006
|
];
|
|
8783
|
-
|
|
8784
|
-
|
|
9007
|
+
}
|
|
9008
|
+
function wrapTransaction(lines, mode, shouldWrap) {
|
|
9009
|
+
if (!shouldWrap || mode !== "phase") {
|
|
9010
|
+
return [...lines];
|
|
8785
9011
|
}
|
|
8786
|
-
|
|
8787
|
-
|
|
9012
|
+
return ["begin;", "", ...lines, "", "commit;"];
|
|
9013
|
+
}
|
|
9014
|
+
function buildStepScript(title, body, input, wrapInPhaseTransaction) {
|
|
9015
|
+
return [
|
|
9016
|
+
...scriptHeader(title, input.sourceEncoding),
|
|
9017
|
+
...wrapTransaction(body, input.transactionMode, wrapInPhaseTransaction),
|
|
9018
|
+
""
|
|
9019
|
+
].join("\n");
|
|
9020
|
+
}
|
|
9021
|
+
function includeSet(input) {
|
|
9022
|
+
const selected = new Set(input.include);
|
|
9023
|
+
if (input.skipIndexes) {
|
|
9024
|
+
selected.delete("indexes");
|
|
8788
9025
|
}
|
|
8789
|
-
|
|
8790
|
-
|
|
9026
|
+
if (input.skipAnalyze) {
|
|
9027
|
+
selected.delete("analyze");
|
|
9028
|
+
}
|
|
9029
|
+
return selected;
|
|
9030
|
+
}
|
|
9031
|
+
function hasAnyFinalMaterialization(selected) {
|
|
9032
|
+
return selected.has("companies") || selected.has("establishments") || selected.has("partners") || selected.has("simples");
|
|
9033
|
+
}
|
|
9034
|
+
function materializeSql(selected) {
|
|
9035
|
+
const lines = [echo("[materialize] Starting final table materialization...")];
|
|
9036
|
+
if (selected.has("companies")) {
|
|
9037
|
+
lines.push(materializeCompaniesSql(), "");
|
|
9038
|
+
}
|
|
9039
|
+
if (selected.has("establishments")) {
|
|
9040
|
+
lines.push(materializeEstablishmentsSql(), "");
|
|
9041
|
+
}
|
|
9042
|
+
if (selected.has("partners")) {
|
|
9043
|
+
lines.push(materializePartnersSql(), "");
|
|
9044
|
+
}
|
|
9045
|
+
if (selected.has("simples")) {
|
|
9046
|
+
lines.push(materializeSimplesSql(), "");
|
|
9047
|
+
}
|
|
9048
|
+
lines.push(echo("[materialize] Final table materialization completed."));
|
|
9049
|
+
return lines;
|
|
9050
|
+
}
|
|
9051
|
+
function indexesSql() {
|
|
9052
|
+
return [
|
|
9053
|
+
echo(
|
|
9054
|
+
"[indexes] No additional index operations are generated in this beta."
|
|
9055
|
+
),
|
|
9056
|
+
"-- Indexes are expected to be managed by the schema generated by cnpj-db-loader schema generate.",
|
|
9057
|
+
"-- A future fast-rebuild mode may generate DROP/CREATE INDEX operations here."
|
|
9058
|
+
];
|
|
9059
|
+
}
|
|
9060
|
+
function analyzeSql(selected) {
|
|
9061
|
+
const tables = /* @__PURE__ */ new Set();
|
|
9062
|
+
if (selected.has("companies")) {
|
|
9063
|
+
tables.add("companies");
|
|
9064
|
+
}
|
|
9065
|
+
if (selected.has("establishments")) {
|
|
9066
|
+
tables.add("establishments");
|
|
9067
|
+
}
|
|
9068
|
+
if (selected.has("secondary-cnaes")) {
|
|
9069
|
+
tables.add("establishment_secondary_cnaes");
|
|
9070
|
+
}
|
|
9071
|
+
if (selected.has("partners")) {
|
|
9072
|
+
tables.add("partners");
|
|
9073
|
+
}
|
|
9074
|
+
if (selected.has("simples")) {
|
|
9075
|
+
tables.add("simples_options");
|
|
9076
|
+
}
|
|
9077
|
+
if (selected.has("domains")) {
|
|
9078
|
+
for (const dataset of DOMAIN_DATASETS) {
|
|
9079
|
+
tables.add(dataset);
|
|
9080
|
+
}
|
|
9081
|
+
}
|
|
9082
|
+
return [
|
|
9083
|
+
echo("[analyze] Refreshing planner statistics..."),
|
|
9084
|
+
...[...tables].map((table) => `analyze ${table};`),
|
|
9085
|
+
echo("[analyze] Planner statistics refreshed.")
|
|
9086
|
+
];
|
|
8791
9087
|
}
|
|
8792
|
-
function
|
|
9088
|
+
function step(name, file, dependsOn, included) {
|
|
9089
|
+
return { name, file, dependsOn, included };
|
|
9090
|
+
}
|
|
9091
|
+
function generatePostgresDirectScriptFiles(input) {
|
|
8793
9092
|
const grouped = directFilesByDataset(input.files);
|
|
8794
|
-
const
|
|
8795
|
-
|
|
9093
|
+
const selected = includeSet(input);
|
|
9094
|
+
if (!DOMAIN_DATASETS.some((dataset) => (grouped[dataset] ?? []).length > 0)) {
|
|
9095
|
+
selected.delete("domains");
|
|
9096
|
+
}
|
|
9097
|
+
if ((grouped.companies ?? []).length === 0) {
|
|
9098
|
+
selected.delete("companies");
|
|
9099
|
+
}
|
|
9100
|
+
if ((grouped.establishments ?? []).length === 0) {
|
|
9101
|
+
selected.delete("establishments");
|
|
9102
|
+
selected.delete("secondary-cnaes");
|
|
9103
|
+
}
|
|
9104
|
+
if ((grouped.partners ?? []).length === 0) {
|
|
9105
|
+
selected.delete("partners");
|
|
9106
|
+
}
|
|
9107
|
+
if ((grouped.simples_options ?? []).length === 0) {
|
|
9108
|
+
selected.delete("simples");
|
|
9109
|
+
}
|
|
9110
|
+
const scripts = {};
|
|
9111
|
+
const steps = [];
|
|
9112
|
+
const setupIncluded = true;
|
|
9113
|
+
steps.push(step("setup", "setup.sql", [], setupIncluded));
|
|
9114
|
+
scripts["setup.sql"] = [
|
|
9115
|
+
...scriptHeader(
|
|
9116
|
+
"CNPJ DB Loader PostgreSQL direct import setup",
|
|
9117
|
+
input.sourceEncoding
|
|
9118
|
+
),
|
|
9119
|
+
echo("[setup] Preparing PostgreSQL direct import session..."),
|
|
9120
|
+
"-- The database schema must be applied before running these scripts.",
|
|
9121
|
+
"-- This setup script configures the psql session used by the generated orchestrator.",
|
|
9122
|
+
echo("[setup] Setup completed."),
|
|
9123
|
+
""
|
|
9124
|
+
].join("\n");
|
|
9125
|
+
const domainsIncluded = selected.has("domains") && DOMAIN_DATASETS.some((dataset) => (grouped[dataset] ?? []).length > 0);
|
|
9126
|
+
steps.push(
|
|
9127
|
+
step("load-domains", "load-domains.sql", ["setup"], domainsIncluded)
|
|
9128
|
+
);
|
|
9129
|
+
if (domainsIncluded) {
|
|
9130
|
+
const lines = [echo("[load-domains] Starting domain tables load...")];
|
|
9131
|
+
for (const dataset of DOMAIN_DATASETS) {
|
|
9132
|
+
lines.push(...rawDomainSql(dataset, grouped[dataset] ?? []), "");
|
|
9133
|
+
}
|
|
9134
|
+
lines.push(echo("[load-domains] Domain tables load completed."));
|
|
9135
|
+
scripts["load-domains.sql"] = buildStepScript(
|
|
9136
|
+
"CNPJ DB Loader PostgreSQL direct import domains step",
|
|
9137
|
+
lines,
|
|
9138
|
+
input,
|
|
9139
|
+
true
|
|
9140
|
+
);
|
|
9141
|
+
}
|
|
9142
|
+
const datasetSteps = [
|
|
9143
|
+
{
|
|
9144
|
+
dataset: "companies",
|
|
9145
|
+
name: "load-companies",
|
|
9146
|
+
file: "load-companies.sql",
|
|
9147
|
+
include: "companies"
|
|
9148
|
+
},
|
|
9149
|
+
{
|
|
9150
|
+
dataset: "establishments",
|
|
9151
|
+
name: "load-establishments",
|
|
9152
|
+
file: "load-establishments.sql",
|
|
9153
|
+
include: "establishments"
|
|
9154
|
+
},
|
|
9155
|
+
{
|
|
9156
|
+
dataset: "partners",
|
|
9157
|
+
name: "load-partners",
|
|
9158
|
+
file: "load-partners.sql",
|
|
9159
|
+
include: "partners"
|
|
9160
|
+
},
|
|
9161
|
+
{
|
|
9162
|
+
dataset: "simples_options",
|
|
9163
|
+
name: "load-simples",
|
|
9164
|
+
file: "load-simples.sql",
|
|
9165
|
+
include: "simples"
|
|
9166
|
+
}
|
|
9167
|
+
];
|
|
9168
|
+
for (const item of datasetSteps) {
|
|
9169
|
+
const files = grouped[item.dataset] ?? [];
|
|
9170
|
+
const included = selected.has(item.include) && files.length > 0;
|
|
9171
|
+
steps.push(step(item.name, item.file, ["setup"], included));
|
|
9172
|
+
if (included) {
|
|
9173
|
+
scripts[item.file] = buildStepScript(
|
|
9174
|
+
`CNPJ DB Loader PostgreSQL direct import ${item.name} step`,
|
|
9175
|
+
rawStagingSql(item.dataset, files),
|
|
9176
|
+
input,
|
|
9177
|
+
true
|
|
9178
|
+
);
|
|
9179
|
+
}
|
|
9180
|
+
}
|
|
9181
|
+
const materializeIncluded = hasAnyFinalMaterialization(selected);
|
|
9182
|
+
steps.push(
|
|
9183
|
+
step(
|
|
9184
|
+
"materialize",
|
|
9185
|
+
"materialize.sql",
|
|
9186
|
+
datasetSteps.filter((item) => selected.has(item.include)).map((item) => item.name),
|
|
9187
|
+
materializeIncluded
|
|
9188
|
+
)
|
|
9189
|
+
);
|
|
9190
|
+
if (materializeIncluded) {
|
|
9191
|
+
scripts["materialize.sql"] = buildStepScript(
|
|
9192
|
+
"CNPJ DB Loader PostgreSQL direct import materialization step",
|
|
9193
|
+
materializeSql(selected),
|
|
9194
|
+
input,
|
|
9195
|
+
true
|
|
9196
|
+
);
|
|
9197
|
+
}
|
|
9198
|
+
const secondaryIncluded = selected.has("secondary-cnaes") && selected.has("establishments");
|
|
9199
|
+
steps.push(
|
|
9200
|
+
step(
|
|
9201
|
+
"materialize-secondary-cnaes",
|
|
9202
|
+
"materialize-secondary-cnaes.sql",
|
|
9203
|
+
["load-establishments"],
|
|
9204
|
+
secondaryIncluded
|
|
9205
|
+
)
|
|
9206
|
+
);
|
|
9207
|
+
if (secondaryIncluded) {
|
|
9208
|
+
scripts["materialize-secondary-cnaes.sql"] = buildStepScript(
|
|
9209
|
+
"CNPJ DB Loader PostgreSQL direct import secondary CNAEs step",
|
|
9210
|
+
[materializeSecondaryCnaesSql()],
|
|
9211
|
+
input,
|
|
9212
|
+
true
|
|
9213
|
+
);
|
|
9214
|
+
}
|
|
9215
|
+
const indexesIncluded = selected.has("indexes");
|
|
9216
|
+
steps.push(
|
|
9217
|
+
step(
|
|
9218
|
+
"indexes",
|
|
9219
|
+
"indexes.sql",
|
|
9220
|
+
materializeIncluded ? ["materialize"] : ["setup"],
|
|
9221
|
+
indexesIncluded
|
|
9222
|
+
)
|
|
9223
|
+
);
|
|
9224
|
+
if (indexesIncluded) {
|
|
9225
|
+
scripts["indexes.sql"] = buildStepScript(
|
|
9226
|
+
"CNPJ DB Loader PostgreSQL direct import indexes step",
|
|
9227
|
+
indexesSql(),
|
|
9228
|
+
input,
|
|
9229
|
+
true
|
|
9230
|
+
);
|
|
9231
|
+
}
|
|
9232
|
+
const analyzeIncluded = selected.has("analyze");
|
|
9233
|
+
const analyzeDependencies = [
|
|
9234
|
+
...domainsIncluded ? ["load-domains"] : [],
|
|
9235
|
+
...materializeIncluded ? ["materialize"] : [],
|
|
9236
|
+
...secondaryIncluded ? ["materialize-secondary-cnaes"] : []
|
|
9237
|
+
];
|
|
9238
|
+
steps.push(
|
|
9239
|
+
step(
|
|
9240
|
+
"analyze",
|
|
9241
|
+
"analyze.sql",
|
|
9242
|
+
analyzeDependencies.length > 0 ? analyzeDependencies : ["setup"],
|
|
9243
|
+
analyzeIncluded
|
|
9244
|
+
)
|
|
9245
|
+
);
|
|
9246
|
+
if (analyzeIncluded) {
|
|
9247
|
+
scripts["analyze.sql"] = buildStepScript(
|
|
9248
|
+
"CNPJ DB Loader PostgreSQL direct import analyze step",
|
|
9249
|
+
analyzeSql(selected),
|
|
9250
|
+
input,
|
|
9251
|
+
true
|
|
9252
|
+
);
|
|
9253
|
+
}
|
|
9254
|
+
const orchestratorLines = [
|
|
9255
|
+
"-- CNPJ DB Loader direct PostgreSQL import orchestrator",
|
|
8796
9256
|
"-- Generated from sanitized Receita files by cnpj-db-loader postgres generate-script.",
|
|
8797
|
-
"-- This path avoids rewriting the dataset into a second CSV tree.",
|
|
8798
9257
|
"-- Execute with psql, for example:",
|
|
8799
|
-
'-- psql "postgres://postgres:postgres@localhost:5432/cnpj" -f import-postgres-direct.sql',
|
|
9258
|
+
'-- psql -d "postgres://postgres:postgres@localhost:5432/cnpj" -f import-postgres-direct.sql',
|
|
8800
9259
|
"",
|
|
8801
9260
|
"\\set ON_ERROR_STOP on",
|
|
8802
|
-
|
|
9261
|
+
echo(
|
|
9262
|
+
`Using source file encoding ${input.sourceEncoding} for psql copy operations...`
|
|
9263
|
+
),
|
|
8803
9264
|
`set client_encoding to ${quoteSqlLiteral(input.sourceEncoding)};`,
|
|
8804
|
-
|
|
9265
|
+
echo(
|
|
9266
|
+
`Starting CNPJ DB Loader direct PostgreSQL import using transaction mode ${input.transactionMode}...`
|
|
9267
|
+
),
|
|
9268
|
+
"",
|
|
9269
|
+
...input.transactionMode === "single" ? ["begin;", ""] : []
|
|
9270
|
+
];
|
|
9271
|
+
for (const name of STEP_ORDER) {
|
|
9272
|
+
const currentStep = steps.find((item) => item.name === name);
|
|
9273
|
+
if (!currentStep?.included) {
|
|
9274
|
+
continue;
|
|
9275
|
+
}
|
|
9276
|
+
orchestratorLines.push(
|
|
9277
|
+
echo(
|
|
9278
|
+
`[orchestrator] Running ${currentStep.name} (${currentStep.file})...`
|
|
9279
|
+
),
|
|
9280
|
+
`\\ir ${currentStep.file}`,
|
|
9281
|
+
echo(`[orchestrator] Completed ${currentStep.name}.`),
|
|
9282
|
+
""
|
|
9283
|
+
);
|
|
9284
|
+
}
|
|
9285
|
+
orchestratorLines.push(
|
|
9286
|
+
...input.transactionMode === "single" ? ["commit;", ""] : [],
|
|
9287
|
+
echo("CNPJ DB Loader hybrid PostgreSQL import completed."),
|
|
9288
|
+
""
|
|
9289
|
+
);
|
|
9290
|
+
scripts["import-postgres-direct.sql"] = orchestratorLines.join("\n");
|
|
9291
|
+
return { scripts, steps };
|
|
9292
|
+
}
|
|
9293
|
+
function generatePostgresDirectImportScript(input) {
|
|
9294
|
+
const grouped = csvFilesByDataset(input.files);
|
|
9295
|
+
const lines = [
|
|
9296
|
+
"-- CNPJ DB Loader hybrid PostgreSQL import script",
|
|
9297
|
+
"-- Generated from PostgreSQL-ready CSV files exported by cnpj-db-loader postgres export-csv.",
|
|
9298
|
+
"-- Execute with psql, for example:",
|
|
9299
|
+
'-- psql -d "postgres://postgres:postgres@localhost:5432/cnpj" -f import-postgres-direct.sql',
|
|
9300
|
+
"",
|
|
9301
|
+
"\\set ON_ERROR_STOP on",
|
|
9302
|
+
echo("Starting CNPJ DB Loader hybrid PostgreSQL import..."),
|
|
8805
9303
|
"",
|
|
8806
9304
|
"begin;",
|
|
8807
9305
|
"",
|
|
8808
9306
|
"-- Keep the final schema and seed data managed by sql/schema.sql.",
|
|
8809
|
-
"-- This script
|
|
8810
|
-
"-- transforms values inside PostgreSQL, resets staging tables and upserts final data.",
|
|
9307
|
+
"-- This script only resets staging tables and then upserts final data.",
|
|
8811
9308
|
"truncate table staging_companies restart identity;",
|
|
8812
9309
|
"truncate table staging_establishments restart identity;",
|
|
8813
9310
|
"truncate table staging_partners restart identity;",
|
|
@@ -8815,10 +9312,10 @@ function generatePostgresSanitizedDirectImportScript(input) {
|
|
|
8815
9312
|
""
|
|
8816
9313
|
];
|
|
8817
9314
|
for (const dataset of DOMAIN_DATASETS) {
|
|
8818
|
-
lines.push(...
|
|
9315
|
+
lines.push(...copyDomainSql(dataset, grouped[dataset] ?? []), "");
|
|
8819
9316
|
}
|
|
8820
9317
|
for (const dataset of STAGING_DATASETS) {
|
|
8821
|
-
lines.push(...
|
|
9318
|
+
lines.push(...copyStagingSql(dataset, grouped[dataset] ?? []), "");
|
|
8822
9319
|
}
|
|
8823
9320
|
lines.push(...materializationAndAnalyzeSql());
|
|
8824
9321
|
return lines.join("\n");
|
|
@@ -8829,11 +9326,13 @@ function materializationAndAnalyzeSql() {
|
|
|
8829
9326
|
"",
|
|
8830
9327
|
materializeEstablishmentsSql(),
|
|
8831
9328
|
"",
|
|
9329
|
+
materializeSecondaryCnaesSql(),
|
|
9330
|
+
"",
|
|
8832
9331
|
materializePartnersSql(),
|
|
8833
9332
|
"",
|
|
8834
9333
|
materializeSimplesSql(),
|
|
8835
9334
|
"",
|
|
8836
|
-
"
|
|
9335
|
+
echo("Refreshing planner statistics..."),
|
|
8837
9336
|
"analyze companies;",
|
|
8838
9337
|
"analyze establishments;",
|
|
8839
9338
|
"analyze establishment_secondary_cnaes;",
|
|
@@ -8848,7 +9347,7 @@ function materializationAndAnalyzeSql() {
|
|
|
8848
9347
|
"",
|
|
8849
9348
|
"commit;",
|
|
8850
9349
|
"",
|
|
8851
|
-
"
|
|
9350
|
+
echo("CNPJ DB Loader hybrid PostgreSQL import completed."),
|
|
8852
9351
|
""
|
|
8853
9352
|
];
|
|
8854
9353
|
}
|
|
@@ -9055,6 +9554,29 @@ async function exportPostgresCsvDataset(inputPath, options = {}) {
|
|
|
9055
9554
|
import { mkdir as mkdir9, stat as stat7, writeFile as writeFile6 } from "fs/promises";
|
|
9056
9555
|
import path17 from "path";
|
|
9057
9556
|
var DEFAULT_SOURCE_ENCODING = "UTF8";
|
|
9557
|
+
var DEFAULT_TRANSACTION_MODE = "single";
|
|
9558
|
+
var ALL_INCLUDE_TARGETS = [
|
|
9559
|
+
"domains",
|
|
9560
|
+
"companies",
|
|
9561
|
+
"establishments",
|
|
9562
|
+
"partners",
|
|
9563
|
+
"simples",
|
|
9564
|
+
"secondary-cnaes",
|
|
9565
|
+
"indexes",
|
|
9566
|
+
"analyze"
|
|
9567
|
+
];
|
|
9568
|
+
var INCLUDE_TARGETS_BY_DATASET = {
|
|
9569
|
+
companies: "companies",
|
|
9570
|
+
establishments: "establishments",
|
|
9571
|
+
partners: "partners",
|
|
9572
|
+
simples_options: "simples",
|
|
9573
|
+
countries: "domains",
|
|
9574
|
+
cities: "domains",
|
|
9575
|
+
partner_qualifications: "domains",
|
|
9576
|
+
legal_natures: "domains",
|
|
9577
|
+
reasons: "domains",
|
|
9578
|
+
cnaes: "domains"
|
|
9579
|
+
};
|
|
9058
9580
|
function defaultPostgresDirectOutputPath(inputPath) {
|
|
9059
9581
|
const baseName = path17.basename(inputPath);
|
|
9060
9582
|
if (baseName.toLowerCase() === "sanitized") {
|
|
@@ -9063,7 +9585,7 @@ function defaultPostgresDirectOutputPath(inputPath) {
|
|
|
9063
9585
|
return path17.join(path17.dirname(inputPath), `${baseName}-postgres-direct`);
|
|
9064
9586
|
}
|
|
9065
9587
|
function inferNextStep5(scriptPath) {
|
|
9066
|
-
return `psql "postgres://postgres:postgres@localhost:5432/cnpj" -f ${scriptPath.replace(/\\/g, "/")}`;
|
|
9588
|
+
return `psql -d "postgres://postgres:postgres@localhost:5432/cnpj" -f ${scriptPath.replace(/\\/g, "/")}`;
|
|
9067
9589
|
}
|
|
9068
9590
|
function normalizeSourceEncoding(value) {
|
|
9069
9591
|
const encoding = (value ?? DEFAULT_SOURCE_ENCODING).trim();
|
|
@@ -9074,6 +9596,41 @@ function normalizeSourceEncoding(value) {
|
|
|
9074
9596
|
}
|
|
9075
9597
|
return encoding.toUpperCase();
|
|
9076
9598
|
}
|
|
9599
|
+
function normalizeTransactionMode(value) {
|
|
9600
|
+
const mode = value ?? DEFAULT_TRANSACTION_MODE;
|
|
9601
|
+
if (!["single", "phase", "none"].includes(mode)) {
|
|
9602
|
+
throw new ValidationError(
|
|
9603
|
+
`Invalid transaction mode: ${String(value)}. Use single, phase or none.`
|
|
9604
|
+
);
|
|
9605
|
+
}
|
|
9606
|
+
return mode;
|
|
9607
|
+
}
|
|
9608
|
+
function isIncludeTarget(value) {
|
|
9609
|
+
return ALL_INCLUDE_TARGETS.includes(value);
|
|
9610
|
+
}
|
|
9611
|
+
function normalizeIncludeTargets(include, dataset) {
|
|
9612
|
+
if (include && include.length > 0) {
|
|
9613
|
+
const unique = [...new Set(include)];
|
|
9614
|
+
const invalid = unique.filter((item) => !isIncludeTarget(item));
|
|
9615
|
+
if (invalid.length > 0) {
|
|
9616
|
+
throw new ValidationError(
|
|
9617
|
+
`Invalid include target(s): ${invalid.join(", ")}. Use ${ALL_INCLUDE_TARGETS.join(", ")}.`
|
|
9618
|
+
);
|
|
9619
|
+
}
|
|
9620
|
+
return unique;
|
|
9621
|
+
}
|
|
9622
|
+
if (dataset) {
|
|
9623
|
+
const target = INCLUDE_TARGETS_BY_DATASET[dataset];
|
|
9624
|
+
if (!target) {
|
|
9625
|
+
return [];
|
|
9626
|
+
}
|
|
9627
|
+
if (target === "establishments") {
|
|
9628
|
+
return ["establishments", "secondary-cnaes", "analyze"];
|
|
9629
|
+
}
|
|
9630
|
+
return [target, "analyze"];
|
|
9631
|
+
}
|
|
9632
|
+
return [...ALL_INCLUDE_TARGETS];
|
|
9633
|
+
}
|
|
9077
9634
|
async function generatePostgresDirectScript(inputPath, options = {}) {
|
|
9078
9635
|
if (options.dataset && !isImportDatasetType(options.dataset)) {
|
|
9079
9636
|
throw new ValidationError(`Unsupported dataset type: ${options.dataset}.`);
|
|
@@ -9089,6 +9646,10 @@ async function generatePostgresDirectScript(inputPath, options = {}) {
|
|
|
9089
9646
|
options.outputPath ?? defaultPostgresDirectOutputPath(validatedPath)
|
|
9090
9647
|
);
|
|
9091
9648
|
const sourceEncoding = normalizeSourceEncoding(options.sourceEncoding);
|
|
9649
|
+
const transactionMode = normalizeTransactionMode(options.transactionMode);
|
|
9650
|
+
const include = normalizeIncludeTargets(options.include, options.dataset);
|
|
9651
|
+
const skipIndexes = options.skipIndexes ?? false;
|
|
9652
|
+
const skipAnalyze = options.skipAnalyze ?? false;
|
|
9092
9653
|
const inspected = await inspectFiles(validatedPath);
|
|
9093
9654
|
const recognizedFiles = inspected.entries.filter((entry) => entry.entryKind === "file").flatMap((entry) => {
|
|
9094
9655
|
if (!isImportDatasetType(entry.inferredType)) {
|
|
@@ -9116,7 +9677,11 @@ async function generatePostgresDirectScript(inputPath, options = {}) {
|
|
|
9116
9677
|
outputPath,
|
|
9117
9678
|
totalFiles: recognizedFiles.length,
|
|
9118
9679
|
datasets,
|
|
9119
|
-
sourceEncoding
|
|
9680
|
+
sourceEncoding,
|
|
9681
|
+
transactionMode,
|
|
9682
|
+
include,
|
|
9683
|
+
skipIndexes,
|
|
9684
|
+
skipAnalyze
|
|
9120
9685
|
});
|
|
9121
9686
|
await mkdir9(outputPath, { recursive: true });
|
|
9122
9687
|
const sourceFiles = [];
|
|
@@ -9152,11 +9717,21 @@ async function generatePostgresDirectScript(inputPath, options = {}) {
|
|
|
9152
9717
|
}
|
|
9153
9718
|
const scriptName = options.scriptName ?? "import-postgres-direct.sql";
|
|
9154
9719
|
const scriptPath = path17.join(outputPath, scriptName);
|
|
9155
|
-
const
|
|
9720
|
+
const generated = generatePostgresDirectScriptFiles({
|
|
9156
9721
|
files: sourceFiles,
|
|
9157
|
-
sourceEncoding
|
|
9722
|
+
sourceEncoding,
|
|
9723
|
+
transactionMode,
|
|
9724
|
+
include,
|
|
9725
|
+
skipIndexes,
|
|
9726
|
+
skipAnalyze
|
|
9158
9727
|
});
|
|
9159
|
-
|
|
9728
|
+
const scriptFiles = [];
|
|
9729
|
+
for (const [fileName, script] of Object.entries(generated.scripts)) {
|
|
9730
|
+
const outputFileName = fileName === "import-postgres-direct.sql" ? scriptName : fileName;
|
|
9731
|
+
const outputFilePath = path17.join(outputPath, outputFileName);
|
|
9732
|
+
await writeFile6(outputFilePath, script, "utf8");
|
|
9733
|
+
scriptFiles.push(outputFilePath);
|
|
9734
|
+
}
|
|
9160
9735
|
const manifestPath = path17.join(outputPath, "manifest.json");
|
|
9161
9736
|
const summaryDatasets = [...summariesByDataset.values()].sort(
|
|
9162
9737
|
(left, right) => IMPORT_ORDER.indexOf(left.dataset) - IMPORT_ORDER.indexOf(right.dataset)
|
|
@@ -9168,13 +9743,19 @@ async function generatePostgresDirectScript(inputPath, options = {}) {
|
|
|
9168
9743
|
const manifest = {
|
|
9169
9744
|
generatedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
9170
9745
|
mode: "direct-sanitized-script",
|
|
9746
|
+
transactionMode,
|
|
9747
|
+
include,
|
|
9748
|
+
skipIndexes,
|
|
9749
|
+
skipAnalyze,
|
|
9171
9750
|
inputPath: path17.resolve(inputPath),
|
|
9172
9751
|
validatedPath,
|
|
9173
9752
|
outputPath,
|
|
9174
9753
|
scriptPath,
|
|
9754
|
+
scriptFiles,
|
|
9175
9755
|
sourceEncoding,
|
|
9176
9756
|
totalFiles: sourceFiles.length,
|
|
9177
9757
|
totalBytes,
|
|
9758
|
+
steps: generated.steps,
|
|
9178
9759
|
datasets: summaryDatasets
|
|
9179
9760
|
};
|
|
9180
9761
|
await writeFile6(
|
|
@@ -9197,15 +9778,19 @@ async function generatePostgresDirectScript(inputPath, options = {}) {
|
|
|
9197
9778
|
scriptPath,
|
|
9198
9779
|
manifestPath,
|
|
9199
9780
|
sourceEncoding,
|
|
9781
|
+
transactionMode,
|
|
9200
9782
|
totalFiles: sourceFiles.length,
|
|
9201
9783
|
totalBytes,
|
|
9202
9784
|
datasets: summaryDatasets,
|
|
9785
|
+
scriptFiles,
|
|
9786
|
+
steps: generated.steps,
|
|
9203
9787
|
warnings: [
|
|
9204
9788
|
...validation.ok ? [] : validation.errors,
|
|
9205
9789
|
"This script imports sanitized Receita files directly with psql \\copy. It avoids rewriting the full dataset into a second CSV tree.",
|
|
9206
|
-
"The generated
|
|
9790
|
+
"The generated scripts expect the database schema generated by cnpj-db-loader to be applied before execution.",
|
|
9207
9791
|
"The direct PostgreSQL script now defaults to UTF8 because the sanitize command writes clean UTF-8 files.",
|
|
9208
|
-
"Use --source-encoding WIN1252 or LATIN1 only when generating scripts for legacy sanitized files produced by older loader versions."
|
|
9792
|
+
"Use --source-encoding WIN1252 or LATIN1 only when generating scripts for legacy sanitized files produced by older loader versions.",
|
|
9793
|
+
"The generated import is now modular. Use import-postgres-direct.sql as the orchestrator or run individual phase scripts manually."
|
|
9209
9794
|
],
|
|
9210
9795
|
nextStep: inferNextStep5(scriptPath)
|
|
9211
9796
|
};
|
|
@@ -9213,7 +9798,6 @@ async function generatePostgresDirectScript(inputPath, options = {}) {
|
|
|
9213
9798
|
export {
|
|
9214
9799
|
AppError,
|
|
9215
9800
|
DEFAULT_FEDERAL_REVENUE_DOWNLOAD_ROOT,
|
|
9216
|
-
DEFAULT_FEDERAL_REVENUE_SHARE_TOKEN,
|
|
9217
9801
|
DEFAULT_FEDERAL_REVENUE_USER_AGENT,
|
|
9218
9802
|
DEFAULT_FEDERAL_REVENUE_WEBDAV_URL,
|
|
9219
9803
|
FEDERAL_REVENUE_CONTROL_DIR,
|
|
@@ -9247,6 +9831,7 @@ export {
|
|
|
9247
9831
|
getAllLayouts,
|
|
9248
9832
|
getCurrentFederalRevenueReference,
|
|
9249
9833
|
getFederalRevenueControlDirectory,
|
|
9834
|
+
getFederalRevenueEffectiveConfig,
|
|
9250
9835
|
getFederalRevenueManifestPath,
|
|
9251
9836
|
getFederalRevenueStatus,
|
|
9252
9837
|
getFederalRevenueSyncLockPath,
|
|
@@ -9263,10 +9848,13 @@ export {
|
|
|
9263
9848
|
materializeImportedData,
|
|
9264
9849
|
prettyJson,
|
|
9265
9850
|
readDatabaseConfig,
|
|
9851
|
+
readFederalRevenueEffectiveConfig,
|
|
9266
9852
|
readFederalRevenueManifest,
|
|
9267
9853
|
resetDefaultDbUrl,
|
|
9854
|
+
resetFederalRevenueConfig,
|
|
9268
9855
|
resolveDatabaseUrl,
|
|
9269
9856
|
resolveDbUrl,
|
|
9857
|
+
resolveFederalRevenueClientOptions,
|
|
9270
9858
|
resolveFederalRevenueReference,
|
|
9271
9859
|
resolveInputMode,
|
|
9272
9860
|
resolveSchemaProfile,
|
|
@@ -9276,6 +9864,7 @@ export {
|
|
|
9276
9864
|
safeWriteText,
|
|
9277
9865
|
sanitizeInputDirectory,
|
|
9278
9866
|
setDefaultDbUrl,
|
|
9867
|
+
setFederalRevenueConfigValue,
|
|
9279
9868
|
showQuarantineRow,
|
|
9280
9869
|
syncFederalRevenueDataset,
|
|
9281
9870
|
testDatabaseConnection,
|