@lexbuild/fr 1.14.0 → 1.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/dist/index.d.ts +104 -6
- package/dist/index.js +256 -102
- package/dist/index.js.map +1 -1
- package/package.json +13 -13
package/dist/index.js
CHANGED
|
@@ -77,8 +77,8 @@ var FR_INLINE_ELEMENTS = /* @__PURE__ */ new Set([
|
|
|
77
77
|
var FR_EMPHASIS_MAP = {
|
|
78
78
|
"01": "bold",
|
|
79
79
|
"02": "italic",
|
|
80
|
-
"03": "
|
|
81
|
-
// bold italic in print —
|
|
80
|
+
"03": "italic",
|
|
81
|
+
// bold italic in print — FR uses T="03" for case names, citations, and publication titles which render as italic
|
|
82
82
|
"04": "italic",
|
|
83
83
|
// italic in headings
|
|
84
84
|
"05": "italic",
|
|
@@ -289,12 +289,18 @@ var FrASTBuilder = class {
|
|
|
289
289
|
return;
|
|
290
290
|
}
|
|
291
291
|
if (name === FR_FTREF_ELEMENT) {
|
|
292
|
-
const
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
292
|
+
const parentFrame = this.stack[this.stack.length - 1];
|
|
293
|
+
if (parentFrame?.kind === "content" && parentFrame.node?.type === "content") {
|
|
294
|
+
const contentNode = parentFrame.node;
|
|
295
|
+
for (let i = contentNode.children.length - 1; i >= 0; i--) {
|
|
296
|
+
const child = contentNode.children[i];
|
|
297
|
+
if (child?.type === "inline" && child.inlineType === "sup") {
|
|
298
|
+
child.inlineType = "footnoteRef";
|
|
299
|
+
break;
|
|
300
|
+
}
|
|
301
|
+
}
|
|
302
|
+
}
|
|
303
|
+
this.ignoredContainerDepth = 1;
|
|
298
304
|
return;
|
|
299
305
|
}
|
|
300
306
|
if (FR_NOTE_ELEMENTS.has(name)) {
|
|
@@ -414,25 +420,29 @@ var FrASTBuilder = class {
|
|
|
414
420
|
}
|
|
415
421
|
if (frame.kind === "content" && frame.node?.type === "content") {
|
|
416
422
|
const contentNode = frame.node;
|
|
417
|
-
|
|
423
|
+
const normalized = text.replace(/\s+/g, " ");
|
|
424
|
+
if (normalized && normalized !== " ") {
|
|
418
425
|
contentNode.children.push({
|
|
419
426
|
type: "inline",
|
|
420
427
|
inlineType: "text",
|
|
421
|
-
text
|
|
428
|
+
text: normalized
|
|
422
429
|
});
|
|
423
430
|
}
|
|
424
431
|
return;
|
|
425
432
|
}
|
|
426
433
|
if (frame.kind === "inline" && frame.node?.type === "inline") {
|
|
427
434
|
const inlineNode = frame.node;
|
|
435
|
+
const normalized = text.replace(/\s+/g, " ");
|
|
428
436
|
if (inlineNode.children) {
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
437
|
+
if (normalized && normalized !== " ") {
|
|
438
|
+
inlineNode.children.push({
|
|
439
|
+
type: "inline",
|
|
440
|
+
inlineType: "text",
|
|
441
|
+
text: normalized
|
|
442
|
+
});
|
|
443
|
+
}
|
|
434
444
|
} else {
|
|
435
|
-
inlineNode.text = (inlineNode.text ?? "") +
|
|
445
|
+
inlineNode.text = (inlineNode.text ?? "") + normalized;
|
|
436
446
|
}
|
|
437
447
|
return;
|
|
438
448
|
}
|
|
@@ -590,7 +600,8 @@ var FrASTBuilder = class {
|
|
|
590
600
|
} else if (elementName === "B") {
|
|
591
601
|
inlineType = "bold";
|
|
592
602
|
} else if (elementName === "SU") {
|
|
593
|
-
|
|
603
|
+
const insideFootnote = this.findFrame("note") !== void 0;
|
|
604
|
+
inlineType = insideFootnote ? "footnoteRef" : "sup";
|
|
594
605
|
} else if (elementName === "FR") {
|
|
595
606
|
inlineType = "text";
|
|
596
607
|
} else if (elementName === "E") {
|
|
@@ -887,9 +898,26 @@ var FrASTBuilder = class {
|
|
|
887
898
|
const frame = this.popFrame(FR_FRDOC_ELEMENT);
|
|
888
899
|
if (!frame || frame.kind !== "frdoc") return;
|
|
889
900
|
const text = frame.textBuffer.trim();
|
|
890
|
-
const
|
|
891
|
-
if (
|
|
892
|
-
this.currentDocMeta.documentNumber =
|
|
901
|
+
const docMatch = /FR\s+Doc\.\s+([\w-]+)/i.exec(text);
|
|
902
|
+
if (docMatch) {
|
|
903
|
+
this.currentDocMeta.documentNumber = docMatch[1];
|
|
904
|
+
}
|
|
905
|
+
const dateMatch = /Filed\s+(\d{1,2})-(\d{1,2})-(\d{2})\b/.exec(text);
|
|
906
|
+
if (dateMatch) {
|
|
907
|
+
const [, mmStr, ddStr, yyStr] = dateMatch;
|
|
908
|
+
const mm = parseInt(mmStr ?? "0", 10);
|
|
909
|
+
const dd = parseInt(ddStr ?? "0", 10);
|
|
910
|
+
const yy = parseInt(yyStr ?? "0", 10);
|
|
911
|
+
const fullYear = yy < 50 ? 2e3 + yy : 1900 + yy;
|
|
912
|
+
const filed = new Date(fullYear, mm - 1, dd);
|
|
913
|
+
if (filed.getMonth() !== mm - 1 || filed.getDate() !== dd) {
|
|
914
|
+
return;
|
|
915
|
+
}
|
|
916
|
+
filed.setDate(filed.getDate() + 1);
|
|
917
|
+
const pubYear = filed.getFullYear();
|
|
918
|
+
const pubMonth = String(filed.getMonth() + 1).padStart(2, "0");
|
|
919
|
+
const pubDay = String(filed.getDate()).padStart(2, "0");
|
|
920
|
+
this.currentDocMeta.publicationDate = `${pubYear}-${pubMonth}-${pubDay}`;
|
|
893
921
|
}
|
|
894
922
|
}
|
|
895
923
|
// ── Private helpers: Stack navigation ──
|
|
@@ -958,7 +986,7 @@ function normalizeDocumentType(apiType) {
|
|
|
958
986
|
function buildFrFrontmatter(node, _context, xmlMeta, jsonMeta) {
|
|
959
987
|
const documentNumber = jsonMeta?.document_number ?? xmlMeta.documentNumber ?? "";
|
|
960
988
|
const subject = jsonMeta?.title ?? xmlMeta.subject ?? node.heading ?? "";
|
|
961
|
-
const publicationDate = jsonMeta?.publication_date ?? "";
|
|
989
|
+
const publicationDate = jsonMeta?.publication_date ?? xmlMeta.publicationDate ?? "";
|
|
962
990
|
const documentType = jsonMeta ? normalizeDocumentType(jsonMeta.type) : xmlMeta.documentTypeNormalized;
|
|
963
991
|
let agencies;
|
|
964
992
|
if (jsonMeta?.agencies && jsonMeta.agencies.length > 0) {
|
|
@@ -1056,64 +1084,30 @@ import {
|
|
|
1056
1084
|
var FR_DOC_TYPE_SET = new Set(FR_DOCUMENT_TYPE_KEYS);
|
|
1057
1085
|
async function convertFrDocuments(options) {
|
|
1058
1086
|
const xmlFiles = await discoverXmlFiles(options.input, options.from, options.to);
|
|
1059
|
-
|
|
1087
|
+
let documentsConverted = 0;
|
|
1060
1088
|
let totalTokenEstimate = 0;
|
|
1061
1089
|
let peakMemoryBytes = 0;
|
|
1062
1090
|
const linkResolver = createLinkResolver();
|
|
1063
|
-
|
|
1091
|
+
let filesProcessed = 0;
|
|
1064
1092
|
for (const xmlPath of xmlFiles) {
|
|
1093
|
+
let collected;
|
|
1065
1094
|
try {
|
|
1066
|
-
|
|
1067
|
-
parsedFiles.set(xmlPath, collected);
|
|
1095
|
+
collected = await parseXmlFile(xmlPath);
|
|
1068
1096
|
} catch (err) {
|
|
1069
1097
|
console.warn(
|
|
1070
1098
|
`Warning: Failed to parse ${xmlPath}: ${err instanceof Error ? err.message : String(err)}. Skipping.`
|
|
1071
1099
|
);
|
|
1100
|
+
continue;
|
|
1072
1101
|
}
|
|
1073
|
-
}
|
|
1074
|
-
for (const [, collected] of parsedFiles) {
|
|
1075
1102
|
for (const doc of collected) {
|
|
1076
1103
|
if (options.types && options.types.length > 0) {
|
|
1077
1104
|
if (!FR_DOC_TYPE_SET.has(doc.xmlMeta.documentType) || !options.types.includes(doc.xmlMeta.documentType)) {
|
|
1078
1105
|
continue;
|
|
1079
1106
|
}
|
|
1080
1107
|
}
|
|
1081
|
-
if (
|
|
1082
|
-
|
|
1083
|
-
|
|
1084
|
-
doc.publicationDate,
|
|
1085
|
-
options.output
|
|
1086
|
-
);
|
|
1087
|
-
linkResolver.register(doc.node.identifier, outputPath);
|
|
1088
|
-
}
|
|
1089
|
-
}
|
|
1090
|
-
}
|
|
1091
|
-
if (options.dryRun) {
|
|
1092
|
-
let count = 0;
|
|
1093
|
-
for (const [, collected] of parsedFiles) {
|
|
1094
|
-
for (const doc of collected) {
|
|
1095
|
-
if (options.types && options.types.length > 0) {
|
|
1096
|
-
if (!FR_DOC_TYPE_SET.has(doc.xmlMeta.documentType) || !options.types.includes(doc.xmlMeta.documentType)) {
|
|
1097
|
-
continue;
|
|
1098
|
-
}
|
|
1099
|
-
}
|
|
1100
|
-
count++;
|
|
1101
|
-
}
|
|
1102
|
-
}
|
|
1103
|
-
return {
|
|
1104
|
-
documentsConverted: count,
|
|
1105
|
-
files: [],
|
|
1106
|
-
totalTokenEstimate: 0,
|
|
1107
|
-
peakMemoryBytes: 0,
|
|
1108
|
-
dryRun: true
|
|
1109
|
-
};
|
|
1110
|
-
}
|
|
1111
|
-
for (const [, collected] of parsedFiles) {
|
|
1112
|
-
for (const doc of collected) {
|
|
1113
|
-
if (options.types && options.types.length > 0) {
|
|
1114
|
-
if (!FR_DOC_TYPE_SET.has(doc.xmlMeta.documentType) || !options.types.includes(doc.xmlMeta.documentType)) {
|
|
1115
|
-
continue;
|
|
1116
|
-
}
|
|
1108
|
+
if (options.dryRun) {
|
|
1109
|
+
documentsConverted++;
|
|
1110
|
+
continue;
|
|
1117
1111
|
}
|
|
1118
1112
|
const outputPath = buildFrOutputPath(
|
|
1119
1113
|
doc.documentNumber,
|
|
@@ -1128,21 +1122,28 @@ async function convertFrDocuments(options) {
|
|
|
1128
1122
|
});
|
|
1129
1123
|
await mkdir(dirname(outputPath), { recursive: true });
|
|
1130
1124
|
await writeFile(outputPath, markdown, "utf-8");
|
|
1131
|
-
|
|
1132
|
-
|
|
1133
|
-
totalTokenEstimate += tokenEstimate;
|
|
1125
|
+
documentsConverted++;
|
|
1126
|
+
totalTokenEstimate += Math.round(markdown.length / 4);
|
|
1134
1127
|
const mem = process.memoryUsage().rss;
|
|
1135
1128
|
if (mem > peakMemoryBytes) {
|
|
1136
1129
|
peakMemoryBytes = mem;
|
|
1137
1130
|
}
|
|
1138
1131
|
}
|
|
1132
|
+
filesProcessed++;
|
|
1133
|
+
options.onProgress?.({
|
|
1134
|
+
documentsConverted,
|
|
1135
|
+
filesProcessed,
|
|
1136
|
+
totalFiles: xmlFiles.length,
|
|
1137
|
+
currentFile: xmlPath
|
|
1138
|
+
});
|
|
1139
1139
|
}
|
|
1140
1140
|
return {
|
|
1141
|
-
documentsConverted
|
|
1142
|
-
files,
|
|
1141
|
+
documentsConverted,
|
|
1142
|
+
files: [],
|
|
1143
|
+
// Don't accumulate 750k+ file paths in memory
|
|
1143
1144
|
totalTokenEstimate,
|
|
1144
1145
|
peakMemoryBytes,
|
|
1145
|
-
dryRun:
|
|
1146
|
+
dryRun: options.dryRun
|
|
1146
1147
|
};
|
|
1147
1148
|
}
|
|
1148
1149
|
async function parseXmlFile(xmlPath) {
|
|
@@ -1241,9 +1242,13 @@ async function walkDir(dir, results) {
|
|
|
1241
1242
|
}
|
|
1242
1243
|
}
|
|
1243
1244
|
/**
 * Infer a `YYYY-MM-DD` date from the location of an XML file.
 *
 * Two layouts are recognised, checked in this order:
 *   1. a file named `FR-YYYY-MM-DD.xml` — the date embedded in the name is returned;
 *   2. a file stored directly under `YYYY/MM/` — the first day of that month is
 *      returned, because the path carries no day component.
 *
 * Both `/` and `\` are accepted as directory separators so that paths produced
 * on Windows are handled the same way as POSIX paths.
 *
 * @param {string} filePath - Path to an `.xml` file.
 * @returns {string} The inferred date, or `""` when neither layout matches.
 */
function inferDateFromPath(filePath) {
  const bulkMatch = /FR-(\d{4})-(\d{2})-(\d{2})\.xml$/.exec(filePath);
  if (bulkMatch) {
    return `${bulkMatch[1]}-${bulkMatch[2]}-${bulkMatch[3]}`;
  }
  // `[\\/]` instead of a literal `/`: a backslash-separated path must match too.
  const perDocMatch = /(\d{4})[\\/](\d{2})[\\/][^\\/]+\.xml$/.exec(filePath);
  if (perDocMatch) {
    return `${perDocMatch[1]}-${perDocMatch[2]}-01`;
  }
  return "";
}
|
|
@@ -1256,7 +1261,7 @@ import { pipeline } from "stream/promises";
|
|
|
1256
1261
|
import { Readable } from "stream";
|
|
1257
1262
|
var FR_API_BASE = "https://www.federalregister.gov/api/v1";
|
|
1258
1263
|
var PER_PAGE = 200;
|
|
1259
|
-
var
|
|
1264
|
+
var DEFAULT_CONCURRENCY = 10;
|
|
1260
1265
|
var MAX_RETRIES = 2;
|
|
1261
1266
|
var RETRY_BASE_DELAY_MS = 2e3;
|
|
1262
1267
|
var API_FIELDS = [
|
|
@@ -1299,7 +1304,7 @@ function buildFrApiListUrl(from, to, page, types) {
|
|
|
1299
1304
|
}
|
|
1300
1305
|
async function downloadFrDocuments(options) {
|
|
1301
1306
|
const to = options.to ?? (/* @__PURE__ */ new Date()).toISOString().slice(0, 10);
|
|
1302
|
-
const
|
|
1307
|
+
const concurrency = options.concurrency ?? DEFAULT_CONCURRENCY;
|
|
1303
1308
|
const files = [];
|
|
1304
1309
|
const failed = [];
|
|
1305
1310
|
let totalBytes = 0;
|
|
@@ -1308,6 +1313,7 @@ async function downloadFrDocuments(options) {
|
|
|
1308
1313
|
const chunks = buildMonthChunks(options.from, to);
|
|
1309
1314
|
for (const chunk of chunks) {
|
|
1310
1315
|
if (options.limit !== void 0 && files.length >= options.limit) break;
|
|
1316
|
+
const chunkDocs = [];
|
|
1311
1317
|
let page = 1;
|
|
1312
1318
|
let hasMore = true;
|
|
1313
1319
|
while (hasMore) {
|
|
@@ -1319,39 +1325,37 @@ async function downloadFrDocuments(options) {
|
|
|
1319
1325
|
`Unexpected API response for ${listUrl}: missing or invalid 'count' field. The FederalRegister.gov API may have changed its response format.`
|
|
1320
1326
|
);
|
|
1321
1327
|
}
|
|
1322
|
-
if (page === 1
|
|
1323
|
-
totalDocumentsFound
|
|
1328
|
+
if (page === 1) {
|
|
1329
|
+
totalDocumentsFound += data.count;
|
|
1324
1330
|
}
|
|
1325
1331
|
const results = data.results ?? [];
|
|
1326
1332
|
for (const doc of results) {
|
|
1327
|
-
if (options.limit !== void 0 && files.length >= options.limit) {
|
|
1328
|
-
hasMore = false;
|
|
1329
|
-
break;
|
|
1330
|
-
}
|
|
1331
|
-
options.onProgress?.({
|
|
1332
|
-
documentsDownloaded: files.length,
|
|
1333
|
-
totalDocuments: totalDocumentsFound,
|
|
1334
|
-
currentDocument: doc.document_number,
|
|
1335
|
-
currentChunk: `${chunk.from.slice(0, 7)}`
|
|
1336
|
-
});
|
|
1337
1333
|
if (!doc.full_text_xml_url) {
|
|
1338
1334
|
skipped++;
|
|
1339
1335
|
continue;
|
|
1340
1336
|
}
|
|
1341
|
-
|
|
1342
|
-
const result = await downloadSingleDocument(doc, options.output, fetchDelay);
|
|
1343
|
-
files.push(result);
|
|
1344
|
-
totalBytes += result.size;
|
|
1345
|
-
} catch (err) {
|
|
1346
|
-
failed.push({
|
|
1347
|
-
documentNumber: doc.document_number,
|
|
1348
|
-
error: err instanceof Error ? err.message : String(err)
|
|
1349
|
-
});
|
|
1350
|
-
}
|
|
1337
|
+
chunkDocs.push(doc);
|
|
1351
1338
|
}
|
|
1352
|
-
hasMore =
|
|
1339
|
+
hasMore = page < (data.total_pages ?? 0);
|
|
1353
1340
|
page++;
|
|
1354
1341
|
}
|
|
1342
|
+
const remaining = options.limit !== void 0 ? options.limit - files.length : chunkDocs.length;
|
|
1343
|
+
const docsToDownload = chunkDocs.slice(0, remaining);
|
|
1344
|
+
const chunkLabel = chunk.from.slice(0, 7);
|
|
1345
|
+
await downloadPool(docsToDownload, concurrency, options.output, (doc, result, error) => {
|
|
1346
|
+
if (result) {
|
|
1347
|
+
files.push(result);
|
|
1348
|
+
totalBytes += result.size;
|
|
1349
|
+
} else if (error) {
|
|
1350
|
+
failed.push({ documentNumber: doc.document_number, error });
|
|
1351
|
+
}
|
|
1352
|
+
options.onProgress?.({
|
|
1353
|
+
documentsDownloaded: files.length,
|
|
1354
|
+
totalDocuments: totalDocumentsFound,
|
|
1355
|
+
currentDocument: doc.document_number,
|
|
1356
|
+
currentChunk: chunkLabel
|
|
1357
|
+
});
|
|
1358
|
+
});
|
|
1355
1359
|
}
|
|
1356
1360
|
return {
|
|
1357
1361
|
documentsDownloaded: files.length,
|
|
@@ -1371,9 +1375,27 @@ async function downloadSingleFrDocument(documentNumber, output) {
|
|
|
1371
1375
|
`Invalid API response for document ${documentNumber}: missing document_number or publication_date`
|
|
1372
1376
|
);
|
|
1373
1377
|
}
|
|
1374
|
-
return downloadSingleDocument(doc, output
|
|
1378
|
+
return downloadSingleDocument(doc, output);
|
|
1375
1379
|
}
|
|
1376
|
-
async function
|
|
1380
|
+
/**
 * Download every entry of `docs` using a bounded number of concurrent workers.
 *
 * Each worker repeatedly claims the next unprocessed index and awaits
 * `downloadSingleDocument(doc, outputDir)` for it. After each attempt
 * `onComplete` is invoked exactly once for that document.
 *
 * @param {Array<object>} docs - Documents to download; a falsy entry stops the worker that reaches it.
 * @param {number} concurrency - Desired number of parallel workers. Values below 1
 *   (or non-numeric values) are treated as 1 so that the pool never silently does nothing.
 * @param {string} outputDir - Forwarded unchanged to `downloadSingleDocument`.
 * @param {(doc: object, result: object|null, error: string|null) => void} onComplete -
 *   Called with the download result on success (`error` is `null`) or with the
 *   error message on failure (`result` is `null`).
 * @returns {Promise<void>} Resolves once every worker has drained the queue.
 *   Rejects if `onComplete` itself throws.
 */
async function downloadPool(docs, concurrency, outputDir, onComplete) {
  // A concurrency of 0, a negative number or NaN would otherwise create zero
  // workers and leave every document undownloaded without any error.
  const requested = Number(concurrency);
  const limit = Number.isNaN(requested) ? 1 : Math.max(1, Math.floor(requested));
  const workerCount = Math.min(limit, docs.length);

  let nextIndex = 0;
  async function worker() {
    while (nextIndex < docs.length) {
      const doc = docs[nextIndex++];
      if (!doc) break;
      let result = null;
      let error = null;
      try {
        result = await downloadSingleDocument(doc, outputDir);
      } catch (err) {
        error = err instanceof Error ? err.message : String(err);
      }
      // Reported outside the try block: an exception thrown by the callback
      // must not be mistaken for a failed download and reported a second time.
      onComplete(doc, result, error);
    }
  }

  await Promise.all(Array.from({ length: workerCount }, () => worker()));
}
|
|
1398
|
+
async function downloadSingleDocument(doc, outputDir) {
|
|
1377
1399
|
if (!doc.document_number || !doc.publication_date) {
|
|
1378
1400
|
throw new Error(
|
|
1379
1401
|
`Invalid document in API response: missing document_number or publication_date`
|
|
@@ -1389,9 +1411,6 @@ async function downloadSingleDocument(doc, outputDir, fetchDelay) {
|
|
|
1389
1411
|
await mkdir2(dirname2(xmlPath), { recursive: true });
|
|
1390
1412
|
const jsonContent = JSON.stringify(doc, null, 2);
|
|
1391
1413
|
await fsWriteFile(jsonPath, jsonContent, "utf-8");
|
|
1392
|
-
if (fetchDelay > 0) {
|
|
1393
|
-
await sleep(fetchDelay);
|
|
1394
|
-
}
|
|
1395
1414
|
const xmlResponse = await fetchWithRetry(doc.full_text_xml_url);
|
|
1396
1415
|
if (!xmlResponse.body) {
|
|
1397
1416
|
throw new Error(`No response body for ${doc.document_number} XML`);
|
|
@@ -1466,6 +1485,138 @@ async function fetchWithRetry(url, attempt = 0) {
|
|
|
1466
1485
|
/**
 * Pause for a given duration.
 *
 * @param {number} ms - Delay in milliseconds, passed straight to `setTimeout`.
 * @returns {Promise<void>} A promise that resolves once the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
|
1488
|
+
|
|
1489
|
+
// src/govinfo-downloader.ts
|
|
1490
|
+
import { createWriteStream as createWriteStream2 } from "fs";
|
|
1491
|
+
import { mkdir as mkdir3, stat as stat3 } from "fs/promises";
|
|
1492
|
+
import { dirname as dirname3, join as join3 } from "path";
|
|
1493
|
+
import { pipeline as pipeline2 } from "stream/promises";
|
|
1494
|
+
import { Readable as Readable2 } from "stream";
|
|
1495
|
+
var GOVINFO_BASE = "https://www.govinfo.gov/content/pkg";
|
|
1496
|
+
var DEFAULT_CONCURRENCY2 = 10;
|
|
1497
|
+
var MAX_RETRIES2 = 2;
|
|
1498
|
+
var RETRY_BASE_DELAY_MS2 = 2e3;
|
|
1499
|
+
/**
 * Build the URL of the XML file for one date under `GOVINFO_BASE`.
 *
 * The result has the shape `<GOVINFO_BASE>/FR-<date>/xml/FR-<date>.xml`.
 *
 * @param {string} date - Date text inserted verbatim into the URL (callers pass `YYYY-MM-DD`).
 * @returns {string} The assembled URL.
 */
function buildGovinfoFrUrl(date) {
  const packageId = `FR-${date}`;
  return [GOVINFO_BASE, packageId, "xml", `${packageId}.xml`].join("/");
}
|
|
1502
|
+
/**
 * Compute where the XML file for one date is stored on disk.
 *
 * The file goes to `<outputDir>/bulk/<YYYY>/FR-<date>.xml`, where `<YYYY>` is
 * the first four characters of `date`.
 *
 * @param {string} date - Date string; its first four characters name the year directory.
 * @param {string} outputDir - Root output directory.
 * @returns {string} The joined file path.
 */
function buildGovinfoBulkPath(date, outputDir) {
  const fileName = `FR-${date}.xml`;
  const yearDir = date.slice(0, 4);
  return join3(outputDir, "bulk", yearDir, fileName);
}
|
|
1506
|
+
/**
 * Download one XML file per calendar date in a date range, using a bounded
 * pool of concurrent workers.
 *
 * For every date produced by `generateDateRange(options.from, to)` the URL and
 * target path are derived with `buildGovinfoFrUrl` / `buildGovinfoBulkPath`
 * and handed to `downloadSingleDay`. A `null` result is counted as skipped; a
 * thrown error is logged with `console.warn` and counted as failed, without
 * aborting the remaining dates.
 *
 * @param {object} options
 * @param {string} options.from - First date of the range.
 * @param {string} [options.to] - Last date of the range; defaults to today's UTC date.
 * @param {string} options.output - Root output directory.
 * @param {number} [options.concurrency] - Parallel workers; defaults to
 *   `DEFAULT_CONCURRENCY2`. Values below 1 (or non-numeric) are treated as 1.
 * @param {(progress: object) => void} [options.onProgress] - Invoked before each
 *   date is attempted with the counters at that moment, and once at the end
 *   with `currentDate: "done"`.
 * @returns {Promise<object>} Summary with `filesDownloaded`, `files` (sorted by
 *   date), `totalBytes`, `dateRange`, `skipped` and `failed`.
 */
async function downloadFrBulk(options) {
  const to = options.to ?? (/* @__PURE__ */ new Date()).toISOString().slice(0, 10);
  const dates = generateDateRange(options.from, to);

  // Guard against a concurrency of 0 / negative / NaN, which would otherwise
  // start no workers and report an "empty but successful" run.
  const requested = Number(options.concurrency ?? DEFAULT_CONCURRENCY2);
  const limit = Number.isNaN(requested) ? 1 : Math.max(1, Math.floor(requested));
  const workerCount = Math.min(limit, dates.length);

  const files = [];
  let totalBytes = 0;
  let skipped = 0;
  let failed = 0;
  let nextIndex = 0;

  async function worker() {
    while (nextIndex < dates.length) {
      const date = dates[nextIndex++];
      if (!date) break;
      options.onProgress?.({
        downloaded: files.length,
        totalDays: dates.length,
        skipped,
        failed,
        currentDate: date
      });
      const url = buildGovinfoFrUrl(date);
      const filePath = buildGovinfoBulkPath(date, options.output);
      try {
        const result = await downloadSingleDay(url, filePath, date);
        if (result) {
          files.push(result);
          totalBytes += result.size;
        } else {
          skipped++;
        }
      } catch (err) {
        console.warn(`Warning: Failed to download ${date}: ${err instanceof Error ? err.message : String(err)}`);
        failed++;
      }
    }
  }

  await Promise.all(Array.from({ length: workerCount }, () => worker()));

  // Workers finish in arbitrary order; sort so the summary is deterministic.
  files.sort((a, b) => (a.date < b.date ? -1 : a.date > b.date ? 1 : 0));

  options.onProgress?.({
    downloaded: files.length,
    totalDays: dates.length,
    skipped,
    failed,
    currentDate: "done"
  });
  return {
    filesDownloaded: files.length,
    files,
    totalBytes,
    dateRange: { from: options.from, to },
    skipped,
    failed
  };
}
|
|
1561
|
+
/**
 * Fetch one URL and stream its body to `filePath`.
 *
 * @param {string} url - Address to fetch via `fetchWithRetry2`.
 * @param {string} filePath - Destination file; its parent directory is created if missing.
 * @param {string} date - Echoed back in the result.
 * @returns {Promise<{path: string, date: string, size: number} | null>} Details of
 *   the written file, or `null` when the server answered 404.
 * @throws {Error} When the response is not ok (other than 404) or has no body.
 */
async function downloadSingleDay(url, filePath, date) {
  const response = await fetchWithRetry2(url);

  if (response.status === 404 || !response.ok) {
    // The body will not be read on these paths; cancel it so the underlying
    // connection is released instead of lingering until garbage collection.
    try {
      await response.body?.cancel();
    } catch {
      // Best effort only — a failed cancel must not mask the real outcome.
    }
    if (response.status === 404) {
      return null;
    }
    throw new Error(`HTTP ${response.status} for ${url}`);
  }

  if (!response.body) {
    throw new Error(`No response body for ${url}`);
  }

  await mkdir3(dirname3(filePath), { recursive: true });
  const dest = createWriteStream2(filePath);
  await pipeline2(Readable2.fromWeb(response.body), dest);

  // Size is read back from disk rather than trusted from response headers.
  const fileStat = await stat3(filePath);
  return {
    path: filePath,
    date,
    size: Number(fileStat.size)
  };
}
|
|
1582
|
+
/**
 * List every calendar date from `from` to `to`, inclusive, as `YYYY-MM-DD` strings.
 *
 * Both bounds are parsed as `<date>T12:00:00Z` and stepped with UTC accessors,
 * so the result does not depend on the local time zone.
 *
 * @param {string} from - First date (`YYYY-MM-DD`).
 * @param {string} to - Last date (`YYYY-MM-DD`).
 * @returns {string[]} The dates in ascending order; empty when `from` is after `to`.
 * @throws {RangeError} When either bound cannot be parsed as a date. Without this
 *   check an unparsable bound makes every comparison false and the function
 *   returns `[]`, which callers cannot tell apart from a legitimately empty range.
 */
function generateDateRange(from, to) {
  const cursor = /* @__PURE__ */ new Date(`${from}T12:00:00Z`);
  const end = /* @__PURE__ */ new Date(`${to}T12:00:00Z`);
  if (Number.isNaN(cursor.getTime())) {
    throw new RangeError(`Invalid start date: ${from}`);
  }
  if (Number.isNaN(end.getTime())) {
    throw new RangeError(`Invalid end date: ${to}`);
  }

  const dates = [];
  for (; cursor <= end; cursor.setUTCDate(cursor.getUTCDate() + 1)) {
    dates.push(cursor.toISOString().slice(0, 10));
  }
  return dates;
}
|
|
1592
|
+
/**
 * `fetch` a URL, retrying transient failures with exponential backoff.
 *
 * - Network errors (a rejected `fetch`) are retried until `MAX_RETRIES2`
 *   retries have been used, then rethrown wrapped with the original as `cause`.
 * - Responses that are ok, or 404, are returned to the caller as-is.
 * - 429 / 503 / 504 are retried; a positive integer `Retry-After` header (in
 *   seconds) takes precedence over the computed backoff.
 * - Any other status, or a retryable status with no retries left, throws.
 *
 * The body of every response that is not returned is cancelled so the
 * connection can be released instead of waiting for garbage collection.
 *
 * @param {string} url - Address to fetch.
 * @param {number} [attempt=0] - Zero-based attempt to start from.
 * @returns {Promise<Response>} A response that is either ok or has status 404.
 * @throws {Error} On exhausted retries or a non-retryable HTTP status.
 */
async function fetchWithRetry2(url, attempt = 0) {
  for (let tryNo = attempt; ; tryNo++) {
    const canRetry = tryNo < MAX_RETRIES2;
    const backoff = RETRY_BASE_DELAY_MS2 * Math.pow(2, tryNo);

    let response;
    try {
      response = await fetch(url);
    } catch (err) {
      if (canRetry) {
        await sleep2(backoff);
        continue;
      }
      throw new Error(
        `Network error after ${MAX_RETRIES2 + 1} attempts for ${url}: ${err instanceof Error ? err.message : String(err)}`,
        { cause: err }
      );
    }

    if (response.ok || response.status === 404) return response;

    // This response is about to be discarded (retried or turned into an
    // error); release its body first.
    try {
      await response.body?.cancel();
    } catch {
      // Best effort only — the status-based outcome below still applies.
    }

    const transient = response.status === 429 || response.status === 503 || response.status === 504;
    if (!transient || !canRetry) {
      throw new Error(`HTTP ${response.status}: ${response.statusText} for ${url}`);
    }

    const retryAfter = response.headers.get("Retry-After");
    const parsedRetry = retryAfter ? Number.parseInt(retryAfter, 10) : NaN;
    const delay = !Number.isNaN(parsedRetry) && parsedRetry > 0 ? parsedRetry * 1e3 : backoff;
    await sleep2(delay);
  }
}
|
|
1617
|
+
/**
 * Pause for a given duration.
 *
 * @param {number} ms - Delay in milliseconds, passed straight to `setTimeout`.
 * @returns {Promise<void>} A promise that resolves once the timer fires.
 */
function sleep2(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
|
1469
1620
|
export {
|
|
1470
1621
|
FR_BLOCK_ELEMENTS,
|
|
1471
1622
|
FR_CONTENT_ELEMENTS,
|
|
@@ -1493,9 +1644,12 @@ export {
|
|
|
1493
1644
|
buildFrDownloadXmlPath,
|
|
1494
1645
|
buildFrFrontmatter,
|
|
1495
1646
|
buildFrOutputPath,
|
|
1647
|
+
buildGovinfoBulkPath,
|
|
1648
|
+
buildGovinfoFrUrl,
|
|
1496
1649
|
buildMonthDir,
|
|
1497
1650
|
buildYearDir,
|
|
1498
1651
|
convertFrDocuments,
|
|
1652
|
+
downloadFrBulk,
|
|
1499
1653
|
downloadFrDocuments,
|
|
1500
1654
|
downloadSingleFrDocument
|
|
1501
1655
|
};
|