@lexbuild/fr 1.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +231 -0
- package/dist/index.d.ts +414 -0
- package/dist/index.js +1502 -0
- package/dist/index.js.map +1 -0
- package/package.json +67 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,1502 @@
|
|
|
1
|
+
// src/fr-elements.ts
|
|
2
|
+
/** XML element names that mark an individual Federal Register document. */
var FR_DOCUMENT_TYPE_KEYS = ["RULE", "PRORULE", "NOTICE", "PRESDOCU"];

/** Set view of FR_DOCUMENT_TYPE_KEYS, used for membership checks. */
var FR_DOCUMENT_ELEMENTS = new Set(FR_DOCUMENT_TYPE_KEYS);

/** Plural wrapper element names (RULES, PRORULES, NOTICES, PRESDOCS). */
var FR_SECTION_CONTAINERS = /* @__PURE__ */ new Set(["RULES", "PRORULES", "NOTICES", "PRESDOCS"]);

/** Maps each document element name to its normalized, snake_case document type. */
var FR_DOCUMENT_TYPE_MAP = {
  RULE: "rule",
  PRORULE: "proposed_rule",
  NOTICE: "notice",
  PRESDOCU: "presidential_document"
};
|
|
16
|
+
/** Preamble section elements; per the entry notes each one wraps a heading (HD) plus paragraphs (P). */
var FR_PREAMBLE_SECTIONS = /* @__PURE__ */ new Set([
  "AGY", // Agency section (HD + P)
  "ACT", // Action section (HD + P)
  "SUM", // Summary section (HD + P)
  "DATES", // Dates section (HD + P)
  "EFFDATE", // Effective date section (HD + P)
  "ADD", // Addresses section (HD + P)
  "FURINF" // Further information section (HD + P)
]);

/** Preamble elements whose text is document metadata rather than body content. */
var FR_PREAMBLE_META_ELEMENTS = /* @__PURE__ */ new Set([
  "AGENCY", // Issuing agency name (attrs: TYPE)
  "SUBAGY", // Sub-agency name
  "CFR", // CFR citation affected (e.g., "10 CFR Part 2")
  "SUBJECT", // Document title/subject
  "DEPDOC", // Department document number
  "RIN" // Regulation Identifier Number
]);

/** Paragraph-level content elements. */
var FR_CONTENT_ELEMENTS = /* @__PURE__ */ new Set([
  "P", // Paragraph
  "FP" // Flush paragraph (attrs: SOURCE for indent level)
]);
|
|
52
|
+
/** Element name used for headings. */
var FR_HEADING_ELEMENT = "HD";

/**
 * Heading depth for each value of the HD element's SOURCE attribute.
 * HD5, HD6 and HD8 all share depth 6, the deepest level in this table.
 */
var FR_HD_SOURCE_TO_DEPTH = {
  HED: 1,
  HD1: 2,
  HD2: 3,
  HD3: 4,
  HD4: 5,
  HD5: 6,
  HD6: 6,
  HD8: 6
};

/** Inline formatting elements that may appear inside text content. */
var FR_INLINE_ELEMENTS = /* @__PURE__ */ new Set([
  "I", // Italic
  "B", // Bold
  "E", // Emphasis (type varies by T attribute)
  "SU", // Superscript / footnote marker
  "FR", // Fraction
  "AC" // Accent/diacritical
]);

/** Inline type chosen for an E element, keyed by the value of its T attribute. */
var FR_EMPHASIS_MAP = {
  "01": "bold",
  "02": "italic",
  "03": "bold", // bold italic in print — treat as bold for Markdown
  "04": "italic", // italic in headings
  "05": "italic", // small caps — render as italic
  "51": "sub", // subscript
  "52": "sub", // subscript
  "54": "sub", // subscript (math)
  "7462": "italic" // special terms (et seq., De minimis)
};
|
|
95
|
+
/** Elements that make up a regulatory-text (REGTEXT) block. */
var FR_REGTEXT_ELEMENTS = /* @__PURE__ */ new Set([
  "REGTEXT", // Regulatory text container (attrs: TITLE, PART)
  "AMDPAR", // Amendment instruction paragraph
  "SECTION", // Section container
  "SECTNO", // Section number designation
  "PART", // Part container within REGTEXT
  "AUTH" // Authority citation in REGTEXT
]);

/** Element name LSTSUB, kept as a standalone constant. */
var FR_LSTSUB_ELEMENT = "LSTSUB";

/** Signature block container and its field elements. */
var FR_SIGNATURE_ELEMENTS = /* @__PURE__ */ new Set([
  "SIG", // Signature block container
  "NAME", // Signer name
  "TITLE", // Signer title
  "DATED" // Date of signature
]);

/** Sub-type elements of presidential documents. */
var FR_PRESIDENTIAL_SUBTYPES = /* @__PURE__ */ new Set([
  "EXECORD", // Executive Order
  "PRMEMO", // Presidential Memorandum
  "PROCLA", // Proclamation
  "DETERM", // Presidential Determination
  "PRNOTICE", // Presidential Notice
  "PRORDER" // Presidential Order
]);

/** Metadata-like elements specific to presidential documents. */
var FR_PRESIDENTIAL_META_ELEMENTS = /* @__PURE__ */ new Set([
  "PSIG", // Presidential signature (initials)
  "PLACE", // Place of issuance
  "TITLE3", // CFR Title 3 marker
  "PRES" // President name
]);

/** Note-like elements. */
var FR_NOTE_ELEMENTS = /* @__PURE__ */ new Set([
  "FTNT", // Footnote
  "EDNOTE", // Editorial note
  "OLNOTE1" // Overlay note
]);

/** Element name FTREF, kept as a standalone constant. */
var FR_FTREF_ELEMENT = "FTREF";

/** Block-level wrapper elements. */
var FR_BLOCK_ELEMENTS = /* @__PURE__ */ new Set([
  "EXTRACT", // Extracted/quoted text
  "EXAMPLE" // Illustrative example
]);
|
|
159
|
+
/** Elements that make up a GPOTABLE table. */
var FR_TABLE_ELEMENTS = /* @__PURE__ */ new Set([
  "GPOTABLE", // Table root
  "TTITLE", // Table title
  "BOXHD", // Header box container
  "CHED", // Column header entry (attrs: H for level)
  "ROW", // Data row (attrs: RUL for horizontal rules)
  "ENT" // Cell entry (attrs: I for indent, A for alignment)
]);

/** Elements listed as ignored. */
var FR_IGNORE_ELEMENTS = /* @__PURE__ */ new Set([
  "CNTNTS", // Table of contents in daily issue
  "GPH", // Graphics (not available in XML)
  "GID" // Graphics ID
]);

/** Elements listed as skipped (page markers, separators, daily-issue metadata). */
var FR_SKIP_ELEMENTS = /* @__PURE__ */ new Set([
  "PRTPAGE", // Page number reference (attrs: P for page)
  "STARS", // Visual separator (****)
  "FILED", // Filing info
  "UNITNAME", // Section name in daily issue
  "VOL", // Volume number (daily issue metadata)
  "NO", // Issue number (daily issue metadata)
  "DATE", // Date (daily issue level — document dates from preamble)
  "NEWPART", // New part container in daily issue
  "PTITLE", // Part title in daily issue
  "PARTNO", // Part number in daily issue
  "PNOTICE" // Part notice text
]);

/** Wrapper elements that produce no node of their own. */
var FR_PASSTHROUGH_ELEMENTS = /* @__PURE__ */ new Set([
  "FEDREG", // Daily issue root element
  "PREAMB", // Preamble — children are handled individually
  "SUPLINF" // Supplementary information — children are handled individually
]);

/** Element name FRDOC, kept as a standalone constant. */
var FR_FRDOC_ELEMENT = "FRDOC";

/** Element name BILCOD, kept as a standalone constant. */
var FR_BILCOD_ELEMENT = "BILCOD";
|
|
215
|
+
|
|
216
|
+
// src/fr-builder.ts
|
|
217
|
+
/**
 * SAX-driven builder that turns Federal Register XML events into AST nodes.
 *
 * Feed it `onOpenElement` / `onCloseElement` / `onText` events in document
 * order. Every time a document element (a member of FR_DOCUMENT_ELEMENTS)
 * closes, the assembled level node is handed to `options.onEmit(node, context)`
 * and a snapshot of its metadata is appended to the list returned by
 * `getDocumentMetas()`.
 */
var FrASTBuilder = class {
  /** Caller-supplied options; `options.onEmit` is called once per closed document. */
  options;
  /** Open frames, innermost last. Each frame has `kind`, `elementName`, `textBuffer` and optionally `node`. */
  stack = [];
  /** Depth inside fully-ignored elements (CNTNTS, GPH) */
  ignoredContainerDepth = 0;
  /** Metadata extracted from current document */
  currentDocMeta = {
    documentType: "",
    documentTypeNormalized: ""
  };
  /** All document metadata collected during parsing */
  documentMetas = [];
  /**
   * @param options Builder options; must provide `onEmit(node, context)`.
   */
  constructor(options) {
    this.options = options;
  }
  /** Get metadata for all documents parsed so far */
  getDocumentMetas() {
    return this.documentMetas;
  }
  /**
   * Handle SAX open element.
   *
   * @param name Element name.
   * @param attrs Attribute map of the element (read by name, e.g. `attrs["SOURCE"]`).
   */
  onOpenElement(name, attrs) {
    // While inside a dropped subtree only track nesting so the matching
    // close events can unwind it.
    if (this.ignoredContainerDepth > 0) {
      this.ignoredContainerDepth++;
      return;
    }
    if (FR_IGNORE_ELEMENTS.has(name)) {
      this.ignoredContainerDepth = 1;
      return;
    }
    // NOTE(review): skip elements are handled exactly like ignore elements,
    // i.e. their whole subtree is dropped. Confirm that this is intended for
    // container-like entries of FR_SKIP_ELEMENTS.
    if (FR_SKIP_ELEMENTS.has(name)) {
      this.ignoredContainerDepth = 1;
      return;
    }
    // Pure wrappers: no frame is pushed, children are handled individually.
    if (FR_PASSTHROUGH_ELEMENTS.has(name)) {
      return;
    }
    if (FR_SECTION_CONTAINERS.has(name)) {
      return;
    }
    if (FR_DOCUMENT_ELEMENTS.has(name)) {
      this.openDocument(name);
      return;
    }
    if (FR_PRESIDENTIAL_SUBTYPES.has(name)) {
      return;
    }
    if (FR_PRESIDENTIAL_META_ELEMENTS.has(name)) {
      // PSIG and PLACE are rendered as content; the other entries only get
      // an "ignore" frame so their direct text is discarded.
      if (name === "PSIG" || name === "PLACE") {
        this.openContent(name);
        return;
      }
      this.stack.push({ kind: "ignore", elementName: name, textBuffer: "" });
      return;
    }
    if (FR_PREAMBLE_META_ELEMENTS.has(name)) {
      this.stack.push({ kind: "preambleMeta", elementName: name, textBuffer: "" });
      return;
    }
    if (FR_PREAMBLE_SECTIONS.has(name)) {
      this.stack.push({ kind: "preambleSection", elementName: name, textBuffer: "" });
      return;
    }
    if (name === FR_HEADING_ELEMENT) {
      this.openHeading(name, attrs);
      return;
    }
    if (FR_CONTENT_ELEMENTS.has(name)) {
      this.openContent(name);
      return;
    }
    if (FR_INLINE_ELEMENTS.has(name)) {
      this.openInline(name, attrs);
      return;
    }
    if (name === FR_FTREF_ELEMENT) {
      // Footnote references carry no `children`; their text is accumulated
      // directly on `node.text` by onText.
      const node = {
        type: "inline",
        inlineType: "footnoteRef",
        idref: attrs["ID"]
      };
      this.stack.push({ kind: "inline", elementName: name, node, textBuffer: "" });
      return;
    }
    if (FR_NOTE_ELEMENTS.has(name)) {
      this.openNote(name);
      return;
    }
    if (FR_REGTEXT_ELEMENTS.has(name)) {
      this.openRegtext(name, attrs);
      return;
    }
    if (name === FR_LSTSUB_ELEMENT) {
      this.stack.push({ kind: "block", elementName: name, textBuffer: "" });
      return;
    }
    if (FR_SIGNATURE_ELEMENTS.has(name)) {
      this.openSignature(name);
      return;
    }
    if (FR_BLOCK_ELEMENTS.has(name)) {
      this.stack.push({ kind: "block", elementName: name, textBuffer: "" });
      return;
    }
    if (FR_TABLE_ELEMENTS.has(name)) {
      this.openTableElement(name, attrs);
      return;
    }
    if (name === FR_FRDOC_ELEMENT) {
      this.stack.push({ kind: "frdoc", elementName: name, textBuffer: "" });
      return;
    }
    if (name === FR_BILCOD_ELEMENT) {
      this.ignoredContainerDepth = 1;
      return;
    }
    // Unknown element: keep the stack balanced, discard its direct text.
    this.stack.push({ kind: "ignore", elementName: name, textBuffer: "" });
  }
  /**
   * Handle SAX close element.
   *
   * @param name Element name; dispatch mirrors onOpenElement.
   */
  onCloseElement(name) {
    if (this.ignoredContainerDepth > 0) {
      this.ignoredContainerDepth--;
      return;
    }
    if (FR_PASSTHROUGH_ELEMENTS.has(name) || FR_SECTION_CONTAINERS.has(name)) {
      return;
    }
    if (FR_PRESIDENTIAL_SUBTYPES.has(name)) {
      return;
    }
    if (FR_DOCUMENT_ELEMENTS.has(name)) {
      this.closeDocument(name);
      return;
    }
    if (FR_PREAMBLE_META_ELEMENTS.has(name)) {
      this.closePreambleMeta(name);
      return;
    }
    if (FR_PREAMBLE_SECTIONS.has(name)) {
      this.popFrame(name);
      return;
    }
    if (name === FR_HEADING_ELEMENT) {
      this.closeHeading(name);
      return;
    }
    if (FR_CONTENT_ELEMENTS.has(name)) {
      this.closeContent(name);
      return;
    }
    if (name === "PSIG" || name === "PLACE") {
      this.closeContent(name);
      return;
    }
    if (FR_INLINE_ELEMENTS.has(name) || name === FR_FTREF_ELEMENT) {
      this.closeInline(name);
      return;
    }
    if (FR_NOTE_ELEMENTS.has(name)) {
      this.closeNote(name);
      return;
    }
    if (FR_REGTEXT_ELEMENTS.has(name)) {
      this.closeRegtext(name);
      return;
    }
    if (name === FR_LSTSUB_ELEMENT) {
      this.popFrame(name);
      return;
    }
    if (FR_SIGNATURE_ELEMENTS.has(name)) {
      this.closeSignature(name);
      return;
    }
    if (FR_BLOCK_ELEMENTS.has(name)) {
      this.popFrame(name);
      return;
    }
    if (FR_TABLE_ELEMENTS.has(name)) {
      this.closeTableElement(name);
      return;
    }
    if (name === FR_FRDOC_ELEMENT) {
      this.closeFrdoc();
      return;
    }
    // Remaining names were pushed as "ignore" frames; only pop when the top
    // of the stack is really this element.
    if (this.stack.length > 0 && this.stack[this.stack.length - 1]?.elementName === name) {
      this.stack.pop();
    }
  }
  /**
   * Handle SAX text content.
   *
   * Text goes to the innermost open frame: appended to `textBuffer` for
   * text-buffer frames, pushed as a text inline for content/inline frames,
   * and dropped for every other frame kind.
   *
   * @param text Raw character data.
   */
  onText(text) {
    if (this.ignoredContainerDepth > 0) return;
    const frame = this.stack[this.stack.length - 1];
    if (!frame) return;
    if (this.isTextBufferFrame(frame)) {
      frame.textBuffer += text;
      return;
    }
    if (frame.kind === "content" && frame.node?.type === "content") {
      if (text) {
        frame.node.children.push({
          type: "inline",
          inlineType: "text",
          text
        });
      }
      return;
    }
    if (frame.kind === "inline" && frame.node?.type === "inline") {
      const inlineNode = frame.node;
      if (inlineNode.children) {
        inlineNode.children.push({
          type: "inline",
          inlineType: "text",
          text
        });
      } else {
        inlineNode.text = (inlineNode.text ?? "") + text;
      }
    }
  }
  // ── Private helpers: Document ──
  /**
   * Start a new document: reset the current metadata and push a level node.
   *
   * @param elementName One of FR_DOCUMENT_ELEMENTS.
   */
  openDocument(elementName) {
    this.currentDocMeta = {
      documentType: elementName,
      documentTypeNormalized: FR_DOCUMENT_TYPE_MAP[elementName] ?? elementName.toLowerCase()
    };
    const node = {
      type: "level",
      levelType: "section",
      children: [],
      sourceElement: elementName
    };
    this.stack.push({ kind: "document", elementName, node, textBuffer: "" });
  }
  /**
   * Finish the current document, attach the collected subject and document
   * number to the level node, record the metadata and emit the node.
   *
   * @param elementName The closing document element name.
   */
  closeDocument(elementName) {
    const frame = this.popFrame(elementName);
    if (!frame || frame.kind !== "document" || !frame.node) return;
    const levelNode = frame.node;
    if (this.currentDocMeta.subject) {
      levelNode.heading = this.currentDocMeta.subject;
    }
    if (this.currentDocMeta.documentNumber) {
      levelNode.identifier = `/us/fr/${this.currentDocMeta.documentNumber}`;
      levelNode.numValue = this.currentDocMeta.documentNumber;
    }
    // Document frames still on the stack enclose the one just closed.
    const ancestors = [];
    for (const f of this.stack) {
      if (f.kind === "document" && f.node?.type === "level") {
        const ln = f.node;
        ancestors.push({
          levelType: ln.levelType,
          numValue: ln.numValue,
          heading: ln.heading,
          identifier: ln.identifier
        });
      }
    }
    const context = {
      ancestors,
      documentMeta: {
        dcTitle: this.currentDocMeta.subject,
        dcType: this.currentDocMeta.documentTypeNormalized
      }
    };
    // Store a copy so later documents cannot mutate the recorded entry.
    this.documentMetas.push({ ...this.currentDocMeta });
    this.options.onEmit(levelNode, context);
  }
  // ── Private helpers: Preamble ──
  /**
   * Store the trimmed text of a preamble metadata element on the current
   * document metadata. Empty text is ignored; DEPDOC is read but not stored.
   *
   * @param elementName One of FR_PREAMBLE_META_ELEMENTS.
   */
  closePreambleMeta(elementName) {
    const frame = this.popFrame(elementName);
    if (!frame || frame.kind !== "preambleMeta") return;
    const text = frame.textBuffer.trim();
    if (!text) return;
    switch (elementName) {
      case "AGENCY":
        this.currentDocMeta.agency = text;
        break;
      case "SUBAGY":
        this.currentDocMeta.subAgency = text;
        break;
      case "CFR":
        this.currentDocMeta.cfrCitation = text;
        break;
      case "SUBJECT":
        this.currentDocMeta.subject = text;
        break;
      case "RIN":
        // Strip a leading "RIN " label so only the identifier remains.
        this.currentDocMeta.rin = text.replace(/^RIN\s+/i, "").trim();
        break;
      case "DEPDOC":
        break;
    }
  }
  // ── Private helpers: Heading ──
  /**
   * Push a heading frame whose level is derived from the SOURCE attribute
   * (defaulting to "HD1"; unknown values map to depth 3).
   *
   * @param _elementName Unused; the frame is always recorded as FR_HEADING_ELEMENT.
   * @param attrs Attribute map of the HD element.
   */
  openHeading(_elementName, attrs) {
    const source = attrs["SOURCE"] ?? "HD1";
    const depth = this.lookupOwn(FR_HD_SOURCE_TO_DEPTH, source, 3);
    this.stack.push({
      kind: "heading",
      elementName: FR_HEADING_ELEMENT,
      textBuffer: "",
      headerLevel: depth
    });
  }
  /**
   * Close a heading and add it to the current document as a bold content
   * line. Headings with no text are dropped.
   *
   * @param elementName The closing heading element name.
   */
  closeHeading(elementName) {
    const frame = this.popFrame(elementName);
    if (!frame || frame.kind !== "heading") return;
    const headingText = frame.textBuffer.trim();
    if (!headingText) return;
    // Headings render the same way whether or not they sit inside a
    // preamble section, so a single code path covers both cases.
    this.addToDocument({
      type: "content",
      variant: "content",
      children: [
        {
          type: "inline",
          inlineType: "bold",
          text: headingText
        }
      ]
    });
  }
  // ── Private helpers: Content ──
  /**
   * Push a content frame with an empty content node.
   *
   * @param elementName Element that opened the content (P, FP, PSIG, ...).
   */
  openContent(elementName) {
    const node = {
      type: "content",
      variant: "content",
      children: []
    };
    this.stack.push({ kind: "content", elementName, node, textBuffer: "" });
  }
  /**
   * Close a content frame and attach its node to the innermost enclosing
   * note, signature or document. Content without children is dropped.
   *
   * @param elementName The closing content element name.
   */
  closeContent(elementName) {
    const frame = this.popFrame(elementName);
    if (!frame || !frame.node) return;
    const contentNode = frame.node;
    if (contentNode.children.length === 0) return;
    // Use the nearest container rather than always preferring the document;
    // otherwise paragraphs inside a note land in the document and the note
    // itself is emitted empty.
    const parent = this.findContentParent();
    if (!parent?.node) return;
    if (parent.node.type === "level" || parent.node.type === "note") {
      parent.node.children.push(contentNode);
    }
  }
  // ── Private helpers: Inline ──
  /**
   * Push an inline frame whose inlineType depends on the element: I→italic,
   * B→bold, SU→sup, E→looked up from its T attribute (italic when unknown),
   * everything else→text.
   *
   * @param elementName One of FR_INLINE_ELEMENTS.
   * @param attrs Attribute map of the element.
   */
  openInline(elementName, attrs) {
    let inlineType = "text";
    if (elementName === "I") {
      inlineType = "italic";
    } else if (elementName === "B") {
      inlineType = "bold";
    } else if (elementName === "SU") {
      inlineType = "sup";
    } else if (elementName === "E") {
      const tValue = attrs["T"] ?? "";
      inlineType = this.lookupOwn(FR_EMPHASIS_MAP, tValue, "italic");
    }
    const node = {
      type: "inline",
      inlineType,
      children: []
    };
    this.stack.push({ kind: "inline", elementName, node, textBuffer: "" });
  }
  /**
   * Close an inline frame and hand its node (or, for text-buffer parents,
   * its plain text) to the parent frame.
   *
   * @param elementName The closing inline element name.
   */
  closeInline(elementName) {
    const frame = this.popFrame(elementName);
    if (!frame || !frame.node) return;
    const inlineNode = frame.node;
    if (inlineNode.inlineType === "footnoteRef" && frame.textBuffer) {
      inlineNode.text = frame.textBuffer.trim();
    }
    const parentFrame = this.stack[this.stack.length - 1];
    if (!parentFrame) return;
    if (parentFrame.kind === "content" && parentFrame.node?.type === "content") {
      parentFrame.node.children.push(inlineNode);
    } else if (parentFrame.kind === "inline" && parentFrame.node?.type === "inline") {
      const parentInline = parentFrame.node;
      if (parentInline.children) {
        parentInline.children.push(inlineNode);
      }
    } else if (this.isTextBufferFrame(parentFrame)) {
      // Flush into every frame kind that onText buffers (table cells and
      // headers, signature fields, ...), not only headings and preamble
      // metadata, so formatted text inside them is not lost.
      parentFrame.textBuffer += this.collectInlineText(inlineNode);
    }
  }
  /**
   * Whether text for this frame is accumulated in `frame.textBuffer`.
   *
   * @param frame A stack frame.
   * @returns true for heading, preambleMeta, signatureField, tableCell,
   *   tableHeader and frdoc frames, and for note frames holding a note node.
   */
  isTextBufferFrame(frame) {
    switch (frame.kind) {
      case "heading":
      case "preambleMeta":
      case "signatureField":
      case "tableCell":
      case "tableHeader":
      case "frdoc":
        return true;
      case "note":
        return frame.node?.type === "note";
      default:
        return false;
    }
  }
  /**
   * Plain text of an inline node: its own `text` when set, otherwise the
   * concatenated text of its children at any nesting depth.
   *
   * @param inlineNode An inline AST node.
   * @returns The collected text ("" when there is none).
   */
  collectInlineText(inlineNode) {
    if (inlineNode.text) return inlineNode.text;
    if (!inlineNode.children) return "";
    let collected = "";
    for (const child of inlineNode.children) {
      collected += this.collectInlineText(child);
    }
    return collected;
  }
  /**
   * Read an own property from a lookup table, ignoring inherited members.
   *
   * Keys come from XML attribute values; a plain `table[key]` would resolve
   * names such as "constructor" to Object.prototype members.
   *
   * @param table Plain-object lookup table.
   * @param key Property name to read.
   * @param fallback Value returned when `key` is absent or nullish.
   * @returns The table entry or `fallback`.
   */
  lookupOwn(table, key, fallback) {
    if (Object.prototype.hasOwnProperty.call(table, key)) {
      return table[key] ?? fallback;
    }
    return fallback;
  }
  // ── Private helpers: Notes ──
  /**
   * Push a note frame. FTNT, EDNOTE and OLNOTE1 map to "footnote",
   * "editorial" and "general"; any other name uses its lowercase form.
   *
   * @param elementName Element that opened the note.
   */
  openNote(elementName) {
    const noteTypeMap = {
      FTNT: "footnote",
      EDNOTE: "editorial",
      OLNOTE1: "general"
    };
    const noteType = noteTypeMap[elementName] ?? elementName.toLowerCase();
    const node = {
      type: "note",
      noteType,
      children: []
    };
    this.stack.push({ kind: "note", elementName, node, textBuffer: "" });
  }
  /**
   * Close a note and add it to the current document. If the note collected
   * only loose text (no child nodes), that text becomes its single content
   * child.
   *
   * @param elementName The closing note element name.
   */
  closeNote(elementName) {
    const frame = this.popFrame(elementName);
    if (!frame || !frame.node) return;
    const noteNode = frame.node;
    const looseText = frame.textBuffer.trim();
    if (looseText && noteNode.children.length === 0) {
      noteNode.children.push({
        type: "content",
        variant: "content",
        children: [
          {
            type: "inline",
            inlineType: "text",
            text: looseText
          }
        ]
      });
    }
    this.addToDocument(noteNode);
  }
  // ── Private helpers: Regulatory text ──
  /**
   * Open one of the FR_REGTEXT_ELEMENTS. REGTEXT adds a bold
   * "<TITLE> CFR Part <PART>" label when both attributes are present;
   * AMDPAR and SECTNO become content, SECTION and PART plain block frames,
   * AUTH a note.
   *
   * @param elementName One of FR_REGTEXT_ELEMENTS.
   * @param attrs Attribute map of the element.
   */
  openRegtext(elementName, attrs) {
    switch (elementName) {
      case "REGTEXT": {
        const title = attrs["TITLE"] ?? "";
        const part = attrs["PART"] ?? "";
        const label = title && part ? `${title} CFR Part ${part}` : "";
        if (label) {
          this.addToDocument({
            type: "content",
            variant: "content",
            children: [
              {
                type: "inline",
                inlineType: "bold",
                text: label
              }
            ]
          });
        }
        this.stack.push({ kind: "regtext", elementName, textBuffer: "" });
        return;
      }
      case "AMDPAR":
      case "SECTNO":
        this.openContent(elementName);
        return;
      case "SECTION":
      case "PART":
        this.stack.push({ kind: "block", elementName, textBuffer: "" });
        return;
      case "AUTH":
        this.openNote(elementName);
        return;
    }
  }
  /**
   * Close one of the FR_REGTEXT_ELEMENTS; mirrors openRegtext.
   *
   * @param elementName One of FR_REGTEXT_ELEMENTS.
   */
  closeRegtext(elementName) {
    switch (elementName) {
      case "REGTEXT":
      case "SECTION":
      case "PART":
        this.popFrame(elementName);
        return;
      case "AMDPAR":
      case "SECTNO":
        this.closeContent(elementName);
        return;
      case "AUTH":
        this.closeNote(elementName);
        return;
    }
  }
  // ── Private helpers: Signature block ──
  /**
   * Open a signature element: SIG pushes a "signature" note frame, the other
   * signature elements push a text-collecting field frame.
   *
   * @param elementName One of FR_SIGNATURE_ELEMENTS.
   */
  openSignature(elementName) {
    if (elementName === "SIG") {
      const node = {
        type: "note",
        noteType: "signature",
        children: []
      };
      this.stack.push({ kind: "signature", elementName, node, textBuffer: "" });
      return;
    }
    this.stack.push({ kind: "signatureField", elementName, textBuffer: "" });
  }
  /**
   * Close a signature element. SIG attaches the signature note to the
   * document; a field appends its trimmed text as a content child of the
   * enclosing signature (fields outside a SIG are dropped).
   *
   * @param elementName One of FR_SIGNATURE_ELEMENTS.
   */
  closeSignature(elementName) {
    if (elementName === "SIG") {
      const sigBlock = this.popFrame(elementName);
      if (!sigBlock || !sigBlock.node) return;
      this.addToDocument(sigBlock.node);
      return;
    }
    const frame = this.popFrame(elementName);
    if (!frame || frame.kind !== "signatureField") return;
    const text = frame.textBuffer.trim();
    if (!text) return;
    const sigFrame = this.findFrame("signature");
    if (sigFrame?.node && sigFrame.node.type === "note") {
      sigFrame.node.children.push({
        type: "content",
        variant: "content",
        children: [
          {
            type: "inline",
            inlineType: "text",
            text
          }
        ]
      });
    }
  }
  // ── Private helpers: GPOTABLE ──
  /**
   * Open one of the FR_TABLE_ELEMENTS. BOXHD pushes no frame.
   *
   * @param elementName One of FR_TABLE_ELEMENTS.
   * @param _attrs Unused.
   */
  openTableElement(elementName, _attrs) {
    switch (elementName) {
      case "GPOTABLE":
        this.stack.push({
          kind: "table",
          elementName,
          textBuffer: "",
          headers: [],
          rows: [],
          currentRow: []
        });
        return;
      case "TTITLE":
        // Collected like a heading; the frame is simply popped on close.
        this.stack.push({ kind: "heading", elementName, textBuffer: "" });
        return;
      case "BOXHD":
        return;
      case "CHED":
        this.stack.push({ kind: "tableHeader", elementName, textBuffer: "" });
        return;
      case "ROW": {
        const tableFrame = this.findTableFrame();
        if (tableFrame) {
          tableFrame.currentRow = [];
        }
        this.stack.push({ kind: "tableRow", elementName, textBuffer: "" });
        return;
      }
      case "ENT":
        this.stack.push({ kind: "tableCell", elementName, textBuffer: "" });
        return;
    }
  }
  /**
   * Close one of the FR_TABLE_ELEMENTS; mirrors openTableElement.
   *
   * @param elementName One of FR_TABLE_ELEMENTS.
   */
  closeTableElement(elementName) {
    switch (elementName) {
      case "GPOTABLE":
        this.closeGpoTable();
        return;
      case "TTITLE":
        this.popFrame(elementName);
        return;
      case "BOXHD":
        return;
      case "CHED":
        this.closeTableHeader();
        return;
      case "ROW":
        this.closeTableRow();
        return;
      case "ENT":
        this.closeTableCell();
        return;
    }
  }
  /** Finish the table and add a table node with its headers and rows to the document. */
  closeGpoTable() {
    const frame = this.popFrame("GPOTABLE");
    if (!frame || frame.kind !== "table") return;
    const tableNode = {
      type: "table",
      variant: "xhtml",
      // Reuse the same variant for rendering
      headers: frame.headers ?? [],
      rows: frame.rows ?? []
    };
    this.addToDocument(tableNode);
  }
  /** Append the closed column header's text to the table's single header row. */
  closeTableHeader() {
    const headerFrame = this.popFrame("CHED");
    if (!headerFrame || headerFrame.kind !== "tableHeader") return;
    const tableFrame = this.findTableFrame();
    if (!tableFrame) return;
    const text = headerFrame.textBuffer.trim();
    if (!tableFrame.headers || tableFrame.headers.length === 0) {
      tableFrame.headers = [[]];
    }
    const headerRow = tableFrame.headers[0];
    if (headerRow) {
      headerRow.push(text);
    }
  }
  /** Move the cells collected for the current row into the table's rows. */
  closeTableRow() {
    const rowFrame = this.popFrame("ROW");
    if (!rowFrame) return;
    const tableFrame = this.findTableFrame();
    if (tableFrame?.currentRow) {
      tableFrame.rows?.push([...tableFrame.currentRow]);
      tableFrame.currentRow = [];
    }
  }
  /** Append the closed cell's trimmed text to the table's current row. */
  closeTableCell() {
    // Locate the frame by element name, like every other closer, instead of
    // blindly popping whatever happens to be on top of the stack.
    const cellFrame = this.popFrame("ENT");
    if (!cellFrame || cellFrame.kind !== "tableCell") return;
    const tableFrame = this.findTableFrame();
    if (tableFrame?.currentRow) {
      tableFrame.currentRow.push(cellFrame.textBuffer.trim());
    }
  }
  // ── Private helpers: FRDOC ──
  /** Extract the document number from text of the form "FR Doc. <number>". */
  closeFrdoc() {
    const frame = this.popFrame(FR_FRDOC_ELEMENT);
    if (!frame || frame.kind !== "frdoc") return;
    const text = frame.textBuffer.trim();
    const match = /FR\s+Doc\.\s+([\d-]+)/i.exec(text);
    if (match) {
      this.currentDocMeta.documentNumber = match[1];
    }
  }
  // ── Private helpers: Stack navigation ──
  /**
   * Append a node to the innermost open document; no-op outside a document.
   *
   * @param node AST node to append.
   */
  addToDocument(node) {
    const docFrame = this.findParentDocument();
    if (docFrame?.node && docFrame.node.type === "level") {
      docFrame.node.children.push(node);
    }
  }
  /** @returns The innermost open document frame, or undefined. */
  findParentDocument() {
    return this.findFrame("document");
  }
  /** @returns The innermost open note or signature frame, or undefined. */
  findParentNote() {
    for (let i = this.stack.length - 1; i >= 0; i--) {
      const kind = this.stack[i]?.kind;
      if (kind === "note" || kind === "signature") {
        return this.stack[i];
      }
    }
    return void 0;
  }
  /**
   * @returns The innermost open frame that can hold content nodes (note,
   *   signature or document), or undefined.
   */
  findContentParent() {
    for (let i = this.stack.length - 1; i >= 0; i--) {
      const kind = this.stack[i]?.kind;
      if (kind === "note" || kind === "signature" || kind === "document") {
        return this.stack[i];
      }
    }
    return void 0;
  }
  /** @returns The innermost open table frame, or undefined. */
  findTableFrame() {
    return this.findFrame("table");
  }
  /**
   * @param kind Frame kind to look for.
   * @returns The innermost open frame of that kind, or undefined.
   */
  findFrame(kind) {
    for (let i = this.stack.length - 1; i >= 0; i--) {
      if (this.stack[i]?.kind === kind) {
        return this.stack[i];
      }
    }
    return void 0;
  }
  /**
   * Remove and return the innermost frame opened by `elementName`. Frames
   * above it are left on the stack. Logs a warning when a non-empty stack
   * holds no such frame.
   *
   * @param elementName Element name recorded on the frame.
   * @returns The removed frame, or undefined.
   */
  popFrame(elementName) {
    if (this.stack.length === 0) return void 0;
    for (let i = this.stack.length - 1; i >= 0; i--) {
      if (this.stack[i]?.elementName === elementName) {
        return this.stack.splice(i, 1)[0];
      }
    }
    console.warn(
      `FrASTBuilder: no matching frame for closing element </${elementName}>, stack has: [${this.stack.map((f) => f.elementName).join(", ")}]`
    );
    return void 0;
  }
};
|
|
947
|
+
|
|
948
|
+
// src/fr-frontmatter.ts
|
|
949
|
+
/**
 * Convert a document type label (e.g. "Proposed Rule") into the snake_case
 * form used in frontmatter (e.g. "proposed_rule").
 *
 * The four known labels are mapped explicitly; any other string is
 * lower-cased with whitespace runs replaced by "_". Non-string input (such as
 * a sidecar with no `type` field) yields "" instead of throwing.
 *
 * @param {string} apiType - Type label to normalize.
 * @returns {string} Normalized type, or "" when `apiType` is not a string.
 */
function normalizeDocumentType(apiType) {
  if (typeof apiType !== "string") return "";
  // A switch avoids plain-object lookups, which would return inherited
  // members for inputs such as "constructor" or "toString".
  switch (apiType) {
    case "Rule":
      return "rule";
    case "Proposed Rule":
      return "proposed_rule";
    case "Notice":
      return "notice";
    case "Presidential Document":
      return "presidential_document";
    default:
      return apiType.toLowerCase().replace(/\s+/g, "_");
  }
}
|
|
958
|
+
/**
 * Build the frontmatter object for one Federal Register document.
 *
 * Values from the JSON sidecar (`jsonMeta`) take precedence; metadata
 * extracted from the XML (`xmlMeta`) and the AST node are fallbacks.
 *
 * Differences from a naive merge:
 * - If the sidecar has no usable `type`, the XML-derived type is used rather
 *   than throwing.
 * - Agency entries without `name` fall back to `raw_name` (assumed to be the
 *   API's unnormalized agency label — TODO confirm); entries with neither are
 *   dropped, and if none remain the XML agency is used.
 * - CFR references without a `part` render as "<title> CFR"; references
 *   without a `title` are dropped.
 *
 * @param {object} node - Document AST node (`identifier`, `heading` are read).
 * @param {object} _context - Unused.
 * @param {object} xmlMeta - Metadata extracted from the XML.
 * @param {object} [jsonMeta] - Optional sidecar metadata.
 * @returns {object} Frontmatter data.
 */
function buildFrFrontmatter(node, _context, xmlMeta, jsonMeta) {
  const documentNumber = jsonMeta?.document_number ?? xmlMeta.documentNumber ?? "";
  const subject = jsonMeta?.title ?? xmlMeta.subject ?? node.heading ?? "";
  const publicationDate = jsonMeta?.publication_date ?? "";

  const apiType = jsonMeta?.type;
  const documentType = typeof apiType === "string" && apiType !== ""
    ? normalizeDocumentType(apiType)
    : xmlMeta.documentTypeNormalized;

  const apiAgencies = Array.isArray(jsonMeta?.agencies)
    ? jsonMeta.agencies
        .map((a) => a?.name ?? a?.raw_name)
        .filter((name) => typeof name === "string" && name !== "")
    : [];
  let agencies;
  if (apiAgencies.length > 0) {
    agencies = apiAgencies;
  } else if (xmlMeta.agency) {
    agencies = [xmlMeta.agency];
    if (xmlMeta.subAgency) {
      agencies.push(xmlMeta.subAgency);
    }
  }

  const apiCfr = Array.isArray(jsonMeta?.cfr_references)
    ? jsonMeta.cfr_references
        .filter((r) => r?.title !== void 0 && r?.title !== null)
        .map((r) =>
          r.part !== void 0 && r.part !== null
            ? `${r.title} CFR Part ${r.part}`
            : `${r.title} CFR`
        )
    : [];
  let cfrReferences;
  if (apiCfr.length > 0) {
    cfrReferences = apiCfr;
  } else if (xmlMeta.cfrCitation) {
    cfrReferences = [xmlMeta.cfrCitation];
  }

  let docketIds;
  if (jsonMeta?.docket_ids && jsonMeta.docket_ids.length > 0) {
    docketIds = jsonMeta.docket_ids;
  }

  const primaryAgency = agencies && agencies.length > 0 ? agencies[0] : void 0;
  const frCitation = jsonMeta?.citation;
  const rin = jsonMeta?.regulation_id_numbers?.[0] ?? xmlMeta.rin;

  const fm = {
    source: "fr",
    legal_status: "authoritative_unofficial",
    identifier: node.identifier ?? `/us/fr/${documentNumber}`,
    title: subject,
    title_number: 0,
    // FR documents don't belong to a USC/CFR title
    title_name: "Federal Register",
    section_number: documentNumber,
    section_name: subject,
    positive_law: false,
    currency: publicationDate,
    last_updated: publicationDate,
    // Shared optional fields
    agency: primaryAgency,
    // FR-specific fields
    document_number: documentNumber || void 0,
    document_type: documentType || void 0,
    fr_citation: frCitation,
    fr_volume: jsonMeta?.volume,
    publication_date: publicationDate || void 0,
    agencies: agencies && agencies.length > 0 ? agencies : void 0,
    cfr_references: cfrReferences && cfrReferences.length > 0 ? cfrReferences : void 0,
    docket_ids: docketIds && docketIds.length > 0 ? docketIds : void 0,
    rin: rin || void 0,
    effective_date: jsonMeta?.effective_on ?? void 0,
    comments_close_date: jsonMeta?.comments_close_on ?? void 0,
    fr_action: jsonMeta?.action ?? void 0
  };
  return fm;
}
|
|
1016
|
+
|
|
1017
|
+
// src/fr-path.ts
|
|
1018
|
+
import { join } from "path";
|
|
1019
|
+
/**
 * Build the Markdown output path for a document:
 * `<outputRoot>/fr/<year>/<month>/<documentNumber>.md`, with year and month
 * taken from `publicationDate` via parseDateComponents.
 */
function buildFrOutputPath(documentNumber, publicationDate, outputRoot) {
  const when = parseDateComponents(publicationDate);
  const fileName = `${documentNumber}.md`;
  return join(outputRoot, "fr", when.year, when.month, fileName);
}
|
|
1023
|
+
/**
 * Build the path where a downloaded document's XML is stored:
 * `<downloadRoot>/<year>/<month>/<documentNumber>.xml`.
 */
function buildFrDownloadXmlPath(documentNumber, publicationDate, downloadRoot) {
  const when = parseDateComponents(publicationDate);
  const fileName = `${documentNumber}.xml`;
  return join(downloadRoot, when.year, when.month, fileName);
}
|
|
1027
|
+
/**
 * Build the path where a downloaded document's JSON metadata sidecar is
 * stored: `<downloadRoot>/<year>/<month>/<documentNumber>.json`.
 */
function buildFrDownloadJsonPath(documentNumber, publicationDate, downloadRoot) {
  const when = parseDateComponents(publicationDate);
  const fileName = `${documentNumber}.json`;
  return join(downloadRoot, when.year, when.month, fileName);
}
|
|
1031
|
+
/** Return the output directory for one month: `<outputRoot>/fr/<year>/<month>`. */
function buildMonthDir(year, month, outputRoot) {
  const segments = [outputRoot, "fr", year, month];
  return join(...segments);
}
|
|
1034
|
+
/** Return the output directory for one year: `<outputRoot>/fr/<year>`. */
function buildYearDir(year, outputRoot) {
  const segments = [outputRoot, "fr", year];
  return join(...segments);
}
|
|
1037
|
+
/**
 * Split a "YYYY-MM-DD" style date into its year and month segments.
 *
 * Missing segments fall back to "0000" / "00". Non-string input (undefined,
 * null, ...) is treated like an empty date instead of throwing, so the path
 * builders stay usable when a document has no publication date.
 *
 * @param {string} date - Date string; only the first two "-" separated parts are used.
 * @returns {{year: string, month: string}}
 */
function parseDateComponents(date) {
  const [year, month] = typeof date === "string" ? date.split("-") : [];
  return {
    year: year || "0000",
    month: month || "00"
  };
}
|
|
1044
|
+
|
|
1045
|
+
// src/converter.ts
|
|
1046
|
+
import { createReadStream, existsSync } from "fs";
|
|
1047
|
+
import { readFile, readdir, stat } from "fs/promises";
|
|
1048
|
+
import { join as join2, dirname } from "path";
|
|
1049
|
+
import {
|
|
1050
|
+
XMLParser,
|
|
1051
|
+
renderDocument,
|
|
1052
|
+
createLinkResolver,
|
|
1053
|
+
writeFile,
|
|
1054
|
+
mkdir
|
|
1055
|
+
} from "@lexbuild/core";
|
|
1056
|
+
// Set built from FR_DOCUMENT_TYPE_KEYS; convertFrDocuments uses it to check that a document's XML type is one of the known keys when filtering by options.types.
var FR_DOC_TYPE_SET = new Set(FR_DOCUMENT_TYPE_KEYS);
|
|
1057
|
+
/**
 * Convert Federal Register XML files into Markdown files.
 *
 * Steps:
 *  1. Discover XML files under `options.input` (optionally limited by
 *     `options.from` / `options.to`) and parse each; files that fail to parse
 *     are skipped with a warning.
 *  2. Register the output path of every selected document that has an
 *     identifier with the link resolver.
 *  3. If `options.dryRun` is set, return only the count of selected documents.
 *  4. Otherwise render and write each selected document, tracking a rough
 *     token estimate (characters / 4) and the peak RSS observed.
 *
 * A document is "selected" when `options.types` is empty/absent, or when its
 * XML document type is both a known type key and listed in `options.types`.
 *
 * @param {object} options - Reads input, output, from, to, types, dryRun, linkStyle.
 * @returns {Promise<object>} Conversion summary.
 */
async function convertFrDocuments(options) {
  const xmlFiles = await discoverXmlFiles(options.input, options.from, options.to);
  const written = [];
  let tokenTotal = 0;
  let peakRss = 0;
  const linkResolver = createLinkResolver();

  // Shared type filter used by every pass below.
  const isSelected = (doc) => {
    const wanted = options.types;
    if (!wanted || wanted.length === 0) return true;
    const docType = doc.xmlMeta.documentType;
    return FR_DOC_TYPE_SET.has(docType) && wanted.includes(docType);
  };
  const outputPathOf = (doc) =>
    buildFrOutputPath(doc.documentNumber, doc.publicationDate, options.output);

  // Parse everything up front so links can be registered before rendering.
  const parsedFiles = /* @__PURE__ */ new Map();
  for (const xmlPath of xmlFiles) {
    try {
      parsedFiles.set(xmlPath, await parseXmlFile(xmlPath));
    } catch (err) {
      console.warn(
        `Warning: Failed to parse ${xmlPath}: ${err instanceof Error ? err.message : String(err)}. Skipping.`
      );
    }
  }

  // Lazily walks all parsed documents in file order, yielding selected ones.
  function* selectedDocs() {
    for (const collected of parsedFiles.values()) {
      for (const doc of collected) {
        if (isSelected(doc)) yield doc;
      }
    }
  }

  for (const doc of selectedDocs()) {
    if (doc.node.identifier) {
      linkResolver.register(doc.node.identifier, outputPathOf(doc));
    }
  }

  if (options.dryRun) {
    let count = 0;
    for (const _doc of selectedDocs()) {
      count += 1;
    }
    return {
      documentsConverted: count,
      files: [],
      totalTokenEstimate: 0,
      peakMemoryBytes: 0,
      dryRun: true
    };
  }

  for (const doc of selectedDocs()) {
    const outputPath = outputPathOf(doc);
    const frontmatter = buildFrFrontmatter(doc.node, doc.context, doc.xmlMeta, doc.jsonMeta);
    const resolveLink = options.linkStyle === "relative"
      ? (id) => linkResolver.resolve(id, outputPath)
      : void 0;
    const markdown = renderDocument(doc.node, frontmatter, {
      headingOffset: 0,
      linkStyle: options.linkStyle,
      resolveLink
    });

    await mkdir(dirname(outputPath), { recursive: true });
    await writeFile(outputPath, markdown, "utf-8");
    written.push(outputPath);

    tokenTotal += Math.round(markdown.length / 4);
    peakRss = Math.max(peakRss, process.memoryUsage().rss);
  }

  return {
    documentsConverted: written.length,
    files: written,
    totalTokenEstimate: tokenTotal,
    peakMemoryBytes: peakRss,
    dryRun: false
  };
}
|
|
1148
|
+
/**
 * Parse one FR XML file into a list of collected documents.
 *
 * Each document emitted by FrASTBuilder is paired with the most recent entry
 * of the builder's XML metadata. If a JSON sidecar (same path, ".json"
 * extension) exists and its `document_number` equals the document's number,
 * the sidecar is attached and its `publication_date` is used.
 *
 * When no sidecar matches — or the matching sidecar has no usable
 * `publication_date` — the date is inferred from a YYYY/MM/ segment in the
 * path. Previously a sidecar without `publication_date` left
 * `publicationDate` undefined, which breaks the path builders downstream.
 *
 * @param {string} xmlPath - Path of the XML file to parse.
 * @returns {Promise<object[]>} Collected documents
 *   ({ node, context, xmlMeta, publicationDate, documentNumber, jsonMeta? }).
 */
async function parseXmlFile(xmlPath) {
  const collected = [];
  const builder = new FrASTBuilder({
    onEmit: (node, context) => {
      // The builder appends metadata per document; the last entry belongs to
      // the document being emitted right now.
      const currentMetas = builder.getDocumentMetas();
      const meta = currentMetas[currentMetas.length - 1];
      if (!meta) {
        console.warn(
          `Warning: No XML metadata extracted for emitted document in ${xmlPath}. Frontmatter will have empty document_type and document_number.`
        );
      }
      collected.push({
        node,
        context,
        xmlMeta: meta ?? { documentType: "", documentTypeNormalized: "" },
        publicationDate: "",
        documentNumber: meta?.documentNumber ?? ""
      });
    }
  });

  const parser = new XMLParser({ defaultNamespace: "" });
  parser.on("openElement", (name, attrs) => builder.onOpenElement(name, attrs));
  parser.on("closeElement", (name) => builder.onCloseElement(name));
  parser.on("text", (text) => builder.onText(text));
  const stream = createReadStream(xmlPath, "utf-8");
  await parser.parseStream(stream);

  const jsonPath = xmlPath.replace(/\.xml$/, ".json");
  let jsonMeta;
  if (existsSync(jsonPath)) {
    try {
      const raw = await readFile(jsonPath, "utf-8");
      jsonMeta = JSON.parse(raw);
    } catch (err) {
      // A broken sidecar is not fatal; conversion continues with XML metadata only.
      console.warn(
        `Warning: Failed to parse JSON sidecar ${jsonPath}: ${err instanceof Error ? err.message : String(err)}. Continuing without enriched metadata.`
      );
    }
  }

  // Same for every document in this file, so compute it once.
  const inferredDate = inferDateFromPath(xmlPath);

  for (const doc of collected) {
    const sidecarMatches = Boolean(jsonMeta) && jsonMeta.document_number === doc.documentNumber;
    if (sidecarMatches) {
      doc.jsonMeta = jsonMeta;
      const sidecarDate = jsonMeta.publication_date;
      if (typeof sidecarDate === "string" && sidecarDate !== "") {
        doc.publicationDate = sidecarDate;
        continue;
      }
    }
    if (!inferredDate) {
      const reason = sidecarMatches
        ? "JSON sidecar has no publication_date"
        : "no JSON sidecar";
      console.warn(
        `Warning: No publication date for document ${doc.documentNumber || "(unknown)"} \u2014 ${reason} and path ${xmlPath} has no YYYY/MM/ pattern. Output will be in 0000/00/.`
      );
    }
    doc.publicationDate = inferredDate;
  }
  return collected;
}
|
|
1202
|
+
/**
 * Resolve `input` to a sorted list of XML file paths.
 *
 * A file input is returned as a one-element list, unfiltered. A directory is
 * walked recursively for "*.xml" files, which are then optionally filtered by
 * `from` / `to`.
 *
 * Filtering works at month granularity, because a file's date is only known
 * from its YYYY/MM/ directory. The month is compared against the leading
 * "YYYY-MM" (or shorter) part of each bound, so a mid-month `from` such as
 * "2024-03-15" keeps the 2024/03 directory. The previous comparison against
 * the inferred first-of-month date dropped that whole month. Files with no
 * inferable date are always kept.
 *
 * @param {string} input - File or directory path.
 * @param {string} [from] - Inclusive lower bound ("YYYY", "YYYY-MM" or "YYYY-MM-DD").
 * @param {string} [to] - Inclusive upper bound (same formats).
 * @returns {Promise<string[]>} Sorted XML paths.
 * @throws {Error} If `input` cannot be accessed or is neither a file nor a directory.
 */
async function discoverXmlFiles(input, from, to) {
  let inputStat;
  try {
    inputStat = await stat(input);
  } catch (err) {
    throw new Error(
      `Cannot access input path "${input}": ${err instanceof Error ? err.message : String(err)}`,
      { cause: err }
    );
  }
  if (inputStat.isFile()) {
    return [input];
  }
  if (!inputStat.isDirectory()) {
    throw new Error(`Input path "${input}" is not a file or directory`);
  }

  const xmlFiles = [];
  await walkDir(input, xmlFiles);

  // Compare only as many leading characters as the bound provides (max "YYYY-MM").
  const lowerBound = from ? String(from).slice(0, 7) : "";
  const upperBound = to ? String(to).slice(0, 7) : "";
  const isInRange = (filePath) => {
    const date = inferDateFromPath(filePath);
    if (!date) return true;
    const month = date.slice(0, 7);
    if (lowerBound && month.slice(0, lowerBound.length) < lowerBound) return false;
    if (upperBound && month.slice(0, upperBound.length) > upperBound) return false;
    return true;
  };

  const filtered = from || to ? xmlFiles.filter(isInRange) : xmlFiles;
  return filtered.sort();
}
|
|
1232
|
+
/**
 * Recursively collect the paths of all regular files ending in ".xml" under
 * `dir`, appending them to `results` in directory-listing order.
 *
 * @param {string} dir - Directory to scan.
 * @param {string[]} results - Array that receives matching paths (mutated).
 */
async function walkDir(dir, results) {
  const dirents = await readdir(dir, { withFileTypes: true });
  for (const dirent of dirents) {
    const target = join2(dir, dirent.name);
    if (dirent.isDirectory()) {
      await walkDir(target, results);
      continue;
    }
    const isXmlFile = dirent.isFile() && dirent.name.endsWith(".xml");
    if (isXmlFile) {
      results.push(target);
    }
  }
}
|
|
1243
|
+
/**
 * Infer a publication date from a path ending in ".../YYYY/MM/<file>.xml".
 *
 * Only year and month are available from the directory layout, so the day is
 * always reported as "01". Both "/" and "\" are accepted as separators so
 * that paths produced by `path.join` on Windows are recognised as well.
 *
 * @param {string} filePath - Path of an XML file.
 * @returns {string} "YYYY-MM-01", or "" when the path has no YYYY/MM/ segment.
 */
function inferDateFromPath(filePath) {
  const match = /(\d{4})[\\/](\d{2})[\\/][^\\/]+\.xml$/.exec(filePath);
  if (match) {
    return `${match[1]}-${match[2]}-01`;
  }
  return "";
}
|
|
1250
|
+
|
|
1251
|
+
// src/downloader.ts
|
|
1252
|
+
import { createWriteStream } from "fs";
|
|
1253
|
+
import { mkdir as mkdir2, stat as stat2, writeFile as fsWriteFile } from "fs/promises";
|
|
1254
|
+
import { dirname as dirname2 } from "path";
|
|
1255
|
+
import { pipeline } from "stream/promises";
|
|
1256
|
+
import { Readable } from "stream";
|
|
1257
|
+
// Base URL that every API request URL in this module is built from.
var FR_API_BASE = "https://www.federalregister.gov/api/v1";
// Value sent as the `per_page` query parameter on list requests.
var PER_PAGE = 200;
// Fallback for options.fetchDelayMs; passed to downloadSingleDocument, which sleeps this long before fetching a document's XML.
var DEFAULT_FETCH_DELAY_MS = 100;
// Maximum number of retries fetchWithRetry performs after the initial attempt.
var MAX_RETRIES = 2;
// Base of the retry backoff: fetchWithRetry waits RETRY_BASE_DELAY_MS * 2^attempt unless a usable Retry-After header is present.
var RETRY_BASE_DELAY_MS = 2e3;
// Field names requested from the API via repeated `fields[]` query parameters.
var API_FIELDS = [
  "document_number",
  "type",
  "title",
  "publication_date",
  "citation",
  "volume",
  "start_page",
  "end_page",
  "agencies",
  "cfr_references",
  "docket_ids",
  "regulation_id_numbers",
  "effective_on",
  "comments_close_on",
  "action",
  "abstract",
  "significant",
  "topics",
  "full_text_xml_url"
];
|
|
1283
|
+
/**
 * Build the documents-list API URL for one page of a publication-date range.
 *
 * Query parameters, in order: the gte/lte publication-date conditions,
 * `per_page`, `page`, `order=oldest`, one `fields[]` per entry of API_FIELDS,
 * and one `conditions[type][]` per requested type.
 *
 * @param {string} from - Inclusive start date.
 * @param {string} to - Inclusive end date.
 * @param {number} page - 1-based page number.
 * @param {string[]} [types] - Optional document type filter.
 * @returns {string} Full request URL.
 */
function buildFrApiListUrl(from, to, page, types) {
  const query = new URLSearchParams([
    ["conditions[publication_date][gte]", from],
    ["conditions[publication_date][lte]", to],
    ["per_page", String(PER_PAGE)],
    ["page", String(page)],
    ["order", "oldest"]
  ]);
  for (const fieldName of API_FIELDS) {
    query.append("fields[]", fieldName);
  }
  if (types && types.length > 0) {
    for (const docType of types) {
      query.append("conditions[type][]", docType);
    }
  }
  return `${FR_API_BASE}/documents.json?${query.toString()}`;
}
|
|
1300
|
+
/**
 * Download all FR documents published in a date range.
 *
 * The range (`options.from` .. `options.to`, defaulting `to` to today in UTC)
 * is split into per-month chunks; each chunk is listed page by page and every
 * listed document with a `full_text_xml_url` is downloaded. Documents without
 * an XML URL are counted as skipped; individual download failures are
 * recorded in `failed` and do not abort the run. `options.limit` caps the
 * number of successfully downloaded documents.
 *
 * Progress: `totalDocuments` passed to `options.onProgress` is the running
 * sum of the `count` reported by every chunk listed so far. Previously only
 * the first non-empty chunk's count was used, so multi-month ranges reported
 * a total far below the number of documents actually processed.
 *
 * @param {object} options - Reads from, to, output, types, limit, fetchDelayMs, onProgress.
 * @returns {Promise<object>} Download summary.
 * @throws {Error} If a list response lacks a numeric `count`.
 */
async function downloadFrDocuments(options) {
  const to = options.to ?? (/* @__PURE__ */ new Date()).toISOString().slice(0, 10);
  const fetchDelay = options.fetchDelayMs ?? DEFAULT_FETCH_DELAY_MS;
  const files = [];
  const failed = [];
  let totalBytes = 0;
  let skipped = 0;
  let totalDocumentsFound = 0;

  const limitReached = () => options.limit !== void 0 && files.length >= options.limit;

  const chunks = buildMonthChunks(options.from, to);
  for (const chunk of chunks) {
    if (limitReached()) break;
    let page = 1;
    let hasMore = true;
    while (hasMore) {
      const listUrl = buildFrApiListUrl(chunk.from, chunk.to, page, options.types);
      const response = await fetchWithRetry(listUrl);
      const data = await response.json();
      if (typeof data.count !== "number") {
        throw new Error(
          `Unexpected API response for ${listUrl}: missing or invalid 'count' field. The FederalRegister.gov API may have changed its response format.`
        );
      }
      // Each chunk reports its own count on every page; add it once per chunk.
      if (page === 1) {
        totalDocumentsFound += data.count;
      }

      const results = data.results ?? [];
      for (const doc of results) {
        if (limitReached()) {
          hasMore = false;
          break;
        }
        options.onProgress?.({
          documentsDownloaded: files.length,
          totalDocuments: totalDocumentsFound,
          currentDocument: doc.document_number,
          currentChunk: `${chunk.from.slice(0, 7)}`
        });
        if (!doc.full_text_xml_url) {
          skipped++;
          continue;
        }
        try {
          const result = await downloadSingleDocument(doc, options.output, fetchDelay);
          files.push(result);
          totalBytes += result.size;
        } catch (err) {
          // Best effort: record the failure and keep going with the next document.
          failed.push({
            documentNumber: doc.document_number,
            error: err instanceof Error ? err.message : String(err)
          });
        }
      }
      hasMore = hasMore && page < (data.total_pages ?? 0);
      page++;
    }
  }

  return {
    documentsDownloaded: files.length,
    files,
    totalBytes,
    dateRange: { from: options.from, to },
    skipped,
    failed
  };
}
|
|
1365
|
+
/**
 * Download a single FR document (JSON metadata + XML) by document number.
 *
 * The document number is percent-encoded before being placed in the URL
 * path, so characters such as "/", "?" or "#" cannot change which endpoint
 * is requested. Numbers made of letters, digits and hyphens are unaffected.
 *
 * @param {string} documentNumber - FR document number, e.g. "2024-01234".
 * @param {string} output - Download root directory.
 * @returns {Promise<object>} Result of downloadSingleDocument.
 * @throws {Error} If the API response lacks document_number or publication_date.
 */
async function downloadSingleFrDocument(documentNumber, output) {
  const fieldQuery = new URLSearchParams(API_FIELDS.map((f) => ["fields[]", f])).toString();
  const metaUrl = `${FR_API_BASE}/documents/${encodeURIComponent(documentNumber)}.json?${fieldQuery}`;
  const metaResponse = await fetchWithRetry(metaUrl);
  const doc = await metaResponse.json();
  // `doc` may be null if the API answers with a JSON null body.
  if (!doc || !doc.document_number || !doc.publication_date) {
    throw new Error(
      `Invalid API response for document ${documentNumber}: missing document_number or publication_date`
    );
  }
  // No inter-request delay is needed for a one-off download.
  return downloadSingleDocument(doc, output, 0);
}
|
|
1376
|
+
/**
 * Persist one API document to disk: first the JSON metadata sidecar, then
 * (after an optional delay) the full-text XML streamed from
 * `doc.full_text_xml_url`.
 *
 * @param {object} doc - API document; needs document_number, publication_date and full_text_xml_url.
 * @param {string} outputDir - Download root directory.
 * @param {number} fetchDelay - Milliseconds to wait before fetching the XML (skipped when <= 0).
 * @returns {Promise<object>} { xmlPath, jsonPath, documentNumber, publicationDate, size },
 *   where size is the XML file size plus the byte length of the JSON sidecar.
 * @throws {Error} On missing required fields, missing response body, or a failed XML write.
 */
async function downloadSingleDocument(doc, outputDir, fetchDelay) {
  const hasIdentity = Boolean(doc.document_number) && Boolean(doc.publication_date);
  if (!hasIdentity) {
    throw new Error(
      `Invalid document in API response: missing document_number or publication_date`
    );
  }
  if (!doc.full_text_xml_url) {
    throw new Error(
      `Document ${doc.document_number} has no full_text_xml_url \u2014 cannot download XML`
    );
  }

  const xmlPath = buildFrDownloadXmlPath(doc.document_number, doc.publication_date, outputDir);
  const jsonPath = buildFrDownloadJsonPath(doc.document_number, doc.publication_date, outputDir);
  await mkdir2(dirname2(xmlPath), { recursive: true });

  // The sidecar is written before the XML is requested.
  const jsonContent = JSON.stringify(doc, null, 2);
  await fsWriteFile(jsonPath, jsonContent, "utf-8");

  if (fetchDelay > 0) {
    await sleep(fetchDelay);
  }

  const xmlResponse = await fetchWithRetry(doc.full_text_xml_url);
  if (!xmlResponse.body) {
    throw new Error(`No response body for ${doc.document_number} XML`);
  }

  const dest = createWriteStream(xmlPath);
  try {
    await pipeline(Readable.fromWeb(xmlResponse.body), dest);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(
      `Failed to write XML for document ${doc.document_number} from ${doc.full_text_xml_url}: ${reason}`,
      { cause: err }
    );
  }

  const xmlStat = await stat2(xmlPath);
  const jsonSize = Buffer.byteLength(jsonContent, "utf-8");
  return {
    xmlPath,
    jsonPath,
    documentNumber: doc.document_number,
    publicationDate: doc.publication_date,
    size: Number(xmlStat.size) + jsonSize
  };
}
|
|
1418
|
+
/**
 * Split the inclusive date range [from, to] into calendar-month chunks (UTC).
 *
 * The first chunk starts at `from`, later chunks start on the first of the
 * month; each chunk ends on the last day of its month, or at `to` for the
 * final chunk. Returns an empty list when `from` is after `to`.
 *
 * @param {string} from - Start date, "YYYY-MM-DD".
 * @param {string} to - End date, "YYYY-MM-DD".
 * @returns {{from: string, to: string}[]} Chunks in chronological order.
 */
function buildMonthChunks(from, to) {
  const isoDay = (d) => d.toISOString().slice(0, 10);
  const lastDay = /* @__PURE__ */ new Date(to + "T00:00:00Z");
  const result = [];

  let cursor = /* @__PURE__ */ new Date(from + "T00:00:00Z");
  while (cursor <= lastDay) {
    const year = cursor.getUTCFullYear();
    const month = cursor.getUTCMonth();
    // Day 0 of the next month is the last day of the current month.
    const endOfMonth = new Date(Date.UTC(year, month + 1, 0));
    result.push({
      from: isoDay(cursor),
      to: endOfMonth <= lastDay ? isoDay(endOfMonth) : to
    });
    cursor = new Date(Date.UTC(year, month + 1, 1));
  }
  return result;
}
|
|
1435
|
+
/**
 * Fetch `url`, retrying transient failures with exponential backoff.
 *
 * Retried (up to MAX_RETRIES times): network errors thrown by `fetch`, and
 * HTTP 429 / 503 / 504 responses. For HTTP retries a positive integer
 * `Retry-After` header (seconds) overrides the backoff delay. Any other
 * non-OK response, or exhausting the retries, throws.
 *
 * @param {string} url - URL to fetch.
 * @param {number} [attempt=0] - Zero-based attempt number to start from.
 * @returns {Promise<Response>} The first OK response.
 * @throws {Error} On a non-retryable HTTP status or when retries are exhausted.
 */
async function fetchWithRetry(url, attempt = 0) {
  for (let tryNo = attempt; ; tryNo += 1) {
    let response;
    try {
      response = await fetch(url);
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      if (!(tryNo < MAX_RETRIES)) {
        throw new Error(
          `Network error after ${MAX_RETRIES + 1} attempts for ${url}: ${reason}`,
          { cause: err }
        );
      }
      const backoff = RETRY_BASE_DELAY_MS * Math.pow(2, tryNo);
      console.warn(
        `Network error for ${url}: ${reason}. Retrying in ${backoff}ms (attempt ${tryNo + 1}/${MAX_RETRIES})...`
      );
      await sleep(backoff);
      continue;
    }

    if (response.ok) return response;

    const status = response.status;
    const isTransient = status === 429 || status === 503 || status === 504;
    if (!isTransient || !(tryNo < MAX_RETRIES)) {
      throw new Error(`HTTP ${response.status}: ${response.statusText} for ${url}`);
    }

    // Honour a numeric Retry-After (seconds); otherwise back off exponentially.
    const retryAfter = response.headers.get("Retry-After");
    const retrySeconds = retryAfter ? parseInt(retryAfter, 10) : NaN;
    const waitMs = retrySeconds > 0
      ? retrySeconds * 1e3
      : RETRY_BASE_DELAY_MS * Math.pow(2, tryNo);
    console.warn(
      `HTTP ${response.status} for ${url}. Retrying in ${waitMs}ms (attempt ${tryNo + 1}/${MAX_RETRIES})...`
    );
    await sleep(waitMs);
  }
}
|
|
1466
|
+
/** Return a promise that resolves (with no value) after `ms` milliseconds. */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
|
|
1469
|
+
export {
|
|
1470
|
+
FR_BLOCK_ELEMENTS,
|
|
1471
|
+
FR_CONTENT_ELEMENTS,
|
|
1472
|
+
FR_DOCUMENT_ELEMENTS,
|
|
1473
|
+
FR_DOCUMENT_TYPE_KEYS,
|
|
1474
|
+
FR_DOCUMENT_TYPE_MAP,
|
|
1475
|
+
FR_EMPHASIS_MAP,
|
|
1476
|
+
FR_HD_SOURCE_TO_DEPTH,
|
|
1477
|
+
FR_HEADING_ELEMENT,
|
|
1478
|
+
FR_IGNORE_ELEMENTS,
|
|
1479
|
+
FR_INLINE_ELEMENTS,
|
|
1480
|
+
FR_NOTE_ELEMENTS,
|
|
1481
|
+
FR_PASSTHROUGH_ELEMENTS,
|
|
1482
|
+
FR_PREAMBLE_META_ELEMENTS,
|
|
1483
|
+
FR_PREAMBLE_SECTIONS,
|
|
1484
|
+
FR_PRESIDENTIAL_SUBTYPES,
|
|
1485
|
+
FR_REGTEXT_ELEMENTS,
|
|
1486
|
+
FR_SECTION_CONTAINERS,
|
|
1487
|
+
FR_SIGNATURE_ELEMENTS,
|
|
1488
|
+
FR_SKIP_ELEMENTS,
|
|
1489
|
+
FR_TABLE_ELEMENTS,
|
|
1490
|
+
FrASTBuilder,
|
|
1491
|
+
buildFrApiListUrl,
|
|
1492
|
+
buildFrDownloadJsonPath,
|
|
1493
|
+
buildFrDownloadXmlPath,
|
|
1494
|
+
buildFrFrontmatter,
|
|
1495
|
+
buildFrOutputPath,
|
|
1496
|
+
buildMonthDir,
|
|
1497
|
+
buildYearDir,
|
|
1498
|
+
convertFrDocuments,
|
|
1499
|
+
downloadFrDocuments,
|
|
1500
|
+
downloadSingleFrDocument
|
|
1501
|
+
};
|
|
1502
|
+
//# sourceMappingURL=index.js.map
|