@lexbuild/fr 1.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,1502 @@
1
+ // src/fr-elements.ts
2
/** Element names that open one Federal Register document (see FrASTBuilder.openDocument). */
var FR_DOCUMENT_TYPE_KEYS = ["RULE", "PRORULE", "NOTICE", "PRESDOCU"];

/** Set form of FR_DOCUMENT_TYPE_KEYS for O(1) membership tests. */
var FR_DOCUMENT_ELEMENTS = new Set(FR_DOCUMENT_TYPE_KEYS);

/** Grouping wrappers around documents; the builder pushes no frame for them. */
var FR_SECTION_CONTAINERS = /* @__PURE__ */ new Set(["RULES", "PRORULES", "NOTICES", "PRESDOCS"]);

/** Document element name -> normalized document type string. */
var FR_DOCUMENT_TYPE_MAP = {
  RULE: "rule",
  PRORULE: "proposed_rule",
  NOTICE: "notice",
  PRESDOCU: "presidential_document"
};

/** Preamble sections; each one is documented upstream as "HD + P". */
var FR_PREAMBLE_SECTIONS = /* @__PURE__ */ new Set([
  "AGY", // Agency section
  "ACT", // Action section
  "SUM", // Summary section
  "DATES", // Dates section
  "EFFDATE", // Effective date section
  "ADD", // Addresses section
  "FURINF" // Further information section
]);

/** Preamble elements whose text is captured as document metadata. */
var FR_PREAMBLE_META_ELEMENTS = /* @__PURE__ */ new Set([
  "AGENCY", // Issuing agency name (attrs: TYPE)
  "SUBAGY", // Sub-agency name
  "CFR", // CFR citation affected (e.g., "10 CFR Part 2")
  "SUBJECT", // Document title/subject
  "DEPDOC", // Department document number
  "RIN" // Regulation Identifier Number
]);

/** Paragraph-like elements that become content nodes. */
var FR_CONTENT_ELEMENTS = /* @__PURE__ */ new Set([
  "P", // Paragraph
  "FP" // Flush paragraph (attrs: SOURCE for indent level)
]);

/** Heading element name. */
var FR_HEADING_ELEMENT = "HD";

/** HD `SOURCE` attribute value -> heading depth. */
var FR_HD_SOURCE_TO_DEPTH = { HED: 1, HD1: 2, HD2: 3, HD3: 4, HD4: 5, HD5: 6, HD6: 6, HD8: 6 };

/** Inline formatting elements. */
var FR_INLINE_ELEMENTS = /* @__PURE__ */ new Set([
  "I", // Italic
  "B", // Bold
  "E", // Emphasis (type varies by T attribute)
  "SU", // Superscript / footnote marker
  "FR", // Fraction
  "AC" // Accent/diacritical
]);

/**
 * `<E T="...">` code -> inline type.
 * NOTE(review): GPO markup is commonly described as using T="51" for
 * superscript — confirm the "sub" mapping for that code.
 */
var FR_EMPHASIS_MAP = {
  "01": "bold",
  "02": "italic",
  "03": "bold", // bold italic in print — treat as bold for Markdown
  "04": "italic", // italic in headings
  "05": "italic", // small caps — render as italic
  "51": "sub", // subscript
  "52": "sub", // subscript
  "54": "sub", // subscript (math)
  "7462": "italic" // special terms (et seq., De minimis)
};

/** Elements that make up regulatory text. */
var FR_REGTEXT_ELEMENTS = /* @__PURE__ */ new Set([
  "REGTEXT", // Regulatory text container (attrs: TITLE, PART)
  "AMDPAR", // Amendment instruction paragraph
  "SECTION", // Section container
  "SECTNO", // Section number designation
  "PART", // Part container within REGTEXT
  "AUTH" // Authority citation in REGTEXT
]);

/** Element name handled as a plain block frame by the builder. */
var FR_LSTSUB_ELEMENT = "LSTSUB";

/** Signature block and its fields. */
var FR_SIGNATURE_ELEMENTS = /* @__PURE__ */ new Set([
  "SIG", // Signature block container
  "NAME", // Signer name
  "TITLE", // Signer title
  "DATED" // Date of signature
]);

/** Presidential document subtype wrappers; the builder pushes no frame for them. */
var FR_PRESIDENTIAL_SUBTYPES = /* @__PURE__ */ new Set([
  "EXECORD", // Executive Order
  "PRMEMO", // Presidential Memorandum
  "PROCLA", // Proclamation
  "DETERM", // Presidential Determination
  "PRNOTICE", // Presidential Notice
  "PRORDER" // Presidential Order
]);

/** Metadata-like elements found in presidential documents. */
var FR_PRESIDENTIAL_META_ELEMENTS = /* @__PURE__ */ new Set([
  "PSIG", // Presidential signature (initials)
  "PLACE", // Place of issuance
  "TITLE3", // CFR Title 3 marker
  "PRES" // President name
]);

/** Note-like elements. */
var FR_NOTE_ELEMENTS = /* @__PURE__ */ new Set([
  "FTNT", // Footnote
  "EDNOTE", // Editorial note
  "OLNOTE1" // Overlay note
]);

/** Footnote reference element name. */
var FR_FTREF_ELEMENT = "FTREF";

/** Block-level wrappers. */
var FR_BLOCK_ELEMENTS = /* @__PURE__ */ new Set([
  "EXTRACT", // Extracted/quoted text
  "EXAMPLE" // Illustrative example
]);

/** GPOTABLE structure elements. */
var FR_TABLE_ELEMENTS = /* @__PURE__ */ new Set([
  "GPOTABLE", // Table root
  "TTITLE", // Table title
  "BOXHD", // Header box container
  "CHED", // Column header entry (attrs: H for level)
  "ROW", // Data row (attrs: RUL for horizontal rules)
  "ENT" // Cell entry (attrs: I for indent, A for alignment)
]);

/** Elements dropped together with everything nested inside them. */
var FR_IGNORE_ELEMENTS = /* @__PURE__ */ new Set([
  "CNTNTS", // Table of contents in daily issue
  "GPH", // Graphics (not available in XML)
  "GID" // Graphics ID
]);

/** Elements the builder also drops together with their subtree. */
var FR_SKIP_ELEMENTS = /* @__PURE__ */ new Set([
  "PRTPAGE", // Page number reference (attrs: P for page)
  "STARS", // Visual separator (****)
  "FILED", // Filing info
  "UNITNAME", // Section name in daily issue
  "VOL", // Volume number (daily issue metadata)
  "NO", // Issue number (daily issue metadata)
  "DATE", // Date (daily issue level — document dates from preamble)
  "NEWPART", // New part container in daily issue
  "PTITLE", // Part title in daily issue
  "PARTNO", // Part number in daily issue
  "PNOTICE" // Part notice text
]);

/** Wrappers with no frame of their own; their children are handled individually. */
var FR_PASSTHROUGH_ELEMENTS = /* @__PURE__ */ new Set([
  "FEDREG", // Daily issue root element
  "PREAMB", // Preamble
  "SUPLINF" // Supplementary information
]);

/** Element whose text carries the "FR Doc. <number>" filing line. */
var FR_FRDOC_ELEMENT = "FRDOC";

/** Element name the builder drops together with its subtree. */
var FR_BILCOD_ELEMENT = "BILCOD";
215
+
216
+ // src/fr-builder.ts
217
/**
 * Builds AST nodes from SAX events of Federal Register XML.
 *
 * Feed it `onOpenElement` / `onText` / `onCloseElement` calls in document
 * order. Each time a document element (see FR_DOCUMENT_ELEMENTS) closes, the
 * finished "level" node is passed to `options.onEmit(node, context)` and a
 * copy of its metadata is appended to the list returned by getDocumentMetas().
 *
 * State is a stack of frames `{ kind, elementName, textBuffer, node?, ... }`,
 * one per open element that needs tracking.
 */
var FrASTBuilder = class {
  /** Builder options; `options.onEmit` is invoked from closeDocument(). */
  options;
  /** Frames for the currently open elements, innermost last. */
  stack = [];
  /** Depth inside an ignored/skipped subtree; while > 0 every event is dropped. */
  ignoredContainerDepth = 0;
  /** Metadata extracted from current document */
  currentDocMeta = {
    documentType: "",
    documentTypeNormalized: ""
  };
  /** All document metadata collected during parsing */
  documentMetas = [];

  /** @param options Object providing `onEmit(node, context)`. */
  constructor(options) {
    this.options = options;
  }

  /** Get metadata for all documents parsed so far */
  getDocumentMetas() {
    return this.documentMetas;
  }

  /**
   * Handle SAX open element.
   * @param name Element name.
   * @param attrs Attribute map (defaults to an empty object).
   */
  onOpenElement(name, attrs = {}) {
    // Inside a dropped subtree: only track nesting so the matching close is found.
    if (this.ignoredContainerDepth > 0) {
      this.ignoredContainerDepth++;
      return;
    }
    if (FR_IGNORE_ELEMENTS.has(name) || FR_SKIP_ELEMENTS.has(name) || name === FR_BILCOD_ELEMENT) {
      this.ignoredContainerDepth = 1;
      return;
    }
    // Pure wrappers: no frame, children are processed individually.
    if (FR_PASSTHROUGH_ELEMENTS.has(name) || FR_SECTION_CONTAINERS.has(name) || FR_PRESIDENTIAL_SUBTYPES.has(name)) {
      return;
    }
    if (FR_DOCUMENT_ELEMENTS.has(name)) {
      this.openDocument(name);
      return;
    }
    if (FR_PRESIDENTIAL_META_ELEMENTS.has(name)) {
      if (name === "PSIG" || name === "PLACE") {
        this.openContent(name);
      } else {
        this.pushFrame("ignore", name);
      }
      return;
    }
    if (FR_PREAMBLE_META_ELEMENTS.has(name)) {
      this.pushFrame("preambleMeta", name);
      return;
    }
    if (FR_PREAMBLE_SECTIONS.has(name)) {
      this.pushFrame("preambleSection", name);
      return;
    }
    if (name === FR_HEADING_ELEMENT) {
      this.openHeading(name, attrs);
      return;
    }
    if (FR_CONTENT_ELEMENTS.has(name)) {
      this.openContent(name);
      return;
    }
    if (FR_INLINE_ELEMENTS.has(name)) {
      this.openInline(name, attrs);
      return;
    }
    if (name === FR_FTREF_ELEMENT) {
      const node = {
        type: "inline",
        inlineType: "footnoteRef",
        idref: attrs["ID"]
      };
      this.pushFrame("inline", name, { node });
      return;
    }
    if (FR_NOTE_ELEMENTS.has(name)) {
      this.openNote(name);
      return;
    }
    if (FR_REGTEXT_ELEMENTS.has(name)) {
      this.openRegtext(name, attrs);
      return;
    }
    if (name === FR_LSTSUB_ELEMENT || FR_BLOCK_ELEMENTS.has(name)) {
      this.pushFrame("block", name);
      return;
    }
    if (FR_SIGNATURE_ELEMENTS.has(name)) {
      this.openSignature(name);
      return;
    }
    if (FR_TABLE_ELEMENTS.has(name)) {
      this.openTableElement(name, attrs);
      return;
    }
    if (name === FR_FRDOC_ELEMENT) {
      this.pushFrame("frdoc", name);
      return;
    }
    // Unknown element: keep a placeholder frame so its close stays balanced.
    this.pushFrame("ignore", name);
  }

  /** Handle SAX close element */
  onCloseElement(name) {
    if (this.ignoredContainerDepth > 0) {
      this.ignoredContainerDepth--;
      return;
    }
    if (FR_PASSTHROUGH_ELEMENTS.has(name) || FR_SECTION_CONTAINERS.has(name) || FR_PRESIDENTIAL_SUBTYPES.has(name)) {
      return;
    }
    if (FR_DOCUMENT_ELEMENTS.has(name)) {
      this.closeDocument(name);
      return;
    }
    if (FR_PREAMBLE_META_ELEMENTS.has(name)) {
      this.closePreambleMeta(name);
      return;
    }
    if (FR_PREAMBLE_SECTIONS.has(name)) {
      this.popFrame(name);
      return;
    }
    if (name === FR_HEADING_ELEMENT) {
      this.closeHeading(name);
      return;
    }
    if (FR_CONTENT_ELEMENTS.has(name) || name === "PSIG" || name === "PLACE") {
      this.closeContent(name);
      return;
    }
    if (FR_INLINE_ELEMENTS.has(name) || name === FR_FTREF_ELEMENT) {
      this.closeInline(name);
      return;
    }
    if (FR_NOTE_ELEMENTS.has(name)) {
      this.closeNote(name);
      return;
    }
    if (FR_REGTEXT_ELEMENTS.has(name)) {
      this.closeRegtext(name);
      return;
    }
    if (name === FR_LSTSUB_ELEMENT) {
      this.popFrame(name);
      return;
    }
    if (FR_SIGNATURE_ELEMENTS.has(name)) {
      this.closeSignature(name);
      return;
    }
    if (FR_BLOCK_ELEMENTS.has(name)) {
      this.popFrame(name);
      return;
    }
    if (FR_TABLE_ELEMENTS.has(name)) {
      this.closeTableElement(name);
      return;
    }
    if (name === FR_FRDOC_ELEMENT) {
      this.closeFrdoc();
      return;
    }
    // Placeholder ("ignore") frames are only removed when they are on top.
    if (this.stack.length > 0 && this.stack[this.stack.length - 1]?.elementName === name) {
      this.stack.pop();
    }
  }

  /** Handle SAX text content */
  onText(text) {
    if (this.ignoredContainerDepth > 0) return;
    const frame = this.stack[this.stack.length - 1];
    if (!frame) return;
    if (this.buffersText(frame)) {
      frame.textBuffer += text;
      return;
    }
    if (frame.kind === "content" && frame.node?.type === "content") {
      if (text) {
        frame.node.children.push({ type: "inline", inlineType: "text", text });
      }
      return;
    }
    if (frame.kind === "inline" && frame.node?.type === "inline") {
      const inlineNode = frame.node;
      if (inlineNode.children) {
        inlineNode.children.push({ type: "inline", inlineType: "text", text });
      } else {
        // Nodes without a children array (footnote refs) carry text directly.
        inlineNode.text = (inlineNode.text ?? "") + text;
      }
      return;
    }
    if (frame.kind === "note" && frame.node?.type === "note") {
      frame.textBuffer += text;
    }
  }

  // ── Private helpers: Frames ──

  /** Push a frame with an empty text buffer plus any extra fields. */
  pushFrame(kind, elementName, extra = {}) {
    this.stack.push({ kind, elementName, ...extra, textBuffer: "" });
  }

  /** True for frame kinds whose character data is accumulated as plain text. */
  buffersText(frame) {
    switch (frame.kind) {
      case "heading":
      case "preambleMeta":
      case "signatureField":
      case "tableCell":
      case "tableHeader":
      case "frdoc":
        return true;
      default:
        return false;
    }
  }

  /** Build a content node holding a single inline child. */
  makeSingleInlineContent(inlineType, text) {
    return {
      type: "content",
      variant: "content",
      children: [{ type: "inline", inlineType, text }]
    };
  }

  /** Flatten an inline node (own text, else all descendants) to plain text. */
  collectInlineText(node) {
    if (node.text) return node.text;
    let collected = "";
    for (const child of node.children ?? []) {
      collected += this.collectInlineText(child);
    }
    return collected;
  }

  // ── Private helpers: Document ──

  /** Reset per-document metadata and push a "document" frame with a level node. */
  openDocument(elementName) {
    this.currentDocMeta = {
      documentType: elementName,
      documentTypeNormalized: FR_DOCUMENT_TYPE_MAP[elementName] ?? elementName.toLowerCase()
    };
    const node = {
      type: "level",
      levelType: "section",
      children: [],
      sourceElement: elementName
    };
    this.pushFrame("document", elementName, { node });
  }

  /** Finalize the document node, record its metadata and emit it. */
  closeDocument(elementName) {
    const frame = this.popFrame(elementName);
    if (!frame || frame.kind !== "document" || !frame.node) return;
    const levelNode = frame.node;
    const { subject, documentNumber } = this.currentDocMeta;
    if (subject) {
      levelNode.heading = subject;
    }
    if (documentNumber) {
      levelNode.identifier = `/us/fr/${documentNumber}`;
      levelNode.numValue = documentNumber;
    }
    // Any document frames still open are this document's ancestors.
    const ancestors = [];
    for (const open of this.stack) {
      if (open.kind === "document" && open.node?.type === "level") {
        const { levelType, numValue, heading, identifier } = open.node;
        ancestors.push({ levelType, numValue, heading, identifier });
      }
    }
    const context = {
      ancestors,
      documentMeta: {
        dcTitle: this.currentDocMeta.subject,
        dcType: this.currentDocMeta.documentTypeNormalized
      }
    };
    this.documentMetas.push({ ...this.currentDocMeta });
    this.options.onEmit(levelNode, context);
  }

  // ── Private helpers: Preamble ──

  /** Store the trimmed text of a preamble metadata element on currentDocMeta. */
  closePreambleMeta(elementName) {
    const frame = this.popFrame(elementName);
    if (!frame || frame.kind !== "preambleMeta") return;
    const text = frame.textBuffer.trim();
    if (!text) return;
    switch (elementName) {
      case "AGENCY":
        this.currentDocMeta.agency = text;
        break;
      case "SUBAGY":
        this.currentDocMeta.subAgency = text;
        break;
      case "CFR":
        this.currentDocMeta.cfrCitation = text;
        break;
      case "SUBJECT":
        this.currentDocMeta.subject = text;
        break;
      case "RIN":
        // Drop a leading "RIN " label so only the number is kept.
        this.currentDocMeta.rin = text.replace(/^RIN\s+/i, "").trim();
        break;
      case "DEPDOC":
        break;
    }
  }

  // ── Private helpers: Heading ──

  /** Push a heading frame; depth comes from the SOURCE attribute (default "HD1"). */
  openHeading(_elementName, attrs) {
    const source = attrs["SOURCE"] ?? "HD1";
    // Own-property check: attribute values must not resolve to Object.prototype members.
    const depth = Object.hasOwn(FR_HD_SOURCE_TO_DEPTH, source) ? FR_HD_SOURCE_TO_DEPTH[source] : 3;
    this.pushFrame("heading", FR_HEADING_ELEMENT, { headerLevel: depth });
  }

  /** Emit the heading text as a bold content node on the enclosing document. */
  closeHeading(elementName) {
    const frame = this.popFrame(elementName);
    if (!frame || frame.kind !== "heading") return;
    const headingText = frame.textBuffer.trim();
    if (!headingText) return;
    this.addToDocument(this.makeSingleInlineContent("bold", headingText));
  }

  // ── Private helpers: Content ──

  /** Push a content frame with an empty content node. */
  openContent(elementName) {
    const node = {
      type: "content",
      variant: "content",
      children: []
    };
    this.pushFrame("content", elementName, { node });
  }

  /**
   * Attach a non-empty content node to its nearest enclosing container
   * (note, signature block or document), so that paragraphs inside a note
   * stay inside that note instead of leaking into the document body.
   */
  closeContent(elementName) {
    const frame = this.popFrame(elementName);
    if (!frame || !frame.node) return;
    const contentNode = frame.node;
    if (contentNode.children.length === 0) return;
    const parentNode = this.findContentParent()?.node;
    if (parentNode?.type === "level" || parentNode?.type === "note") {
      parentNode.children.push(contentNode);
    }
  }

  /** Innermost open frame that can own content nodes. */
  findContentParent() {
    for (let i = this.stack.length - 1; i >= 0; i--) {
      const kind = this.stack[i]?.kind;
      if (kind === "document" || kind === "note" || kind === "signature") {
        return this.stack[i];
      }
    }
    return void 0;
  }

  // ── Private helpers: Inline ──

  /** Push an inline frame; `<E>` resolves its type from the T attribute. */
  openInline(elementName, attrs) {
    let inlineType = "text";
    switch (elementName) {
      case "I":
        inlineType = "italic";
        break;
      case "B":
        inlineType = "bold";
        break;
      case "SU":
        inlineType = "sup";
        break;
      case "E": {
        const tValue = attrs["T"] ?? "";
        inlineType = Object.hasOwn(FR_EMPHASIS_MAP, tValue) ? FR_EMPHASIS_MAP[tValue] : "italic";
        break;
      }
      default:
        // Remaining inline elements stay plain text.
        break;
    }
    const node = {
      type: "inline",
      inlineType,
      children: []
    };
    this.pushFrame("inline", elementName, { node });
  }

  /**
   * Hand a finished inline node to its parent frame: content and inline
   * parents receive the node itself; text-buffering parents (headings,
   * metadata, table cells/headers, signature fields, FRDOC, notes) receive
   * its flattened text so formatted text is not lost.
   */
  closeInline(elementName) {
    const frame = this.popFrame(elementName);
    if (!frame || !frame.node) return;
    const inlineNode = frame.node;
    if (inlineNode.inlineType === "footnoteRef" && frame.textBuffer) {
      inlineNode.text = frame.textBuffer.trim();
    }
    const parentFrame = this.stack[this.stack.length - 1];
    if (!parentFrame) return;
    if (parentFrame.kind === "content" && parentFrame.node?.type === "content") {
      parentFrame.node.children.push(inlineNode);
      return;
    }
    if (parentFrame.kind === "inline" && parentFrame.node?.type === "inline") {
      parentFrame.node.children?.push(inlineNode);
      return;
    }
    if (this.buffersText(parentFrame) || parentFrame.kind === "note") {
      parentFrame.textBuffer += this.collectInlineText(inlineNode);
    }
  }

  // ── Private helpers: Notes ──

  /** Push a note frame; unmapped element names use their lowercase name as type. */
  openNote(elementName) {
    const noteTypeMap = {
      FTNT: "footnote",
      EDNOTE: "editorial",
      OLNOTE1: "general"
    };
    const noteType = Object.hasOwn(noteTypeMap, elementName) ? noteTypeMap[elementName] : elementName.toLowerCase();
    const node = {
      type: "note",
      noteType,
      children: []
    };
    this.pushFrame("note", elementName, { node });
  }

  /** Attach the note to the enclosing document, wrapping bare text if needed. */
  closeNote(elementName) {
    const frame = this.popFrame(elementName);
    if (!frame || !frame.node) return;
    const noteNode = frame.node;
    const looseText = frame.textBuffer.trim();
    // Text directly inside the note is only used when no child content exists.
    if (looseText && noteNode.children.length === 0) {
      noteNode.children.push(this.makeSingleInlineContent("text", looseText));
    }
    const parentDoc = this.findParentDocument();
    if (parentDoc?.node && parentDoc.node.type === "level") {
      parentDoc.node.children.push(noteNode);
    }
  }

  // ── Private helpers: Regulatory text ──

  /** Open one of the FR_REGTEXT_ELEMENTS. */
  openRegtext(elementName, attrs) {
    switch (elementName) {
      case "REGTEXT": {
        const title = attrs["TITLE"] ?? "";
        const part = attrs["PART"] ?? "";
        // A bold "<title> CFR Part <part>" label is added only when both are present.
        if (title && part) {
          this.addToDocument(this.makeSingleInlineContent("bold", `${title} CFR Part ${part}`));
        }
        this.pushFrame("regtext", elementName);
        return;
      }
      case "AMDPAR":
      case "SECTNO":
        this.openContent(elementName);
        return;
      case "SECTION":
      case "PART":
        this.pushFrame("block", elementName);
        return;
      case "AUTH":
        this.openNote(elementName);
        return;
    }
  }

  /** Close one of the FR_REGTEXT_ELEMENTS (mirror of openRegtext). */
  closeRegtext(elementName) {
    switch (elementName) {
      case "REGTEXT":
      case "SECTION":
      case "PART":
        this.popFrame(elementName);
        return;
      case "AMDPAR":
      case "SECTNO":
        this.closeContent(elementName);
        return;
      case "AUTH":
        this.closeNote(elementName);
        return;
    }
  }

  // ── Private helpers: Signature block ──

  /** SIG opens a signature note; the other elements open text-buffering fields. */
  openSignature(elementName) {
    if (elementName === "SIG") {
      const node = {
        type: "note",
        noteType: "signature",
        children: []
      };
      this.pushFrame("signature", elementName, { node });
      return;
    }
    this.pushFrame("signatureField", elementName);
  }

  /** Close SIG (attach to document) or a field (append its text to the open SIG). */
  closeSignature(elementName) {
    const frame = this.popFrame(elementName);
    if (elementName === "SIG") {
      if (!frame || !frame.node) return;
      const parentDoc = this.findParentDocument();
      if (parentDoc?.node && parentDoc.node.type === "level") {
        parentDoc.node.children.push(frame.node);
      }
      return;
    }
    if (!frame || frame.kind !== "signatureField") return;
    const text = frame.textBuffer.trim();
    if (!text) return;
    const sigFrame = this.findFrame("signature");
    if (sigFrame?.node && sigFrame.node.type === "note") {
      sigFrame.node.children.push(this.makeSingleInlineContent("text", text));
    }
  }

  // ── Private helpers: GPOTABLE ──

  /** Open one of the FR_TABLE_ELEMENTS. */
  openTableElement(elementName, _attrs) {
    switch (elementName) {
      case "GPOTABLE":
        this.pushFrame("table", elementName, { headers: [], rows: [], currentRow: [] });
        return;
      case "TTITLE":
        this.pushFrame("heading", elementName);
        return;
      case "BOXHD":
        // Header container: no frame, CHED children are handled directly.
        return;
      case "CHED":
        this.pushFrame("tableHeader", elementName);
        return;
      case "ROW": {
        const tableFrame = this.findTableFrame();
        if (tableFrame) {
          tableFrame.currentRow = [];
        }
        this.pushFrame("tableRow", elementName);
        return;
      }
      case "ENT":
        this.pushFrame("tableCell", elementName);
        return;
    }
  }

  /** Close one of the FR_TABLE_ELEMENTS. */
  closeTableElement(elementName) {
    switch (elementName) {
      case "GPOTABLE":
        this.closeGpoTable();
        return;
      case "TTITLE":
        // The buffered title text is discarded with the frame.
        this.popFrame(elementName);
        return;
      case "BOXHD":
        return;
      case "CHED":
        this.closeTableHeader();
        return;
      case "ROW":
        this.closeTableRow();
        return;
      case "ENT":
        this.closeTableCell();
        return;
    }
  }

  /** Turn the collected headers/rows into a table node on the document. */
  closeGpoTable() {
    const frame = this.popFrame("GPOTABLE");
    if (!frame || frame.kind !== "table") return;
    const tableNode = {
      type: "table",
      variant: "xhtml",
      // Reuse the same variant for rendering
      headers: frame.headers ?? [],
      rows: frame.rows ?? []
    };
    const parentDoc = this.findParentDocument();
    if (parentDoc?.node && parentDoc.node.type === "level") {
      parentDoc.node.children.push(tableNode);
    }
  }

  /** Append the header cell text to the table's single header row. */
  closeTableHeader() {
    const headerFrame = this.popFrame("CHED");
    if (!headerFrame || headerFrame.kind !== "tableHeader") return;
    const tableFrame = this.findTableFrame();
    if (!tableFrame) return;
    const text = headerFrame.textBuffer.trim();
    if (!tableFrame.headers || tableFrame.headers.length === 0) {
      tableFrame.headers = [[]];
    }
    tableFrame.headers[0]?.push(text);
  }

  /** Move the cells gathered for this row into the table's rows. */
  closeTableRow() {
    const rowFrame = this.popFrame("ROW");
    if (!rowFrame) return;
    const tableFrame = this.findTableFrame();
    if (tableFrame?.currentRow) {
      tableFrame.rows?.push([...tableFrame.currentRow]);
      tableFrame.currentRow = [];
    }
  }

  /** Append the cell text to the row being built. */
  closeTableCell() {
    // Pop by name, like every other closer, so a stray frame above the cell
    // cannot be removed in its place.
    const cellFrame = this.popFrame("ENT");
    if (!cellFrame || cellFrame.kind !== "tableCell") return;
    const tableFrame = this.findTableFrame();
    if (tableFrame?.currentRow) {
      tableFrame.currentRow.push(cellFrame.textBuffer.trim());
    }
  }

  // ── Private helpers: FRDOC ──

  /** Extract the document number from "FR Doc. <number>" text. */
  closeFrdoc() {
    const frame = this.popFrame(FR_FRDOC_ELEMENT);
    if (!frame || frame.kind !== "frdoc") return;
    const text = frame.textBuffer.trim();
    // Digits and hyphens, optionally preceded by a short letter prefix such as
    // "E9-"; every string the digits-only pattern matched yields the same capture.
    const match = /FR\s+Doc\.\s+((?:[A-Z]\d{0,2}-)?[\d-]+)/i.exec(text);
    if (match) {
      this.currentDocMeta.documentNumber = match[1];
    }
  }

  // ── Private helpers: Stack navigation ──

  /** Append a node to the innermost open document, if any. */
  addToDocument(node) {
    const docFrame = this.findParentDocument();
    if (docFrame?.node && docFrame.node.type === "level") {
      docFrame.node.children.push(node);
    }
  }

  /** Innermost open "document" frame. */
  findParentDocument() {
    return this.findFrame("document");
  }

  /** Innermost open "note" or "signature" frame. */
  findParentNote() {
    for (let i = this.stack.length - 1; i >= 0; i--) {
      if (this.stack[i]?.kind === "note" || this.stack[i]?.kind === "signature") {
        return this.stack[i];
      }
    }
    return void 0;
  }

  /** Innermost open "table" frame. */
  findTableFrame() {
    return this.findFrame("table");
  }

  /** Innermost open frame of the given kind, or undefined. */
  findFrame(kind) {
    for (let i = this.stack.length - 1; i >= 0; i--) {
      if (this.stack[i]?.kind === kind) {
        return this.stack[i];
      }
    }
    return void 0;
  }

  /**
   * Remove and return the innermost frame opened for `elementName`.
   * Frames above it are left in place. Logs a warning when the stack is
   * non-empty but holds no such frame.
   */
  popFrame(elementName) {
    if (this.stack.length === 0) return void 0;
    for (let i = this.stack.length - 1; i >= 0; i--) {
      if (this.stack[i]?.elementName === elementName) {
        return this.stack.splice(i, 1)[0];
      }
    }
    console.warn(
      `FrASTBuilder: no matching frame for closing element </${elementName}>, stack has: [${this.stack.map((f) => f.elementName).join(", ")}]`
    );
    return void 0;
  }
};
947
+
948
+ // src/fr-frontmatter.ts
949
/**
 * Convert a document type label from the JSON metadata into its normalized
 * snake_case form.
 *
 * Known labels use a fixed mapping; any other string is lowercased with
 * whitespace runs replaced by "_". A missing or non-string value yields ""
 * instead of throwing.
 *
 * @param {string} apiType Type label such as "Proposed Rule".
 * @returns {string} Normalized type, or "" when `apiType` is not a string.
 */
function normalizeDocumentType(apiType) {
  if (typeof apiType !== "string") return "";
  const map = {
    Rule: "rule",
    "Proposed Rule": "proposed_rule",
    Notice: "notice",
    "Presidential Document": "presidential_document"
  };
  // Own-property check so labels like "constructor" never resolve to
  // members inherited from Object.prototype.
  if (Object.hasOwn(map, apiType)) return map[apiType];
  return apiType.toLowerCase().replace(/\s+/g, "_");
}
958
/**
 * Build the frontmatter object for one Federal Register document.
 *
 * Values from the JSON metadata (`jsonMeta`) take precedence; fields it does
 * not provide fall back to metadata extracted from the XML (`xmlMeta`) and
 * then to the node itself.
 *
 * @param node Document node; `identifier` and `heading` are read.
 * @param _context Unused.
 * @param xmlMeta Metadata collected while parsing the XML.
 * @param [jsonMeta] Optional JSON metadata for the same document.
 * @returns Frontmatter object; optional fields that are unknown are `undefined`.
 */
function buildFrFrontmatter(node, _context, xmlMeta, jsonMeta) {
  const documentNumber = jsonMeta?.document_number ?? xmlMeta.documentNumber ?? "";
  const subject = jsonMeta?.title ?? xmlMeta.subject ?? node.heading ?? "";
  const publicationDate = jsonMeta?.publication_date ?? "";
  // Only normalize a type that is actually present; JSON metadata without a
  // `type` falls back to the XML-derived type instead of crashing.
  const documentType = jsonMeta?.type ? normalizeDocumentType(jsonMeta.type) : xmlMeta.documentTypeNormalized;

  let agencies;
  if (jsonMeta?.agencies && jsonMeta.agencies.length > 0) {
    // NOTE(review): entries lacking `name` presumably still carry `raw_name`
    // — confirm against the API schema. Nameless entries are dropped rather
    // than emitted as `undefined` list items.
    const names = jsonMeta.agencies.map((a) => a?.name ?? a?.raw_name).filter(Boolean);
    if (names.length > 0) {
      agencies = names;
    }
  }
  if (!agencies && xmlMeta.agency) {
    agencies = xmlMeta.subAgency ? [xmlMeta.agency, xmlMeta.subAgency] : [xmlMeta.agency];
  }

  let cfrReferences;
  if (jsonMeta?.cfr_references && jsonMeta.cfr_references.length > 0) {
    cfrReferences = jsonMeta.cfr_references.map((r) => `${r.title} CFR Part ${r.part}`);
  } else if (xmlMeta.cfrCitation) {
    cfrReferences = [xmlMeta.cfrCitation];
  }

  const docketIds = jsonMeta?.docket_ids && jsonMeta.docket_ids.length > 0 ? jsonMeta.docket_ids : void 0;
  const rin = jsonMeta?.regulation_id_numbers?.[0] ?? xmlMeta.rin;

  // Key order is kept stable in case a serializer emits keys in insertion order.
  return {
    source: "fr",
    legal_status: "authoritative_unofficial",
    identifier: node.identifier ?? `/us/fr/${documentNumber}`,
    title: subject,
    title_number: 0,
    // FR documents don't belong to a USC/CFR title
    title_name: "Federal Register",
    section_number: documentNumber,
    section_name: subject,
    positive_law: false,
    currency: publicationDate,
    last_updated: publicationDate,
    // Shared optional fields
    agency: agencies?.[0],
    // FR-specific fields
    document_number: documentNumber || void 0,
    document_type: documentType || void 0,
    fr_citation: jsonMeta?.citation,
    fr_volume: jsonMeta?.volume,
    publication_date: publicationDate || void 0,
    agencies,
    cfr_references: cfrReferences,
    docket_ids: docketIds,
    rin: rin || void 0,
    effective_date: jsonMeta?.effective_on ?? void 0,
    comments_close_date: jsonMeta?.comments_close_on ?? void 0,
    fr_action: jsonMeta?.action ?? void 0
  };
}
1016
+
1017
+ // src/fr-path.ts
1018
+ import { join } from "path";
1019
/**
 * Path of the rendered Markdown file for a document:
 * `<outputRoot>/fr/<year>/<month>/<documentNumber>.md`, with year and month
 * taken from `publicationDate` via parseDateComponents().
 */
function buildFrOutputPath(documentNumber, publicationDate, outputRoot) {
  const dateParts = parseDateComponents(publicationDate);
  const fileName = `${documentNumber}.md`;
  return join(outputRoot, "fr", dateParts.year, dateParts.month, fileName);
}
1023
/**
 * Path of a downloaded XML file:
 * `<downloadRoot>/<year>/<month>/<documentNumber>.xml`, with year and month
 * taken from `publicationDate` via parseDateComponents().
 */
function buildFrDownloadXmlPath(documentNumber, publicationDate, downloadRoot) {
  const dateParts = parseDateComponents(publicationDate);
  const segments = [downloadRoot, dateParts.year, dateParts.month, `${documentNumber}.xml`];
  return join(...segments);
}
1027
/**
 * Path of a downloaded JSON metadata file:
 * `<downloadRoot>/<year>/<month>/<documentNumber>.json`, with year and month
 * taken from `publicationDate` via parseDateComponents().
 */
function buildFrDownloadJsonPath(documentNumber, publicationDate, downloadRoot) {
  const dateParts = parseDateComponents(publicationDate);
  const segments = [downloadRoot, dateParts.year, dateParts.month, `${documentNumber}.json`];
  return join(...segments);
}
1031
/** Directory holding one month of output: `<outputRoot>/fr/<year>/<month>`. */
function buildMonthDir(year, month, outputRoot) {
  const segments = [outputRoot, "fr", year, month];
  return join(...segments);
}
1034
/** Directory holding one year of output: `<outputRoot>/fr/<year>`. */
function buildYearDir(year, outputRoot) {
  const segments = [outputRoot, "fr", year];
  return join(...segments);
}
1037
/**
 * Split a "YYYY-MM-DD"-style date into its year and month parts.
 *
 * Missing or empty parts default to "0000" / "00". A value that is not a
 * string (e.g. `undefined` when no publication date is known) is treated as
 * an empty date instead of throwing.
 *
 * @param {string} date Hyphen-separated date string.
 * @returns {{ year: string, month: string }}
 */
function parseDateComponents(date) {
  const [year, month] = typeof date === "string" ? date.split("-") : [];
  return {
    year: year || "0000",
    month: month || "00"
  };
}
1044
+
1045
+ // src/converter.ts
1046
+ import { createReadStream, existsSync } from "fs";
1047
+ import { readFile, readdir, stat } from "fs/promises";
1048
+ import { join as join2, dirname } from "path";
1049
+ import {
1050
+ XMLParser,
1051
+ renderDocument,
1052
+ createLinkResolver,
1053
+ writeFile,
1054
+ mkdir
1055
+ } from "@lexbuild/core";
1056
/** Set of document element names, used when filtering documents by `options.types`. */
var FR_DOC_TYPE_SET = new Set([...FR_DOCUMENT_TYPE_KEYS]);
1057
+ async function convertFrDocuments(options) {
1058
+ const xmlFiles = await discoverXmlFiles(options.input, options.from, options.to);
1059
+ const files = [];
1060
+ let totalTokenEstimate = 0;
1061
+ let peakMemoryBytes = 0;
1062
+ const linkResolver = createLinkResolver();
1063
+ const parsedFiles = /* @__PURE__ */ new Map();
1064
+ for (const xmlPath of xmlFiles) {
1065
+ try {
1066
+ const collected = await parseXmlFile(xmlPath);
1067
+ parsedFiles.set(xmlPath, collected);
1068
+ } catch (err) {
1069
+ console.warn(
1070
+ `Warning: Failed to parse ${xmlPath}: ${err instanceof Error ? err.message : String(err)}. Skipping.`
1071
+ );
1072
+ }
1073
+ }
1074
+ for (const [, collected] of parsedFiles) {
1075
+ for (const doc of collected) {
1076
+ if (options.types && options.types.length > 0) {
1077
+ if (!FR_DOC_TYPE_SET.has(doc.xmlMeta.documentType) || !options.types.includes(doc.xmlMeta.documentType)) {
1078
+ continue;
1079
+ }
1080
+ }
1081
+ if (doc.node.identifier) {
1082
+ const outputPath = buildFrOutputPath(
1083
+ doc.documentNumber,
1084
+ doc.publicationDate,
1085
+ options.output
1086
+ );
1087
+ linkResolver.register(doc.node.identifier, outputPath);
1088
+ }
1089
+ }
1090
+ }
1091
+ if (options.dryRun) {
1092
+ let count = 0;
1093
+ for (const [, collected] of parsedFiles) {
1094
+ for (const doc of collected) {
1095
+ if (options.types && options.types.length > 0) {
1096
+ if (!FR_DOC_TYPE_SET.has(doc.xmlMeta.documentType) || !options.types.includes(doc.xmlMeta.documentType)) {
1097
+ continue;
1098
+ }
1099
+ }
1100
+ count++;
1101
+ }
1102
+ }
1103
+ return {
1104
+ documentsConverted: count,
1105
+ files: [],
1106
+ totalTokenEstimate: 0,
1107
+ peakMemoryBytes: 0,
1108
+ dryRun: true
1109
+ };
1110
+ }
1111
+ for (const [, collected] of parsedFiles) {
1112
+ for (const doc of collected) {
1113
+ if (options.types && options.types.length > 0) {
1114
+ if (!FR_DOC_TYPE_SET.has(doc.xmlMeta.documentType) || !options.types.includes(doc.xmlMeta.documentType)) {
1115
+ continue;
1116
+ }
1117
+ }
1118
+ const outputPath = buildFrOutputPath(
1119
+ doc.documentNumber,
1120
+ doc.publicationDate,
1121
+ options.output
1122
+ );
1123
+ const frontmatter = buildFrFrontmatter(doc.node, doc.context, doc.xmlMeta, doc.jsonMeta);
1124
+ const markdown = renderDocument(doc.node, frontmatter, {
1125
+ headingOffset: 0,
1126
+ linkStyle: options.linkStyle,
1127
+ resolveLink: options.linkStyle === "relative" ? (id) => linkResolver.resolve(id, outputPath) : void 0
1128
+ });
1129
+ await mkdir(dirname(outputPath), { recursive: true });
1130
+ await writeFile(outputPath, markdown, "utf-8");
1131
+ files.push(outputPath);
1132
+ const tokenEstimate = Math.round(markdown.length / 4);
1133
+ totalTokenEstimate += tokenEstimate;
1134
+ const mem = process.memoryUsage().rss;
1135
+ if (mem > peakMemoryBytes) {
1136
+ peakMemoryBytes = mem;
1137
+ }
1138
+ }
1139
+ }
1140
+ return {
1141
+ documentsConverted: files.length,
1142
+ files,
1143
+ totalTokenEstimate,
1144
+ peakMemoryBytes,
1145
+ dryRun: false
1146
+ };
1147
+ }
1148
/**
 * Parse one Federal Register XML file and collect every document it emits.
 *
 * Each emitted document is paired with the most recent XML metadata reported
 * by the builder. After parsing, a JSON sidecar (same path with `.xml` replaced
 * by `.json`) is loaded if present; a document whose number matches the
 * sidecar's `document_number` takes its metadata and publication date from it.
 * All other documents get a date inferred from the file path, or an empty
 * string (with a warning) when the path carries no date.
 *
 * @param {string} xmlPath - Path of the XML file to parse.
 * @returns {Promise<object[]>} Collected documents with `node`, `context`,
 *   `xmlMeta`, `publicationDate`, `documentNumber`, and optionally `jsonMeta`.
 */
async function parseXmlFile(xmlPath) {
  const collected = [];
  const builder = new FrASTBuilder({
    onEmit: (node, context) => {
      // The metadata entry for the document being emitted is the last one recorded.
      const currentMetas = builder.getDocumentMetas();
      const meta = currentMetas[currentMetas.length - 1];
      if (!meta) {
        console.warn(
          `Warning: No XML metadata extracted for emitted document in ${xmlPath}. Frontmatter will have empty document_type and document_number.`
        );
      }
      collected.push({
        node,
        context,
        xmlMeta: meta ?? { documentType: "", documentTypeNormalized: "" },
        publicationDate: "",
        documentNumber: meta?.documentNumber ?? ""
      });
    }
  });
  const parser = new XMLParser({ defaultNamespace: "" });
  parser.on("openElement", (name, attrs) => builder.onOpenElement(name, attrs));
  parser.on("closeElement", (name) => builder.onCloseElement(name));
  parser.on("text", (text) => builder.onText(text));
  const stream = createReadStream(xmlPath, "utf-8");
  await parser.parseStream(stream);

  const jsonPath = xmlPath.replace(/\.xml$/, ".json");
  let jsonMeta;
  // If xmlPath does not end in ".xml" the replace is a no-op and jsonPath would
  // point at the XML file itself; skip the lookup instead of trying to parse
  // XML as JSON and logging a misleading sidecar warning.
  if (jsonPath !== xmlPath && existsSync(jsonPath)) {
    try {
      const raw = await readFile(jsonPath, "utf-8");
      jsonMeta = JSON.parse(raw);
    } catch (err) {
      console.warn(
        `Warning: Failed to parse JSON sidecar ${jsonPath}: ${err instanceof Error ? err.message : String(err)}. Continuing without enriched metadata.`
      );
    }
  }

  // The path-derived date is the same for every document in this file.
  const inferredDate = inferDateFromPath(xmlPath);
  for (const doc of collected) {
    if (jsonMeta && jsonMeta.document_number === doc.documentNumber) {
      doc.jsonMeta = jsonMeta;
      doc.publicationDate = jsonMeta.publication_date;
    } else {
      if (!inferredDate) {
        console.warn(
          `Warning: No publication date for document ${doc.documentNumber || "(unknown)"} \u2014 no JSON sidecar and path ${xmlPath} has no YYYY/MM/ pattern. Output will be in 0000/00/.`
        );
      }
      doc.publicationDate = inferredDate;
    }
  }
  return collected;
}
1202
/**
 * Resolve the input path to a sorted list of XML files to convert.
 *
 * A file input is returned as-is (no extension check, no date filter). A
 * directory is walked recursively for `*.xml` files, which are then filtered
 * by the optional `from` / `to` bounds.
 *
 * Dates are inferred from the `YYYY/MM/` directory layout and therefore only
 * have month precision, so both bounds are applied per month: a file is kept
 * when its month is not before the month of `from` and not after the month of
 * `to`. Files whose path carries no date are always kept.
 *
 * @param {string} input - File or directory path.
 * @param {string} [from] - Lower bound, e.g. "2026-03-15" or "2026-03".
 * @param {string} [to] - Upper bound, e.g. "2026-03-20" or "2026-03".
 * @returns {Promise<string[]>} Sorted XML file paths.
 * @throws {Error} If the path cannot be accessed or is neither file nor directory.
 */
async function discoverXmlFiles(input, from, to) {
  let inputStat;
  try {
    inputStat = await stat(input);
  } catch (err) {
    throw new Error(
      `Cannot access input path "${input}": ${err instanceof Error ? err.message : String(err)}`,
      { cause: err }
    );
  }
  if (inputStat.isFile()) {
    return [input];
  }
  if (!inputStat.isDirectory()) {
    throw new Error(`Input path "${input}" is not a file or directory`);
  }
  const xmlFiles = [];
  await walkDir(input, xmlFiles);
  let filtered = xmlFiles;
  if (from || to) {
    filtered = xmlFiles.filter((f) => {
      const date = inferDateFromPath(f);
      if (!date) return true;
      // Inferred dates are always the 1st of the month, so comparing the full
      // date against a mid-month `from` would wrongly drop the boundary month.
      // Compare the "YYYY-MM" prefixes instead.
      if (from && date.slice(0, 7) < from.slice(0, 7)) return false;
      // Appending "-32" makes every day of the `to` month compare as in range.
      if (to && date > to + "-32") return false;
      return true;
    });
  }
  return filtered.sort();
}

/**
 * Recursively collect the paths of all `*.xml` files under a directory.
 *
 * @param {string} dir - Directory to walk.
 * @param {string[]} results - Array that found paths are appended to.
 * @returns {Promise<void>}
 */
async function walkDir(dir, results) {
  const entries = await readdir(dir, { withFileTypes: true });
  for (const entry of entries) {
    const fullPath = join2(dir, entry.name);
    if (entry.isDirectory()) {
      await walkDir(fullPath, results);
    } else if (entry.isFile() && entry.name.endsWith(".xml")) {
      results.push(fullPath);
    }
  }
}

/**
 * Infer a publication date from a `.../YYYY/MM/<name>.xml` path.
 *
 * Only year and month are available from the path, so the day is always "01".
 * Both "/" and "\" are accepted as separators so that paths produced by
 * `path.join` on Windows are recognised as well.
 *
 * @param {string} filePath - Path of an XML file.
 * @returns {string} "YYYY-MM-01", or "" when the path has no such pattern.
 */
function inferDateFromPath(filePath) {
  const match = /(\d{4})[\\/](\d{2})[\\/][^\\/]+\.xml$/.exec(filePath);
  if (match) {
    return `${match[1]}-${match[2]}-01`;
  }
  return "";
}
1250
+
1251
+ // src/downloader.ts
1252
+ import { createWriteStream } from "fs";
1253
+ import { mkdir as mkdir2, stat as stat2, writeFile as fsWriteFile } from "fs/promises";
1254
+ import { dirname as dirname2 } from "path";
1255
+ import { pipeline } from "stream/promises";
1256
+ import { Readable } from "stream";
1257
// Base URL of the FederalRegister.gov API.
var FR_API_BASE = "https://www.federalregister.gov/api/v1";
// Page size requested from the document list endpoint.
var PER_PAGE = 200;
// Default pause before each XML download when the caller does not set one.
var DEFAULT_FETCH_DELAY_MS = 100;
// Number of retries after the first attempt of a request.
var MAX_RETRIES = 2;
// First retry delay; doubled on each subsequent attempt.
var RETRY_BASE_DELAY_MS = 2e3;
// Metadata fields requested for every document.
var API_FIELDS = [
  // identity
  "document_number",
  "type",
  "title",
  "publication_date",
  // citation
  "citation",
  "volume",
  "start_page",
  "end_page",
  // cross references
  "agencies",
  "cfr_references",
  "docket_ids",
  "regulation_id_numbers",
  // dates and action
  "effective_on",
  "comments_close_on",
  "action",
  // descriptive
  "abstract",
  "significant",
  "topics",
  // content location
  "full_text_xml_url"
];

/**
 * Build the URL for one page of the document list endpoint.
 *
 * The query asks for documents published between `from` and `to` (inclusive
 * bounds `gte` / `lte`), oldest first, with every field in API_FIELDS, and
 * optionally restricted to the given document types.
 *
 * @param {string} from - Lower publication-date bound.
 * @param {string} to - Upper publication-date bound.
 * @param {number} page - Page number to request.
 * @param {string[]} [types] - Document types to restrict the search to.
 * @returns {string} Fully encoded request URL.
 */
function buildFrApiListUrl(from, to, page, types) {
  const query = new URLSearchParams([
    ["conditions[publication_date][gte]", from],
    ["conditions[publication_date][lte]", to],
    ["per_page", String(PER_PAGE)],
    ["page", String(page)],
    ["order", "oldest"],
    ...API_FIELDS.map((field) => ["fields[]", field])
  ]);
  const hasTypeFilter = types && types.length > 0;
  if (hasTypeFilter) {
    for (const docType of types) {
      query.append("conditions[type][]", docType);
    }
  }
  return `${FR_API_BASE}/documents.json?${query.toString()}`;
}
1300
/**
 * Download Federal Register documents (XML plus JSON metadata) for a date range.
 *
 * The range `options.from` .. `options.to` (default: today, UTC) is split into
 * per-month chunks; each chunk is paged through the list endpoint and every
 * listed document with a `full_text_xml_url` is downloaded. Documents without
 * an XML URL are counted as skipped; per-document download errors are recorded
 * in `failed` and do not abort the run. `options.limit`, when set, caps the
 * number of successfully downloaded documents.
 *
 * `options.onProgress` is called before each document with the running
 * download count and the total number of documents the API has reported for
 * all chunks visited so far.
 *
 * @param {object} options - Reads `from`, `to`, `output`, `types`, `limit`,
 *   `fetchDelayMs`, and `onProgress`.
 * @returns {Promise<{documentsDownloaded: number, files: object[],
 *   totalBytes: number, dateRange: {from: string, to: string},
 *   skipped: number, failed: {documentNumber: string, error: string}[]}>}
 * @throws {Error} If a list response has no numeric `count`, or a list request fails.
 */
async function downloadFrDocuments(options) {
  const to = options.to ?? (/* @__PURE__ */ new Date()).toISOString().slice(0, 10);
  const fetchDelay = options.fetchDelayMs ?? DEFAULT_FETCH_DELAY_MS;
  const files = [];
  const failed = [];
  let totalBytes = 0;
  let skipped = 0;
  let totalDocumentsFound = 0;
  const chunks = buildMonthChunks(options.from, to);
  for (const chunk of chunks) {
    if (options.limit !== void 0 && files.length >= options.limit) break;
    let page = 1;
    let hasMore = true;
    while (hasMore) {
      const listUrl = buildFrApiListUrl(chunk.from, chunk.to, page, options.types);
      const response = await fetchWithRetry(listUrl);
      const data = await response.json();
      // `data == null` covers a literal JSON `null` body, which would otherwise
      // surface as an unrelated TypeError.
      if (data == null || typeof data.count !== "number") {
        throw new Error(
          `Unexpected API response for ${listUrl}: missing or invalid 'count' field. The FederalRegister.gov API may have changed its response format.`
        );
      }
      // `count` is per query, i.e. per month chunk. Add each chunk's count once
      // (on its first page) so the progress total covers every chunk visited,
      // rather than only the first one.
      if (page === 1) {
        totalDocumentsFound += data.count;
      }
      const results = data.results ?? [];
      for (const doc of results) {
        if (options.limit !== void 0 && files.length >= options.limit) {
          hasMore = false;
          break;
        }
        options.onProgress?.({
          documentsDownloaded: files.length,
          totalDocuments: totalDocumentsFound,
          currentDocument: doc.document_number,
          currentChunk: `${chunk.from.slice(0, 7)}`
        });
        if (!doc.full_text_xml_url) {
          skipped++;
          continue;
        }
        try {
          const result = await downloadSingleDocument(doc, options.output, fetchDelay);
          files.push(result);
          totalBytes += result.size;
        } catch (err) {
          // Best effort: record the failure and move on to the next document.
          failed.push({
            documentNumber: doc.document_number,
            error: err instanceof Error ? err.message : String(err)
          });
        }
      }
      hasMore = hasMore && page < (data.total_pages ?? 0);
      page++;
    }
  }
  return {
    documentsDownloaded: files.length,
    files,
    totalBytes,
    dateRange: { from: options.from, to },
    skipped,
    failed
  };
}
1365
/**
 * Download a single document identified by its document number.
 *
 * Fetches the document's metadata (all API_FIELDS) from the single-document
 * endpoint, validates it, and then downloads XML and sidecar without any
 * pre-download delay.
 *
 * @param {string} documentNumber - Document number to look up.
 * @param {string} output - Root directory for downloads.
 * @returns {Promise<object>} Result of the underlying document download.
 * @throws {Error} If the metadata lacks `document_number` or `publication_date`.
 */
async function downloadSingleFrDocument(documentNumber, output) {
  const fieldPairs = API_FIELDS.map((f) => ["fields[]", f]);
  const query = new URLSearchParams(fieldPairs).toString();
  const metaUrl = `${FR_API_BASE}/documents/${documentNumber}.json?${query}`;
  const metaResponse = await fetchWithRetry(metaUrl);
  const doc = await metaResponse.json();
  const hasRequiredFields = Boolean(doc.document_number) && Boolean(doc.publication_date);
  if (!hasRequiredFields) {
    throw new Error(
      `Invalid API response for document ${documentNumber}: missing document_number or publication_date`
    );
  }
  return downloadSingleDocument(doc, output, 0);
}
1376
/**
 * Download one document's XML and write its metadata as a JSON sidecar.
 *
 * Order of operations: validate the metadata, create the target directory,
 * write the JSON sidecar, optionally wait `fetchDelay` milliseconds, fetch the
 * XML, and stream it to disk.
 *
 * @param {object} doc - Document metadata from the API; must carry
 *   `document_number`, `publication_date`, and `full_text_xml_url`.
 * @param {string} outputDir - Root directory for downloads.
 * @param {number} fetchDelay - Milliseconds to wait before fetching the XML (0 = none).
 * @returns {Promise<{xmlPath: string, jsonPath: string, documentNumber: string,
 *   publicationDate: string, size: number}>} Paths written and combined byte size.
 * @throws {Error} If required metadata is missing, the response has no body,
 *   or the XML cannot be written.
 */
async function downloadSingleDocument(doc, outputDir, fetchDelay) {
  const docNumber = doc.document_number;
  const pubDate = doc.publication_date;
  if (!docNumber || !pubDate) {
    throw new Error(
      `Invalid document in API response: missing document_number or publication_date`
    );
  }
  const xmlUrl = doc.full_text_xml_url;
  if (!xmlUrl) {
    throw new Error(
      `Document ${docNumber} has no full_text_xml_url \u2014 cannot download XML`
    );
  }

  const xmlPath = buildFrDownloadXmlPath(docNumber, pubDate, outputDir);
  const jsonPath = buildFrDownloadJsonPath(docNumber, pubDate, outputDir);
  await mkdir2(dirname2(xmlPath), { recursive: true });

  // The sidecar is written before the XML is fetched.
  const jsonContent = JSON.stringify(doc, null, 2);
  await fsWriteFile(jsonPath, jsonContent, "utf-8");

  if (fetchDelay > 0) {
    await sleep(fetchDelay);
  }

  const xmlResponse = await fetchWithRetry(xmlUrl);
  const body = xmlResponse.body;
  if (!body) {
    throw new Error(`No response body for ${docNumber} XML`);
  }
  const dest = createWriteStream(xmlPath);
  try {
    await pipeline(Readable.fromWeb(body), dest);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(
      `Failed to write XML for document ${docNumber} from ${xmlUrl}: ${reason}`,
      { cause: err }
    );
  }

  // Reported size is the XML size on disk plus the sidecar's UTF-8 byte length.
  const xmlStat = await stat2(xmlPath);
  const jsonSize = Buffer.byteLength(jsonContent, "utf-8");
  return {
    xmlPath,
    jsonPath,
    documentNumber: docNumber,
    publicationDate: pubDate,
    size: Number(xmlStat.size) + jsonSize
  };
}
1418
/**
 * Split an inclusive date range into consecutive per-month chunks.
 *
 * The first chunk starts at `from`, every later chunk starts on the 1st of its
 * month; each chunk ends on the last day of its month, except the final one,
 * which ends at `to`. All arithmetic is done in UTC. An empty array is returned
 * when `from` is after `to` or either date cannot be parsed.
 *
 * @param {string} from - Start date, "YYYY-MM-DD".
 * @param {string} to - End date, "YYYY-MM-DD".
 * @returns {{from: string, to: string}[]} Chunks in chronological order.
 */
function buildMonthChunks(from, to) {
  const toIsoDay = (d) => d.toISOString().slice(0, 10);
  const rangeEnd = /* @__PURE__ */ new Date(`${to}T00:00:00Z`);
  const chunks = [];
  for (let cursor = /* @__PURE__ */ new Date(`${from}T00:00:00Z`); cursor <= rangeEnd; ) {
    const year = cursor.getUTCFullYear();
    const nextMonth = cursor.getUTCMonth() + 1;
    // Day 0 of the next month is the last day of the current month.
    const lastOfMonth = new Date(Date.UTC(year, nextMonth, 0));
    const chunkEnd = lastOfMonth <= rangeEnd ? toIsoDay(lastOfMonth) : to;
    chunks.push({ from: toIsoDay(cursor), to: chunkEnd });
    cursor = new Date(Date.UTC(year, nextMonth, 1));
  }
  return chunks;
}
1435
/**
 * Fetch a URL, retrying on network errors and on HTTP 429 / 503 / 504.
 *
 * Up to MAX_RETRIES retries are made after the first attempt. The wait before
 * a retry is RETRY_BASE_DELAY_MS doubled per attempt, unless the response
 * carries a positive integer `Retry-After` header, in which case that many
 * seconds are used. Any other non-OK status fails immediately.
 *
 * @param {string} url - URL to fetch.
 * @param {number} [attempt=0] - Attempt counter to start from.
 * @returns {Promise<Response>} The first OK response.
 * @throws {Error} On exhausted retries or a non-retryable HTTP status.
 */
async function fetchWithRetry(url, attempt = 0) {
  for (let tries = attempt; ; tries++) {
    const canRetry = tries < MAX_RETRIES;
    const backoff = RETRY_BASE_DELAY_MS * Math.pow(2, tries);

    let response;
    try {
      response = await fetch(url);
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      if (!canRetry) {
        throw new Error(
          `Network error after ${MAX_RETRIES + 1} attempts for ${url}: ${reason}`,
          { cause: err }
        );
      }
      console.warn(
        `Network error for ${url}: ${reason}. Retrying in ${backoff}ms (attempt ${tries + 1}/${MAX_RETRIES})...`
      );
      await sleep(backoff);
      continue;
    }

    if (response.ok) {
      return response;
    }

    const isTransient = [429, 503, 504].includes(response.status);
    if (!(isTransient && canRetry)) {
      throw new Error(`HTTP ${response.status}: ${response.statusText} for ${url}`);
    }

    // A missing or non-numeric Retry-After parses to NaN, which fails the
    // `> 0` test and falls back to exponential backoff.
    const retryAfterSeconds = Number.parseInt(response.headers.get("Retry-After") ?? "", 10);
    const delay = retryAfterSeconds > 0 ? retryAfterSeconds * 1e3 : backoff;
    console.warn(
      `HTTP ${response.status} for ${url}. Retrying in ${delay}ms (attempt ${tries + 1}/${MAX_RETRIES})...`
    );
    await sleep(delay);
  }
}
1466
/**
 * Return a promise that resolves (with no value) after `ms` milliseconds.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
1469
+ export {
1470
+ FR_BLOCK_ELEMENTS,
1471
+ FR_CONTENT_ELEMENTS,
1472
+ FR_DOCUMENT_ELEMENTS,
1473
+ FR_DOCUMENT_TYPE_KEYS,
1474
+ FR_DOCUMENT_TYPE_MAP,
1475
+ FR_EMPHASIS_MAP,
1476
+ FR_HD_SOURCE_TO_DEPTH,
1477
+ FR_HEADING_ELEMENT,
1478
+ FR_IGNORE_ELEMENTS,
1479
+ FR_INLINE_ELEMENTS,
1480
+ FR_NOTE_ELEMENTS,
1481
+ FR_PASSTHROUGH_ELEMENTS,
1482
+ FR_PREAMBLE_META_ELEMENTS,
1483
+ FR_PREAMBLE_SECTIONS,
1484
+ FR_PRESIDENTIAL_SUBTYPES,
1485
+ FR_REGTEXT_ELEMENTS,
1486
+ FR_SECTION_CONTAINERS,
1487
+ FR_SIGNATURE_ELEMENTS,
1488
+ FR_SKIP_ELEMENTS,
1489
+ FR_TABLE_ELEMENTS,
1490
+ FrASTBuilder,
1491
+ buildFrApiListUrl,
1492
+ buildFrDownloadJsonPath,
1493
+ buildFrDownloadXmlPath,
1494
+ buildFrFrontmatter,
1495
+ buildFrOutputPath,
1496
+ buildMonthDir,
1497
+ buildYearDir,
1498
+ convertFrDocuments,
1499
+ downloadFrDocuments,
1500
+ downloadSingleFrDocument
1501
+ };
1502
+ //# sourceMappingURL=index.js.map