@lexbuild/ecfr 1.9.0 → 1.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,1344 @@
1
+ // src/converter.ts
2
+ import { createReadStream } from "fs";
3
+ import { mkdir, writeFile } from "fs/promises";
4
+ import { join as join2, dirname, basename, relative } from "path";
5
+ import {
6
+ XMLParser,
7
+ renderDocument,
8
+ createLinkResolver,
9
+ FORMAT_VERSION,
10
+ GENERATOR
11
+ } from "@lexbuild/core";
12
+
13
+ // src/ecfr-builder.ts
14
+ import { LEVEL_TYPES } from "@lexbuild/core";
15
+
16
+ // src/ecfr-elements.ts
17
// Builds a Set of element names from a space-separated list, keeping list order.
var ecfrNameSet = (names) => new Set(names.split(" "));

// TYPE attribute of a DIVn element -> level type used in the AST.
// Subject groups (SUBJGRP) act like subparts.
var ECFR_TYPE_TO_LEVEL = {
  TITLE: "title",
  SUBTITLE: "subtitle",
  CHAPTER: "chapter",
  SUBCHAP: "subchapter",
  PART: "part",
  SUBPART: "subpart",
  SUBJGRP: "subpart",
  SECTION: "section",
  APPENDIX: "appendix"
};

// Structural containers DIV1 .. DIV9.
var ECFR_DIV_ELEMENTS = new Set(Array.from({ length: 9 }, (_, i) => `DIV${i + 1}`));

// Paragraph-like content: P (primary paragraph), FP (flush), FP-1 / FP-2 (indented
// flush, levels 1 and 2), FP-DASH (dash-leader form lines), FP1-2 (alternative
// indented paragraph), FRP (flush right).
var ECFR_CONTENT_ELEMENTS = ecfrNameSet("P FP FP-1 FP-2 FP-DASH FP1-2 FRP");

// Inline formatting: I (italic), B (bold), E (emphasis, type varies by T attribute),
// SU (superscript), FR (fraction), AC (accent/diacritical).
var ECFR_INLINE_ELEMENTS = ecfrNameSet("I B E SU FR AC");

// T attribute of an E element -> inline type.
var ECFR_EMPHASIS_MAP = {
  "01": "bold",
  "02": "italic",
  // bold italic in print; treated as bold for Markdown
  "03": "bold",
  // italic in headings
  "04": "italic",
  // small caps; rendered as italic
  "05": "italic",
  // 51, 52: subscript; 54: subscript (math)
  "51": "sub",
  "52": "sub",
  "54": "sub",
  // special terms (et seq., De minimis)
  "7462": "italic"
};

// Note containers: AUTH (authority citation), SOURCE (source/provenance),
// EDNOTE (editorial), EFFDNOT (effective date), CITA (citation / amendment history),
// APPRO (OMB approval), NOTE (general), CROSSREF (cross-reference block),
// SECAUTH (section-level authority), FTNT (footnote).
var ECFR_NOTE_ELEMENTS = ecfrNameSet(
  "AUTH SOURCE EDNOTE EFFDNOT CITA APPRO NOTE CROSSREF SECAUTH FTNT"
);

// Sub-headings that appear inside content.
var ECFR_HEADING_ELEMENTS = ecfrNameSet("HD1 HD2 HD3");

// Block wrappers: EXTRACT (extracted/quoted text), EXAMPLE (example text).
var ECFR_BLOCK_ELEMENTS = ecfrNameSet("EXTRACT EXAMPLE");

// Whole subtrees to skip: CFRTOC (table of contents), HEADER (file metadata header).
var ECFR_IGNORE_ELEMENTS = ecfrNameSet("CFRTOC HEADER");

// Wrapper elements that produce no frame of their own.
var ECFR_PASSTHROUGH_ELEMENTS = ecfrNameSet("DLPSTEXTCLASS TEXT BODY ECFRBRWS");

// Elements dropped together with their content: PTHD / CHAPTI / SECHD / SUBJECT
// (TOC part heading, chapter item, section heading, subject text), RESERVED
// (placeholder), PG (page number), STARS (visual separator), AMDDATE (amendment date).
var ECFR_SKIP_ELEMENTS = ecfrNameSet("PTHD CHAPTI SECHD SUBJECT RESERVED PG STARS AMDDATE");

// Reference markers: XREF (cross-reference link), FTREF (footnote reference marker).
var ECFR_REF_ELEMENTS = ecfrNameSet("XREF FTREF");

// XHTML-style table elements.
var ECFR_TABLE_ELEMENTS = ecfrNameSet("TABLE TR TH TD");
149
+
150
+ // src/ecfr-builder.ts
151
+ var EcfrASTBuilder = class {
152
+ options;
153
+ stack = [];
154
+ documentMeta = {};
155
+ emitAtIndex;
156
+ /** Track title number from metadata header */
157
+ titleNumber = "";
158
+ /** Depth inside CFRTOC or other ignored container */
159
+ ignoredContainerDepth = 0;
160
+ /** Part-level notes (authority/source) keyed by part identifier */
161
+ partNotes = /* @__PURE__ */ new Map();
162
+ constructor(options) {
163
+ this.options = options;
164
+ this.emitAtIndex = LEVEL_TYPES.indexOf(options.emitAt);
165
+ }
166
+ /** Get part-level notes (authority/source) captured during parsing */
167
+ getPartNotes() {
168
+ return this.partNotes;
169
+ }
170
+ /** Handle SAX open element */
171
+ onOpenElement(name, attrs) {
172
+ if (this.ignoredContainerDepth > 0) {
173
+ this.ignoredContainerDepth++;
174
+ return;
175
+ }
176
+ if (ECFR_IGNORE_ELEMENTS.has(name)) {
177
+ this.ignoredContainerDepth = 1;
178
+ return;
179
+ }
180
+ if (ECFR_PASSTHROUGH_ELEMENTS.has(name)) {
181
+ return;
182
+ }
183
+ if (ECFR_SKIP_ELEMENTS.has(name)) {
184
+ this.ignoredContainerDepth = 1;
185
+ return;
186
+ }
187
+ if (ECFR_DIV_ELEMENTS.has(name)) {
188
+ const divType = attrs["TYPE"];
189
+ if (divType) {
190
+ const levelType = ECFR_TYPE_TO_LEVEL[divType];
191
+ if (levelType) {
192
+ this.openLevel(levelType, name, attrs);
193
+ return;
194
+ }
195
+ }
196
+ this.stack.push({ kind: "ignore", elementName: name, textBuffer: "" });
197
+ return;
198
+ }
199
+ if (name === "HEAD") {
200
+ this.stack.push({ kind: "heading", elementName: name, textBuffer: "" });
201
+ return;
202
+ }
203
+ if (name === "HED") {
204
+ this.stack.push({ kind: "heading", elementName: name, textBuffer: "" });
205
+ return;
206
+ }
207
+ if (name === "PSPACE") {
208
+ this.stack.push({ kind: "noteContent", elementName: name, textBuffer: "" });
209
+ return;
210
+ }
211
+ if (ECFR_CONTENT_ELEMENTS.has(name)) {
212
+ this.openContent(name);
213
+ return;
214
+ }
215
+ if (ECFR_HEADING_ELEMENTS.has(name)) {
216
+ this.openContent(name);
217
+ return;
218
+ }
219
+ if (ECFR_INLINE_ELEMENTS.has(name)) {
220
+ this.openInline(name, attrs);
221
+ return;
222
+ }
223
+ if (ECFR_REF_ELEMENTS.has(name)) {
224
+ this.openRef(name, attrs);
225
+ return;
226
+ }
227
+ if (ECFR_NOTE_ELEMENTS.has(name)) {
228
+ this.openNote(name, attrs);
229
+ return;
230
+ }
231
+ if (ECFR_BLOCK_ELEMENTS.has(name)) {
232
+ this.stack.push({ kind: "block", elementName: name, textBuffer: "" });
233
+ return;
234
+ }
235
+ if (name === "TABLE") {
236
+ this.stack.push({
237
+ kind: "table",
238
+ elementName: name,
239
+ textBuffer: "",
240
+ headers: [],
241
+ rows: [],
242
+ currentRow: [],
243
+ isHeaderRow: false
244
+ });
245
+ return;
246
+ }
247
+ if (name === "TR") {
248
+ const tableFrame = this.findTableFrame();
249
+ if (tableFrame) {
250
+ tableFrame.currentRow = [];
251
+ tableFrame.isHeaderRow = false;
252
+ this.stack.push({ kind: "tableRow", elementName: name, textBuffer: "" });
253
+ }
254
+ return;
255
+ }
256
+ if (name === "TH") {
257
+ const tableFrame = this.findTableFrame();
258
+ if (tableFrame) {
259
+ tableFrame.isHeaderRow = true;
260
+ this.stack.push({ kind: "tableCell", elementName: name, textBuffer: "" });
261
+ }
262
+ return;
263
+ }
264
+ if (name === "TD") {
265
+ this.stack.push({ kind: "tableCell", elementName: name, textBuffer: "" });
266
+ return;
267
+ }
268
+ if (name === "DIV" || name === "div") {
269
+ this.stack.push({ kind: "ignore", elementName: name, textBuffer: "" });
270
+ return;
271
+ }
272
+ if (name === "img") {
273
+ return;
274
+ }
275
+ this.stack.push({ kind: "ignore", elementName: name, textBuffer: "" });
276
+ }
277
+ /** Handle SAX close element */
278
+ onCloseElement(name) {
279
+ if (this.ignoredContainerDepth > 0) {
280
+ this.ignoredContainerDepth--;
281
+ return;
282
+ }
283
+ if (ECFR_PASSTHROUGH_ELEMENTS.has(name)) {
284
+ return;
285
+ }
286
+ if (name === "HEAD") {
287
+ const frame = this.popFrame(name);
288
+ if (frame) {
289
+ const parentLevel = this.findParentLevel();
290
+ if (parentLevel?.node && parentLevel.node.type === "level") {
291
+ const levelNode = parentLevel.node;
292
+ const headText = frame.textBuffer.trim();
293
+ if (levelNode.levelType === "section" && levelNode.numValue) {
294
+ const prefix = `\xA7 ${levelNode.numValue}`;
295
+ let stripped = headText;
296
+ if (stripped.startsWith(prefix)) {
297
+ stripped = stripped.slice(prefix.length).replace(/^[\s.]+/, "").trim();
298
+ }
299
+ levelNode.heading = stripped || headText;
300
+ } else {
301
+ levelNode.heading = stripLevelPrefix(headText);
302
+ }
303
+ }
304
+ }
305
+ return;
306
+ }
307
+ if (name === "HED") {
308
+ this.popFrame(name);
309
+ return;
310
+ }
311
+ if (name === "PSPACE") {
312
+ const frame = this.popFrame(name);
313
+ if (frame) {
314
+ const parentNote = this.findParentNote();
315
+ if (parentNote?.node && parentNote.node.type === "note") {
316
+ const noteNode = parentNote.node;
317
+ const textNode = {
318
+ type: "inline",
319
+ inlineType: "text",
320
+ text: frame.textBuffer.trim()
321
+ };
322
+ const contentNode = {
323
+ type: "content",
324
+ variant: "content",
325
+ children: [textNode]
326
+ };
327
+ noteNode.children.push(contentNode);
328
+ }
329
+ }
330
+ return;
331
+ }
332
+ if (ECFR_DIV_ELEMENTS.has(name)) {
333
+ this.closeLevel(name);
334
+ return;
335
+ }
336
+ if (ECFR_CONTENT_ELEMENTS.has(name) || ECFR_HEADING_ELEMENTS.has(name)) {
337
+ this.closeContent(name);
338
+ return;
339
+ }
340
+ if (ECFR_INLINE_ELEMENTS.has(name)) {
341
+ this.closeInline(name);
342
+ return;
343
+ }
344
+ if (ECFR_REF_ELEMENTS.has(name)) {
345
+ this.closeInline(name);
346
+ return;
347
+ }
348
+ if (ECFR_NOTE_ELEMENTS.has(name)) {
349
+ this.closeNote(name);
350
+ return;
351
+ }
352
+ if (ECFR_BLOCK_ELEMENTS.has(name)) {
353
+ this.popFrame(name);
354
+ return;
355
+ }
356
+ if (name === "TABLE") {
357
+ this.closeTable();
358
+ return;
359
+ }
360
+ if (name === "TR") {
361
+ this.closeTableRow();
362
+ return;
363
+ }
364
+ if (name === "TH" || name === "TD") {
365
+ this.closeTableCell();
366
+ return;
367
+ }
368
+ if (name === "img") {
369
+ return;
370
+ }
371
+ if (this.stack.length > 0 && this.stack[this.stack.length - 1]?.elementName === name) {
372
+ this.stack.pop();
373
+ }
374
+ }
375
+ /** Handle SAX text content */
376
+ onText(text) {
377
+ if (this.ignoredContainerDepth > 0) return;
378
+ const frame = this.stack[this.stack.length - 1];
379
+ if (!frame) return;
380
+ if (frame.kind === "heading" || frame.kind === "noteContent" || frame.kind === "tableCell") {
381
+ frame.textBuffer += text;
382
+ return;
383
+ }
384
+ if (frame.kind === "content" && frame.node?.type === "content") {
385
+ const contentNode = frame.node;
386
+ const trimmed = text;
387
+ if (trimmed) {
388
+ contentNode.children.push({
389
+ type: "inline",
390
+ inlineType: "text",
391
+ text: trimmed
392
+ });
393
+ }
394
+ return;
395
+ }
396
+ if (frame.kind === "inline" && frame.node?.type === "inline") {
397
+ const inlineNode = frame.node;
398
+ if (inlineNode.children) {
399
+ inlineNode.children.push({
400
+ type: "inline",
401
+ inlineType: "text",
402
+ text
403
+ });
404
+ } else {
405
+ inlineNode.text = (inlineNode.text ?? "") + text;
406
+ }
407
+ return;
408
+ }
409
+ if (frame.kind === "note" && frame.node?.type === "note") {
410
+ frame.textBuffer += text;
411
+ return;
412
+ }
413
+ if (frame.kind === "level") {
414
+ return;
415
+ }
416
+ }
417
+ // ---- Private helpers ----
418
+ openLevel(levelType, elementName, attrs) {
419
+ const nAttr = attrs["N"] ?? "";
420
+ const nodeAttr = attrs["NODE"] ?? "";
421
+ let numValue = nAttr.replace(/^§\s*/, "").trim();
422
+ const num = nAttr.trim();
423
+ if (levelType === "title") {
424
+ const titleFromNode = nodeAttr.split(":")[0];
425
+ if (titleFromNode) {
426
+ numValue = titleFromNode;
427
+ }
428
+ }
429
+ let identifier;
430
+ if (levelType === "title") {
431
+ identifier = `/us/cfr/t${numValue}`;
432
+ this.titleNumber = numValue;
433
+ } else if (levelType === "section") {
434
+ identifier = `/us/cfr/t${this.titleNumber}/s${numValue}`;
435
+ } else if (levelType === "part") {
436
+ identifier = `/us/cfr/t${this.titleNumber}/pt${numValue}`;
437
+ } else if (levelType === "chapter") {
438
+ identifier = `/us/cfr/t${this.titleNumber}/ch${numValue}`;
439
+ }
440
+ const node = {
441
+ type: "level",
442
+ levelType,
443
+ num: num || void 0,
444
+ numValue: numValue || void 0,
445
+ identifier,
446
+ children: [],
447
+ sourceElement: elementName
448
+ };
449
+ this.stack.push({ kind: "level", elementName, node, textBuffer: "" });
450
+ }
451
+ closeLevel(elementName) {
452
+ const frame = this.popFrame(elementName);
453
+ if (!frame || frame.kind !== "level" || !frame.node) return;
454
+ const levelNode = frame.node;
455
+ const levelIndex = LEVEL_TYPES.indexOf(levelNode.levelType);
456
+ if (levelNode.levelType === "part" && levelNode.identifier) {
457
+ let authority;
458
+ let regulatorySource;
459
+ for (const child of levelNode.children) {
460
+ if (child.type === "note") {
461
+ const noteNode = child;
462
+ if (noteNode.noteType === "authority" && !authority) {
463
+ authority = this.extractNoteText(noteNode);
464
+ }
465
+ if (noteNode.noteType === "regulatorySource" && !regulatorySource) {
466
+ regulatorySource = this.extractNoteText(noteNode);
467
+ }
468
+ }
469
+ }
470
+ if (authority || regulatorySource) {
471
+ this.partNotes.set(levelNode.identifier, { authority, regulatorySource });
472
+ }
473
+ }
474
+ if (levelIndex >= 0 && levelIndex >= this.emitAtIndex) {
475
+ const ancestors = [];
476
+ for (const f of this.stack) {
477
+ if (f.kind === "level" && f.node?.type === "level") {
478
+ const ln = f.node;
479
+ ancestors.push({
480
+ levelType: ln.levelType,
481
+ numValue: ln.numValue,
482
+ heading: ln.heading,
483
+ identifier: ln.identifier
484
+ });
485
+ }
486
+ }
487
+ const context = {
488
+ ancestors,
489
+ documentMeta: { ...this.documentMeta }
490
+ };
491
+ this.options.onEmit(levelNode, context);
492
+ } else {
493
+ const parentLevel = this.findParentLevel();
494
+ if (parentLevel?.node && parentLevel.node.type === "level") {
495
+ parentLevel.node.children.push(levelNode);
496
+ }
497
+ }
498
+ }
499
+ openContent(elementName) {
500
+ const variant = "content";
501
+ const isSubHeading = ECFR_HEADING_ELEMENTS.has(elementName);
502
+ const node = {
503
+ type: "content",
504
+ variant,
505
+ children: []
506
+ };
507
+ if (isSubHeading) {
508
+ node.children.push({
509
+ type: "inline",
510
+ inlineType: "bold",
511
+ children: []
512
+ });
513
+ }
514
+ this.stack.push({ kind: "content", elementName, node, textBuffer: "" });
515
+ }
516
+ closeContent(elementName) {
517
+ const frame = this.popFrame(elementName);
518
+ if (!frame || !frame.node) return;
519
+ const contentNode = frame.node;
520
+ if (ECFR_HEADING_ELEMENTS.has(elementName)) {
521
+ const boldNode = contentNode.children[0];
522
+ if (boldNode && boldNode.type === "inline" && boldNode.inlineType === "bold") {
523
+ if (!boldNode.text && (!boldNode.children || boldNode.children.length === 0) && contentNode.children.length <= 1) {
524
+ return;
525
+ }
526
+ }
527
+ }
528
+ const parent = this.findParentLevel() ?? this.findParentNote();
529
+ if (parent?.node) {
530
+ if (parent.node.type === "level") {
531
+ parent.node.children.push(contentNode);
532
+ } else if (parent.node.type === "note") {
533
+ parent.node.children.push(contentNode);
534
+ }
535
+ }
536
+ }
537
+ openInline(elementName, attrs) {
538
+ let inlineType = "text";
539
+ if (elementName === "I") {
540
+ inlineType = "italic";
541
+ } else if (elementName === "B") {
542
+ inlineType = "bold";
543
+ } else if (elementName === "SU") {
544
+ inlineType = "sup";
545
+ } else if (elementName === "FR") {
546
+ inlineType = "text";
547
+ } else if (elementName === "E") {
548
+ const tValue = attrs["T"] ?? "";
549
+ inlineType = ECFR_EMPHASIS_MAP[tValue] ?? "italic";
550
+ }
551
+ const node = {
552
+ type: "inline",
553
+ inlineType,
554
+ children: []
555
+ };
556
+ this.stack.push({ kind: "inline", elementName, node, textBuffer: "" });
557
+ }
558
+ openRef(elementName, attrs) {
559
+ if (elementName === "FTREF") {
560
+ const node = {
561
+ type: "inline",
562
+ inlineType: "footnoteRef",
563
+ idref: attrs["ID"]
564
+ };
565
+ this.stack.push({ kind: "inline", elementName, node, textBuffer: "" });
566
+ } else {
567
+ const node = {
568
+ type: "inline",
569
+ inlineType: "ref",
570
+ href: attrs["ID"],
571
+ children: []
572
+ };
573
+ this.stack.push({ kind: "inline", elementName, node, textBuffer: "" });
574
+ }
575
+ }
576
+ closeInline(elementName) {
577
+ const frame = this.popFrame(elementName);
578
+ if (!frame || !frame.node) return;
579
+ const inlineNode = frame.node;
580
+ if (inlineNode.inlineType === "footnoteRef" && frame.textBuffer) {
581
+ inlineNode.text = frame.textBuffer.trim();
582
+ }
583
+ const parentFrame = this.stack[this.stack.length - 1];
584
+ if (!parentFrame) return;
585
+ if (parentFrame.kind === "content" && parentFrame.node?.type === "content") {
586
+ const parentContent = parentFrame.node;
587
+ if (ECFR_HEADING_ELEMENTS.has(parentFrame.elementName) && parentContent.children.length > 0 && parentContent.children[0]?.type === "inline" && parentContent.children[0].inlineType === "bold") {
588
+ const boldNode = parentContent.children[0];
589
+ if (boldNode.children) {
590
+ boldNode.children.push(inlineNode);
591
+ }
592
+ } else {
593
+ parentContent.children.push(inlineNode);
594
+ }
595
+ } else if (parentFrame.kind === "inline" && parentFrame.node?.type === "inline") {
596
+ const parentInline = parentFrame.node;
597
+ if (parentInline.children) {
598
+ parentInline.children.push(inlineNode);
599
+ }
600
+ } else if (parentFrame.kind === "note") {
601
+ frame.textBuffer = "";
602
+ }
603
+ }
604
+ openNote(elementName, _attrs) {
605
+ const noteTypeMap = {
606
+ AUTH: "authority",
607
+ SOURCE: "regulatorySource",
608
+ EDNOTE: "editorial",
609
+ EFFDNOT: "effectiveDate",
610
+ CITA: "citation",
611
+ APPRO: "approval",
612
+ NOTE: "general",
613
+ CROSSREF: "crossReference",
614
+ SECAUTH: "sectionAuthority",
615
+ FTNT: "footnote"
616
+ };
617
+ const noteType = noteTypeMap[elementName] ?? elementName.toLowerCase();
618
+ const node = {
619
+ type: "note",
620
+ noteType,
621
+ children: []
622
+ };
623
+ this.stack.push({ kind: "note", elementName, node, textBuffer: "" });
624
+ }
625
+ closeNote(elementName) {
626
+ const frame = this.popFrame(elementName);
627
+ if (!frame || !frame.node) return;
628
+ const noteNode = frame.node;
629
+ if (frame.textBuffer.trim() && noteNode.children.length === 0) {
630
+ const textNode = {
631
+ type: "inline",
632
+ inlineType: "text",
633
+ text: frame.textBuffer.trim()
634
+ };
635
+ const contentNode = {
636
+ type: "content",
637
+ variant: "content",
638
+ children: [textNode]
639
+ };
640
+ noteNode.children.push(contentNode);
641
+ }
642
+ const parentLevel = this.findParentLevel();
643
+ if (parentLevel?.node && parentLevel.node.type === "level") {
644
+ const levelNode = parentLevel.node;
645
+ if (noteNode.noteType === "regulatorySource") {
646
+ const sourceText = this.extractNoteText(noteNode);
647
+ if (sourceText) {
648
+ const sourceCreditNode = {
649
+ type: "sourceCredit",
650
+ children: [{ type: "inline", inlineType: "text", text: sourceText }]
651
+ };
652
+ levelNode.children.push(sourceCreditNode);
653
+ }
654
+ }
655
+ levelNode.children.push(noteNode);
656
+ }
657
+ }
658
+ closeTable() {
659
+ const frame = this.popFrame("TABLE");
660
+ if (!frame || frame.kind !== "table") return;
661
+ const tableNode = {
662
+ type: "table",
663
+ variant: "xhtml",
664
+ headers: frame.headers ?? [],
665
+ rows: frame.rows ?? []
666
+ };
667
+ const parentLevel = this.findParentLevel();
668
+ if (parentLevel?.node && parentLevel.node.type === "level") {
669
+ parentLevel.node.children.push(tableNode);
670
+ }
671
+ }
672
+ closeTableRow() {
673
+ const rowFrame = this.popFrame("TR");
674
+ if (!rowFrame) return;
675
+ const tableFrame = this.findTableFrame();
676
+ if (tableFrame && tableFrame.currentRow) {
677
+ if (tableFrame.isHeaderRow) {
678
+ tableFrame.headers?.push([...tableFrame.currentRow]);
679
+ } else {
680
+ tableFrame.rows?.push([...tableFrame.currentRow]);
681
+ }
682
+ tableFrame.currentRow = [];
683
+ }
684
+ }
685
+ closeTableCell() {
686
+ const cellFrame = this.stack.pop();
687
+ if (!cellFrame || cellFrame.kind !== "tableCell") return;
688
+ const tableFrame = this.findTableFrame();
689
+ if (tableFrame?.currentRow) {
690
+ tableFrame.currentRow.push(cellFrame.textBuffer.trim());
691
+ }
692
+ }
693
+ popFrame(elementName) {
694
+ if (this.stack.length === 0) return void 0;
695
+ for (let i = this.stack.length - 1; i >= 0; i--) {
696
+ if (this.stack[i]?.elementName === elementName) {
697
+ return this.stack.splice(i, 1)[0];
698
+ }
699
+ }
700
+ return this.stack.pop();
701
+ }
702
+ findParentLevel() {
703
+ for (let i = this.stack.length - 1; i >= 0; i--) {
704
+ if (this.stack[i]?.kind === "level") {
705
+ return this.stack[i];
706
+ }
707
+ }
708
+ return void 0;
709
+ }
710
+ findParentNote() {
711
+ for (let i = this.stack.length - 1; i >= 0; i--) {
712
+ if (this.stack[i]?.kind === "note") {
713
+ return this.stack[i];
714
+ }
715
+ }
716
+ return void 0;
717
+ }
718
+ findTableFrame() {
719
+ for (let i = this.stack.length - 1; i >= 0; i--) {
720
+ if (this.stack[i]?.kind === "table") {
721
+ return this.stack[i];
722
+ }
723
+ }
724
+ return void 0;
725
+ }
726
+ extractNoteText(noteNode) {
727
+ const parts = [];
728
+ for (const child of noteNode.children) {
729
+ if (child.type === "content") {
730
+ for (const inline of child.children) {
731
+ if (inline.text) parts.push(inline.text);
732
+ }
733
+ }
734
+ }
735
+ return parts.join("").trim();
736
+ }
737
+ };
738
/**
 * Remove a leading level label from a heading.
 *
 * Handles "<CHAPTER|PART|SUBCHAPTER|SUBPART|SUBTITLE|DIVISION|ARTICLE> <id> <dash>"
 * and "Title <digits> <dash>" (case-insensitive); for the title form a trailing
 * "--Volume ..." suffix is cut as well. If stripping would leave nothing, or no
 * prefix matches, the trimmed original heading is returned.
 *
 * @param heading Raw heading text.
 * @returns the heading without its level label.
 */
function stripLevelPrefix(heading) {
  const levelPrefix = /^(?:CHAPTER|PART|SUBCHAPTER|SUBPART|SUBTITLE|DIVISION|ARTICLE)\s+[A-Za-z0-9]+\s*[—–-]\s*/i.exec(
    heading
  );
  if (levelPrefix !== null) {
    const rest = heading.slice(levelPrefix[0].length).trim();
    return rest || heading.trim();
  }

  const titlePrefix = /^Title\s+\d+\s*[—–-]\s*/i.exec(heading);
  if (titlePrefix === null) {
    return heading.trim();
  }

  let rest = heading.slice(titlePrefix[0].length).trim();
  const volumeAt = rest.search(/--Volume\s/i);
  if (volumeAt !== -1) {
    rest = rest.slice(0, volumeAt).trim();
  }
  return rest || heading.trim();
}
757
+
758
+ // src/ecfr-frontmatter.ts
759
/**
 * Build the frontmatter object for an emitted node.
 *
 * Base fields are always present. Section, chapter, subchapter, part, note and
 * status fields are added only when the corresponding data exists. `currency`
 * and `last_updated` are both set to today's date (UTC, YYYY-MM-DD).
 *
 * @param node    Emitted level node.
 * @param context Emit context; `ancestors` and `documentMeta` are read.
 * @returns the frontmatter object.
 */
function buildEcfrFrontmatter(node, context) {
  const ancestorOf = (levelType) => context.ancestors.find((a) => a.levelType === levelType);
  const titleAncestor = ancestorOf("title");
  const partAncestor = ancestorOf("part");
  const chapterAncestor = ancestorOf("chapter");
  const subchapterAncestor = ancestorOf("subchapter");

  const titleNum = parseInt(titleAncestor?.numValue ?? node.numValue ?? "0", 10);
  const sectionNum = node.numValue ?? "0";
  const sectionName = node.heading?.trim() ?? "";
  const titleName = titleAncestor?.heading?.trim() ?? context.documentMeta.dcTitle ?? "";
  const isPart = node.levelType === "part";

  let displayTitle;
  switch (node.levelType) {
    case "title":
      displayTitle = `Title ${titleNum} \u2014 ${titleName}`;
      break;
    case "part":
      displayTitle = `${titleNum} CFR Part ${sectionNum} - ${sectionName}`;
      break;
    default:
      displayTitle = `${titleNum} CFR \xA7 ${sectionNum} - ${sectionName}`;
  }

  // Notes on the node itself win; the ancestor lookup is the fallback.
  const partAuthority =
    extractNoteText(node, "authority") ?? extractNoteTextFromAncestors(context, "authority");
  const partSource =
    extractNoteText(node, "regulatorySource") ??
    extractNoteTextFromAncestors(context, "regulatorySource");
  const sourceCredit = extractSourceCreditText(node);

  const today = new Date().toISOString().slice(0, 10);
  const fm = {
    source: "ecfr",
    legal_status: "authoritative_unofficial",
    identifier: node.identifier ?? `/us/cfr/t${titleNum}/s${sectionNum}`,
    title: displayTitle,
    title_number: titleNum,
    title_name: titleName,
    // Regulations, not legislation
    positive_law: false,
    currency: today,
    last_updated: today
  };

  if (node.levelType === "section" || isPart) {
    Object.assign(fm, { section_number: sectionNum, section_name: sectionName });
  }

  if (chapterAncestor?.numValue) {
    // Only chapter numbers that parse as an integer are recorded.
    const chapterNumber = parseInt(chapterAncestor.numValue, 10);
    if (!Number.isNaN(chapterNumber)) {
      Object.assign(fm, { chapter_number: chapterNumber });
    }
  }
  if (chapterAncestor?.heading) {
    Object.assign(fm, { chapter_name: chapterAncestor.heading.trim() });
  }

  if (subchapterAncestor?.numValue) {
    Object.assign(fm, { subchapter_number: subchapterAncestor.numValue });
  }
  if (subchapterAncestor?.heading) {
    Object.assign(fm, { subchapter_name: subchapterAncestor.heading.trim() });
  }

  // A part node describes itself when it has no part ancestor.
  if (partAncestor?.numValue) {
    Object.assign(fm, { part_number: partAncestor.numValue, cfr_part: partAncestor.numValue });
  } else if (isPart) {
    Object.assign(fm, { part_number: sectionNum, cfr_part: sectionNum });
  }
  if (partAncestor?.heading) {
    Object.assign(fm, { part_name: partAncestor.heading.trim() });
  } else if (isPart) {
    Object.assign(fm, { part_name: sectionName });
  }

  if (partAuthority) {
    Object.assign(fm, { authority: partAuthority });
  }
  if (partSource) {
    Object.assign(fm, { regulatory_source: partSource });
  }
  if (sourceCredit) {
    Object.assign(fm, { source_credit: sourceCredit });
  }
  if (node.status) {
    Object.assign(fm, { status: node.status });
  }
  return fm;
}
839
/**
 * Flattened text of the first direct child note of the given type.
 *
 * @param node     Node whose direct children are searched.
 * @param noteType Note type to look for.
 * @returns the flattened note text, or undefined when no such note exists.
 */
function extractNoteText(node, noteType) {
  const firstMatch = node.children.find(
    (child) => child.type === "note" && child.noteType === noteType
  );
  return firstMatch ? flattenNoteText(firstMatch) : void 0;
}
847
/**
 * Placeholder for looking up note text on ancestors. Both parameters are
 * unused and the result is always undefined.
 *
 * @param _context  Unused.
 * @param _noteType Unused.
 * @returns undefined.
 */
function extractNoteTextFromAncestors(_context, _noteType) {
  return undefined;
}
850
/**
 * Text of the first sourceCredit child of `node`.
 *
 * Only that first sourceCredit is considered; the `text` of its inline
 * children is joined and trimmed.
 *
 * @param node Node whose direct children are searched.
 * @returns the trimmed text, or undefined when there is no sourceCredit child
 *   or its text is empty.
 */
function extractSourceCreditText(node) {
  const credit = node.children.find((child) => child.type === "sourceCredit");
  if (!credit) {
    return void 0;
  }
  const joined = credit.children
    .filter((inline) => inline.type === "inline" && "text" in inline)
    .map((inline) => inline.text)
    .join("")
    .trim();
  return joined || void 0;
}
865
/**
 * Flatten all text beneath `node` into one trimmed string.
 *
 * Inline nodes that carry non-empty `text` contribute that text; every other
 * child is descended into. This includes inline nodes nested under a content
 * child (italic/bold/ref wrappers with children), which were previously
 * skipped so their text went missing. Whitespace is trimmed once, on the
 * final result only.
 *
 * @param node Node to flatten; a node without a `children` array yields "".
 * @returns the concatenated, trimmed text.
 */
function flattenNoteText(node) {
  // Depth-first walk appending text pieces to `out` in document order.
  const collect = (current, out) => {
    if (!current || !Array.isArray(current.children)) {
      return out;
    }
    for (const child of current.children) {
      if (child.type === "inline" && "text" in child && child.text) {
        out.push(child.text);
      } else {
        collect(child, out);
      }
    }
    return out;
  };
  return collect(node, []).join("").trim();
}
884
+
885
+ // src/ecfr-path.ts
886
+ import { join } from "path";
887
/**
 * Compute the Markdown output path for an emitted node.
 *
 * Layout under `<outputRoot>/ecfr`:
 * - title:    `title-NN.md`
 * - chapter:  `title-NN/chapter-<n>.md`
 * - part:     `title-NN[/chapter-<c>]/part-<n>.md`
 * - appendix: `title-NN[/chapter-<c>][/part-<p>]/<sanitized name>.md`
 * - other:    `title-NN[/chapter-<c>][/part-<p>]/section-<n>.md`
 *
 * @param node       Emitted level node.
 * @param context    Emit context whose `ancestors` supply title/chapter/part numbers.
 * @param outputRoot Root output directory.
 * @returns the joined file path.
 */
function buildEcfrOutputPath(node, context, outputRoot) {
  const titleNum = findAncestorValue(context, "title") ?? node.numValue ?? "0";
  const titleDir = `title-${padTwo(titleNum)}`;

  // Title and chapter files sit at fixed locations and ignore ancestor directories.
  if (node.levelType === "title") {
    return join(outputRoot, "ecfr", `${titleDir}.md`);
  }
  if (node.levelType === "chapter") {
    return join(outputRoot, "ecfr", titleDir, `chapter-${node.numValue ?? "0"}.md`);
  }

  const segments = [outputRoot, "ecfr", titleDir];
  const chapterNum = findAncestorValue(context, "chapter");
  if (chapterNum) {
    segments.push(`chapter-${chapterNum}`);
  }

  if (node.levelType === "part") {
    segments.push(`part-${node.numValue ?? "0"}.md`);
    return join(...segments);
  }

  const partNum = findAncestorValue(context, "part");
  if (partNum) {
    segments.push(`part-${partNum}`);
  }
  const fileName = node.levelType === "appendix"
    ? `${sanitizeFilename(node.numValue ?? node.heading ?? "appendix")}.md`
    : `section-${node.numValue ?? "0"}.md`;
  segments.push(fileName);
  return join(...segments);
}
918
/**
 * Directory that holds the output of one title: `<outputRoot>/ecfr/title-NN`.
 *
 * @param titleNum   Title number as a string.
 * @param outputRoot Root output directory.
 * @returns the joined directory path.
 */
function buildTitleDir(titleNum, outputRoot) {
  const dirName = `title-${padTwo(titleNum)}`;
  return join(outputRoot, "ecfr", dirName);
}
921
/**
 * `numValue` of the first ancestor with the given level type.
 *
 * @param context   Emit context with an `ancestors` array.
 * @param levelType Level type to look for.
 * @returns the ancestor's numValue, or undefined when no ancestor matches.
 */
function findAncestorValue(context, levelType) {
  for (const ancestor of context.ancestors) {
    if (ancestor.levelType === levelType) {
      return ancestor.numValue;
    }
  }
  return undefined;
}
924
/**
 * Zero-pad the integer parsed from `num` to at least two digits.
 *
 * @param num Number as a string.
 * @returns the padded integer, or `num` unchanged when it does not start with
 *   an integer.
 */
function padTwo(num) {
  const parsed = Number.parseInt(num, 10);
  if (Number.isNaN(parsed)) {
    return num;
  }
  return `${parsed}`.padStart(2, "0");
}
928
/**
 * Turn an arbitrary name into a lowercase, hyphen-separated file-name stem of
 * at most 50 characters.
 *
 * Runs of characters outside [a-z0-9] collapse to a single hyphen, and leading
 * and trailing hyphens are removed. A hyphen left at the end by the 50-character
 * cut is removed as well; previously a stem could end in "-" when the cut
 * landed right after a separator.
 *
 * @param name Raw name (e.g. an appendix number or heading).
 * @returns the sanitized stem, or "appendix" when nothing usable remains.
 */
function sanitizeFilename(name) {
  const slug = name
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, "-")
    .replace(/^-|-$/g, "")
    .slice(0, 50)
    // Truncation can expose a separator at the end; drop it.
    .replace(/-$/, "");
  return slug || "appendix";
}
932
+
933
+ // src/converter.ts
934
+ async function convertEcfrTitle(options) {
935
+ const { input, output, granularity, dryRun } = options;
936
+ let peakMemory = process.memoryUsage().rss;
937
+ const emitAt = granularity === "title" ? "title" : granularity === "part" ? "part" : "section";
938
+ const collected = [];
939
+ const builder = new EcfrASTBuilder({
940
+ emitAt,
941
+ onEmit: (node, context) => {
942
+ collected.push({ node, context });
943
+ }
944
+ });
945
+ const parser = new XMLParser({ defaultNamespace: "" });
946
+ parser.on("openElement", (name, attrs) => builder.onOpenElement(name, attrs));
947
+ parser.on("closeElement", (name) => builder.onCloseElement(name));
948
+ parser.on("text", (text) => builder.onText(text));
949
+ const stream = createReadStream(input, "utf-8");
950
+ await parser.parseStream(stream);
951
+ const rss = process.memoryUsage().rss;
952
+ if (rss > peakMemory) peakMemory = rss;
953
+ const partNotes = builder.getPartNotes();
954
+ let titleNumber = "0";
955
+ let titleName = "";
956
+ const firstCollected = collected[0];
957
+ if (firstCollected) {
958
+ const firstCtx = firstCollected.context;
959
+ const titleAncestor = firstCtx.ancestors.find((a) => a.levelType === "title");
960
+ if (titleAncestor) {
961
+ titleNumber = titleAncestor.numValue ?? "0";
962
+ titleName = titleAncestor.heading ?? firstCtx.documentMeta.dcTitle ?? "";
963
+ } else if (firstCollected.node.levelType === "title") {
964
+ titleNumber = firstCollected.node.numValue ?? "0";
965
+ titleName = firstCollected.node.heading ?? "";
966
+ }
967
+ }
968
+ const notesFilter = buildNotesFilter(options);
969
+ const renderOpts = {
970
+ headingOffset: 0,
971
+ linkStyle: options.linkStyle,
972
+ notesFilter
973
+ };
974
+ if (dryRun) {
975
+ return buildDryRunResult(collected, granularity, titleNumber, titleName, peakMemory);
976
+ }
977
+ const linkResolver = createLinkResolver();
978
+ const sectionMetas = [];
979
+ if (granularity === "section") {
980
+ const counts = /* @__PURE__ */ new Map();
981
+ for (const { node, context } of collected) {
982
+ const partNum = context.ancestors.find((a) => a.levelType === "part")?.numValue ?? "__root__";
983
+ const secNum = node.numValue ?? "0";
984
+ const key = `${partNum}/${secNum}`;
985
+ counts.set(key, (counts.get(key) ?? 0) + 1);
986
+ }
987
+ const seen = /* @__PURE__ */ new Map();
988
+ const outputPaths = [];
989
+ for (const { node, context } of collected) {
990
+ const partNum = context.ancestors.find((a) => a.levelType === "part")?.numValue ?? "__root__";
991
+ const secNum = node.numValue ?? "0";
992
+ const key = `${partNum}/${secNum}`;
993
+ const occurrence = (seen.get(key) ?? 0) + 1;
994
+ seen.set(key, occurrence);
995
+ const total = counts.get(key) ?? 1;
996
+ const suffix = total > 1 && occurrence > 1 ? `-${occurrence}` : "";
997
+ const filePath = buildEcfrOutputPath(node, context, output);
998
+ const suffixedPath = suffix ? filePath.replace(/\.md$/, `${suffix}.md`) : filePath;
999
+ outputPaths.push(suffixedPath);
1000
+ if (node.identifier && occurrence === 1) {
1001
+ linkResolver.register(node.identifier, suffixedPath);
1002
+ }
1003
+ }
1004
+ for (let i = 0; i < collected.length; i++) {
1005
+ const item = collected[i];
1006
+ const suffixedPath = outputPaths[i];
1007
+ if (!item || !suffixedPath) continue;
1008
+ const { node, context } = item;
1009
+ const frontmatter = buildEcfrFrontmatter(node, context);
1010
+ const partId = context.ancestors.find((a) => a.levelType === "part")?.identifier;
1011
+ if (partId && (!frontmatter.authority || !frontmatter.regulatory_source)) {
1012
+ const partNoteData = partNotes.get(partId);
1013
+ if (partNoteData) {
1014
+ if (!frontmatter.authority && partNoteData.authority) {
1015
+ frontmatter.authority = partNoteData.authority;
1016
+ }
1017
+ if (!frontmatter.regulatory_source && partNoteData.regulatorySource) {
1018
+ frontmatter.regulatory_source = partNoteData.regulatorySource;
1019
+ }
1020
+ }
1021
+ }
1022
+ const fromFile = suffixedPath;
1023
+ const markdown = renderDocument(node, frontmatter, {
1024
+ ...renderOpts,
1025
+ resolveLink: (identifier) => linkResolver.resolve(identifier, fromFile)
1026
+ });
1027
+ await mkdir(dirname(suffixedPath), { recursive: true });
1028
+ await writeFile(suffixedPath, markdown, "utf-8");
1029
+ const hasNotes = node.children.some((c) => c.type === "note" || c.type === "notesContainer");
1030
+ const secNum = node.numValue ?? "0";
1031
+ const partNum = context.ancestors.find((a) => a.levelType === "part")?.numValue ?? "__root__";
1032
+ sectionMetas.push({
1033
+ identifier: node.identifier ?? `/us/cfr/t${titleNumber}/s${secNum}`,
1034
+ number: secNum,
1035
+ name: node.heading?.trim() ?? "",
1036
+ fileName: basename(suffixedPath),
1037
+ relativeFile: relative(buildTitleDir(titleNumber, output), suffixedPath),
1038
+ contentLength: markdown.length,
1039
+ hasNotes,
1040
+ status: node.status ?? "current",
1041
+ partIdentifier: context.ancestors.find((a) => a.levelType === "part")?.identifier ?? "",
1042
+ partNumber: partNum,
1043
+ partName: context.ancestors.find((a) => a.levelType === "part")?.heading?.trim() ?? ""
1044
+ });
1045
+ const currentRss = process.memoryUsage().rss;
1046
+ if (currentRss > peakMemory) peakMemory = currentRss;
1047
+ }
1048
+ await writeMetaFiles(sectionMetas, titleNumber, titleName, output, granularity, input);
1049
+ const files2 = sectionMetas.map((m) => join2(buildTitleDir(titleNumber, output), m.relativeFile));
1050
+ return {
1051
+ sectionsWritten: sectionMetas.length,
1052
+ files: files2,
1053
+ titleNumber,
1054
+ titleName,
1055
+ dryRun: false,
1056
+ partCount: new Set(sectionMetas.map((s) => s.partNumber)).size,
1057
+ totalTokenEstimate: Math.ceil(sectionMetas.reduce((sum, m) => sum + m.contentLength, 0) / 4),
1058
+ peakMemoryBytes: peakMemory
1059
+ };
1060
+ }
1061
+ const files = [];
1062
+ let totalLength = 0;
1063
+ if (granularity === "chapter") {
1064
+ const chapterMap = /* @__PURE__ */ new Map();
1065
+ for (const item of collected) {
1066
+ const chapterAnc = item.context.ancestors.find((a) => a.levelType === "chapter");
1067
+ const chapterKey = chapterAnc?.numValue ?? "__root__";
1068
+ const existing = chapterMap.get(chapterKey);
1069
+ if (existing) {
1070
+ existing.sections.push(item);
1071
+ } else {
1072
+ chapterMap.set(chapterKey, {
1073
+ sections: [item],
1074
+ chapterAncestor: chapterAnc ?? { levelType: "chapter", numValue: chapterKey },
1075
+ firstContext: item.context
1076
+ });
1077
+ }
1078
+ }
1079
+ for (const [_chapterKey, { sections, chapterAncestor, firstContext }] of chapterMap) {
1080
+ const chapterNode = {
1081
+ type: "level",
1082
+ levelType: "chapter",
1083
+ num: chapterAncestor.numValue,
1084
+ numValue: chapterAncestor.numValue,
1085
+ heading: chapterAncestor.heading,
1086
+ identifier: chapterAncestor.identifier,
1087
+ children: sections.map((s) => s.node)
1088
+ };
1089
+ const frontmatter = buildEcfrFrontmatter(chapterNode, firstContext);
1090
+ const markdown = renderDocument(chapterNode, frontmatter, renderOpts);
1091
+ const filePath = buildEcfrOutputPath(chapterNode, firstContext, output);
1092
+ await mkdir(dirname(filePath), { recursive: true });
1093
+ await writeFile(filePath, markdown, "utf-8");
1094
+ files.push(filePath);
1095
+ totalLength += markdown.length;
1096
+ }
1097
+ } else {
1098
+ const targetLevel = emitAt;
1099
+ const filtered = collected.filter((c) => c.node.levelType === targetLevel);
1100
+ for (const { node, context } of filtered) {
1101
+ const frontmatter = buildEcfrFrontmatter(node, context);
1102
+ const markdown = renderDocument(node, frontmatter, renderOpts);
1103
+ const filePath = buildEcfrOutputPath(node, context, output);
1104
+ await mkdir(dirname(filePath), { recursive: true });
1105
+ await writeFile(filePath, markdown, "utf-8");
1106
+ files.push(filePath);
1107
+ totalLength += markdown.length;
1108
+ }
1109
+ }
1110
+ const partCount = granularity === "part" ? files.length : granularity === "chapter" ? new Set(
1111
+ collected.map((c) => c.context.ancestors.find((a) => a.levelType === "part")?.numValue).filter(Boolean)
1112
+ ).size : 0;
1113
+ return {
1114
+ sectionsWritten: files.length,
1115
+ files,
1116
+ titleNumber,
1117
+ titleName,
1118
+ dryRun: false,
1119
+ partCount,
1120
+ totalTokenEstimate: Math.ceil(totalLength / 4),
1121
+ peakMemoryBytes: peakMemory
1122
+ };
1123
+ }
1124
/**
 * Build the result object for a dry run: nothing is written, only counted.
 *
 * For "chapter" granularity the unit count is the number of distinct chapter
 * ancestors (items without a chapter ancestor share the "__root__" key) and
 * the token estimate covers every collected node. For every other granularity
 * only collected nodes whose levelType matches the target level ("title",
 * "part", otherwise "section") are counted and estimated.
 *
 * @param collected    Collected `{ node, context }` entries.
 * @param granularity  Output granularity requested by the caller.
 * @param titleNumber  Title number echoed back in the result.
 * @param titleName    Title name echoed back in the result.
 * @param peakMemory   Peak RSS in bytes echoed back in the result.
 * @returns Result with `dryRun: true`, an empty `files` list and `partCount` 0.
 */
function buildDryRunResult(collected, granularity, titleNumber, titleName, peakMemory) {
  // Single place that fixes the shape (and key order) of the returned object.
  const summarize = (unitCount, tokenTotal) => ({
    sectionsWritten: unitCount,
    files: [],
    titleNumber,
    titleName,
    dryRun: true,
    partCount: 0,
    totalTokenEstimate: tokenTotal,
    peakMemoryBytes: peakMemory
  });

  if (granularity === "chapter") {
    const chapterNumbers = /* @__PURE__ */ new Set();
    let tokenTotal = 0;
    for (const entry of collected) {
      const chapter = entry.context.ancestors.find((a) => a.levelType === "chapter");
      chapterNumbers.add(chapter?.numValue ?? "__root__");
      tokenTotal += estimateTokens(entry.node);
    }
    return summarize(chapterNumbers.size, tokenTotal);
  }

  // Anything that is not "title" or "part" is counted at section level.
  const wantedLevel = granularity === "title" || granularity === "part" ? granularity : "section";
  const matching = collected.filter((entry) => entry.node.levelType === wantedLevel);
  const tokenTotal = matching.reduce((sum, entry) => sum + estimateTokens(entry.node), 0);
  return summarize(matching.length, tokenTotal);
}
1155
/**
 * Rough token estimate for an AST subtree: total characters of inline text
 * divided by four, rounded up.
 *
 * Only nodes whose `type` is "inline" and that carry a non-empty `text`
 * contribute; every node's `children` array (when present) is descended into.
 *
 * @param node  Root of the subtree to measure.
 * @returns Estimated token count (0 when no inline text is found).
 */
function estimateTokens(node) {
  let charCount = 0;
  // Explicit work stack instead of recursion; visiting order does not affect the sum.
  const pending = [node];
  while (pending.length > 0) {
    const current = pending.pop();
    if (current.type === "inline" && "text" in current && current.text) {
      charCount += current.text.length;
    }
    if ("children" in current && Array.isArray(current.children)) {
      for (const child of current.children) {
        pending.push(child);
      }
    }
  }
  return Math.ceil(charCount / 4);
}
1170
/**
 * Translate the note-related conversion options into a notes filter.
 *
 * - `includeNotes` truthy: returns `undefined` (no filter object).
 * - No selective flag set: returns a filter with every category set to `false`.
 * - At least one selective flag set: returns the three flags exactly as given
 *   (unset flags are passed through unchanged, i.e. possibly `undefined`).
 *
 * @param options  Options carrying `includeNotes`, `includeEditorialNotes`,
 *                 `includeStatutoryNotes` and `includeAmendments`.
 * @returns `undefined` or an `{ editorial, statutory, amendments }` object.
 */
function buildNotesFilter(options) {
  if (options.includeNotes) {
    return undefined;
  }
  const {
    includeEditorialNotes: editorial,
    includeStatutoryNotes: statutory,
    includeAmendments: amendments
  } = options;
  if (editorial || statutory || amendments) {
    return { editorial, statutory, amendments };
  }
  return { editorial: false, statutory: false, amendments: false };
}
1182
/**
 * Write the sidecar files for a converted title: one `_meta.json` per part
 * directory, a title-level `_meta.json`, and a `README.md` in the title directory.
 *
 * Sections are grouped by `partNumber` in first-seen order. Token figures are
 * derived from rendered content length (characters / 4, rounded up).
 *
 * @param sectionMetas  Per-section metadata records gathered while writing sections.
 * @param titleNumber   Title number as a string (parsed with base 10 for numeric fields).
 * @param titleName     Title heading used in the title metadata and README.
 * @param outputRoot    Root output directory handed to `buildTitleDir`.
 * @param granularity   Granularity value recorded in the title metadata and README.
 * @param sourceXml     Path of the source XML; only its basename is recorded.
 * @returns Promise that resolves once all files have been written.
 */
async function writeMetaFiles(sectionMetas, titleNumber, titleName, outputRoot, granularity, sourceXml) {
  // Group section records by part; a Map preserves the order parts were first seen.
  const sectionsByPart = /* @__PURE__ */ new Map();
  for (const meta of sectionMetas) {
    let bucket = sectionsByPart.get(meta.partNumber);
    if (!bucket) {
      bucket = [];
      sectionsByPart.set(meta.partNumber, bucket);
    }
    bucket.push(meta);
  }

  const parts = [];
  for (const [partNum, sections] of sectionsByPart) {
    const [lead] = sections;
    if (!lead) continue;
    const sectionEntries = sections.map((s) => ({
      identifier: s.identifier,
      number: s.number,
      name: s.name,
      file: s.fileName,
      token_estimate: Math.ceil(s.contentLength / 4),
      has_notes: s.hasNotes,
      status: s.status
    }));
    parts.push({
      // Fall back to a synthesized identifier when the part ancestor had none.
      identifier: lead.partIdentifier || `/us/cfr/t${titleNumber}/pt${partNum}`,
      number: partNum,
      name: lead.partName,
      directory: `part-${partNum}`,
      sections: sectionEntries
    });
  }

  const titleDir = buildTitleDir(titleNumber, outputRoot);
  const numericTitle = parseInt(titleNumber, 10);
  await mkdir(titleDir, { recursive: true });

  // Part-level metadata, written one part at a time.
  for (const part of parts) {
    const partDir = join2(titleDir, getPartDirPath(sectionMetas, part.number));
    await mkdir(partDir, { recursive: true });
    const partJson = JSON.stringify(
      {
        format_version: FORMAT_VERSION,
        identifier: part.identifier,
        part_number: part.number,
        part_name: part.name,
        title_number: numericTitle,
        section_count: part.sections.length,
        sections: part.sections
      },
      null,
      2
    );
    await writeFile(join2(partDir, "_meta.json"), partJson + "\n", "utf-8");
  }

  // Sum of rendered content lengths; converted to a token estimate below.
  let contentChars = 0;
  for (const meta of sectionMetas) {
    contentChars += meta.contentLength;
  }

  const titleJson = JSON.stringify(
    {
      format_version: FORMAT_VERSION,
      generator: GENERATOR,
      generated_at: (/* @__PURE__ */ new Date()).toISOString(),
      identifier: `/us/cfr/t${titleNumber}`,
      title_number: numericTitle,
      title_name: titleName,
      source: "ecfr",
      legal_status: "authoritative_unofficial",
      // Date portion (YYYY-MM-DD) of the current ISO timestamp.
      currency: (/* @__PURE__ */ new Date()).toISOString().slice(0, 10),
      source_xml: basename(sourceXml),
      granularity,
      stats: {
        part_count: parts.length,
        section_count: sectionMetas.length,
        total_files: sectionMetas.length,
        total_tokens_estimate: Math.ceil(contentChars / 4)
      },
      parts
    },
    null,
    2
  );
  await writeFile(join2(titleDir, "_meta.json"), titleJson + "\n", "utf-8");

  const readmeText = buildReadme(titleNumber, titleName, parts, sectionMetas, granularity);
  await writeFile(join2(titleDir, "README.md"), readmeText, "utf-8");
}
1251
/**
 * Resolve the directory (relative to the title directory) that holds a part's files.
 *
 * Uses the parent directory of the first section record belonging to the part.
 * Falls back to `part-<partNumber>` when the part has no section record or when
 * that record's file sits directly in the title directory.
 *
 * @param sectionMetas  Section metadata records with `partNumber` and `relativeFile`.
 * @param partNumber    Part number to look up.
 * @returns Relative directory path for the part.
 */
function getPartDirPath(sectionMetas, partNumber) {
  const fallback = `part-${partNumber}`;
  const sample = sectionMetas.find((meta) => meta.partNumber === partNumber);
  if (sample) {
    const parent = dirname(sample.relativeFile);
    if (parent !== ".") {
      return parent;
    }
  }
  return fallback;
}
1257
/**
 * Render the README.md text for a title directory.
 *
 * The document has a heading, a metrics table (source, legal status, part and
 * section counts, estimated tokens, granularity), one `###` heading per part,
 * and a footer. The token estimate is total content length / 4, rounded up.
 *
 * @param titleNumber   Title number shown in the heading.
 * @param titleName     Title name shown in the heading.
 * @param parts         Part summaries with `number`, `name` and `sections`.
 * @param sectionMetas  Section records; only `contentLength` and the count are used.
 * @param granularity   Granularity label shown in the table.
 * @returns Markdown text ending with a newline.
 */
function buildReadme(titleNumber, titleName, parts, sectionMetas, granularity) {
  let contentChars = 0;
  for (const meta of sectionMetas) {
    contentChars += meta.contentLength;
  }
  const tokenEstimate = Math.ceil(contentChars / 4);

  const header = [
    `# Title ${titleNumber} \u2014 ${titleName}`,
    "",
    "| Metric | Value |",
    "|--------|-------|",
    `| Source | eCFR (govinfo.gov) |`,
    `| Legal Status | Authoritative, unofficial |`,
    `| Parts | ${parts.length.toLocaleString()} |`,
    `| Sections | ${sectionMetas.length.toLocaleString()} |`,
    `| Estimated Tokens | ${tokenEstimate.toLocaleString()} |`,
    `| Granularity | ${granularity} |`,
    "",
    "## Parts",
    ""
  ];

  // Each part contributes its heading followed by a blank line.
  const partLines = parts.flatMap((part) => [
    `### Part ${part.number} \u2014 ${part.name} (${part.sections.length} sections)`,
    ""
  ]);

  const footer = ["---", "", "Generated by LexBuild", ""];

  return [...header, ...partLines, ...footer].join("\n");
}
1283
+
1284
+ // src/downloader.ts
1285
+ import { createWriteStream } from "fs";
1286
+ import { mkdir as mkdir2, stat } from "fs/promises";
1287
+ import { join as join3 } from "path";
1288
+ import { pipeline } from "stream/promises";
1289
+ import { Readable } from "stream";
1290
// Base URL of the govinfo.gov bulk-data directory that eCFR title XML is fetched from.
var ECFR_BULK_BASE = "https://www.govinfo.gov/bulkdata/ECFR";

// Number of title slots enumerated by default.
var ECFR_TITLE_COUNT = 50;

// Every title number from 1 through ECFR_TITLE_COUNT, in ascending order.
var ECFR_TITLE_NUMBERS = Array.from(
  { length: ECFR_TITLE_COUNT },
  (_unused, zeroBasedIndex) => zeroBasedIndex + 1
);

// Title numbers that downloadEcfrTitles skips without attempting a request.
var RESERVED_TITLES = /* @__PURE__ */ new Set([35]);
1294
/**
 * Build the bulk-data download URL for one eCFR title.
 *
 * The URL has the shape `<ECFR_BULK_BASE>/title-<n>/ECFR-title<n>.xml`.
 *
 * @param titleNumber  Title number to interpolate into the path.
 * @returns Absolute URL of the title's XML file.
 */
function buildEcfrDownloadUrl(titleNumber) {
  const folder = `title-${titleNumber}`;
  const fileName = `ECFR-title${titleNumber}.xml`;
  return [ECFR_BULK_BASE, folder, fileName].join("/");
}
1297
/**
 * Download eCFR title XML files into a directory, one title at a time.
 *
 * Titles listed in RESERVED_TITLES are skipped. A non-OK HTTP response is
 * reported with `console.warn` and skipped; a response without a body is
 * skipped silently. Each successful download is streamed to
 * `ECFR-title<n>.xml` inside the output directory and its on-disk size recorded.
 *
 * @param options  `output` is the destination directory (created if missing);
 *                 `titles` is an optional list of title numbers, defaulting to
 *                 ECFR_TITLE_NUMBERS.
 * @returns `{ titlesDownloaded, files, totalBytes }`, where `files` holds
 *          `{ path, titleNumber, size }` for each file written.
 */
async function downloadEcfrTitles(options) {
  const outputDir = options.output;
  const requestedTitles = options.titles ?? ECFR_TITLE_NUMBERS;
  await mkdir2(outputDir, { recursive: true });

  const downloaded = [];
  let byteTotal = 0;

  for (const titleNum of requestedTitles) {
    if (RESERVED_TITLES.has(titleNum)) {
      continue;
    }

    const response = await fetch(buildEcfrDownloadUrl(titleNum));
    if (!response.ok) {
      console.warn(`Failed to download eCFR Title ${titleNum}: ${response.status}`);
      continue;
    }
    if (!response.body) {
      continue;
    }

    // Stream the response body straight to disk instead of buffering it.
    const targetPath = join3(outputDir, `ECFR-title${titleNum}.xml`);
    await pipeline(Readable.fromWeb(response.body), createWriteStream(targetPath));

    // Size is read back from the written file.
    const { size } = await stat(targetPath);
    byteTotal += size;
    downloaded.push({ path: targetPath, titleNumber: titleNum, size });
  }

  return { titlesDownloaded: downloaded.length, files: downloaded, totalBytes: byteTotal };
}
1323
+ export {
1324
+ ECFR_BLOCK_ELEMENTS,
1325
+ ECFR_CONTENT_ELEMENTS,
1326
+ ECFR_DIV_ELEMENTS,
1327
+ ECFR_EMPHASIS_MAP,
1328
+ ECFR_HEADING_ELEMENTS,
1329
+ ECFR_IGNORE_ELEMENTS,
1330
+ ECFR_INLINE_ELEMENTS,
1331
+ ECFR_NOTE_ELEMENTS,
1332
+ ECFR_PASSTHROUGH_ELEMENTS,
1333
+ ECFR_REF_ELEMENTS,
1334
+ ECFR_SKIP_ELEMENTS,
1335
+ ECFR_TABLE_ELEMENTS,
1336
+ ECFR_TITLE_COUNT,
1337
+ ECFR_TITLE_NUMBERS,
1338
+ ECFR_TYPE_TO_LEVEL,
1339
+ EcfrASTBuilder,
1340
+ buildEcfrDownloadUrl,
1341
+ convertEcfrTitle,
1342
+ downloadEcfrTitles
1343
+ };
1344
+ //# sourceMappingURL=index.js.map