@lexbuild/core 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,525 @@
1
+ import { Readable } from 'node:stream';
2
+
3
+ /**
4
+ * Streaming XML parser wrapping saxes with typed events and namespace normalization.
5
+ */
6
+
7
+ /** Normalized attributes: a flat record of name → value */
8
+ type Attributes = Record<string, string>;
9
+ /** Events emitted by the XML parser */
10
+ interface ParserEvents {
11
+ /** An element was opened */
12
+ openElement: (name: string, attrs: Attributes, ns: string) => void;
13
+ /** An element was closed */
14
+ closeElement: (name: string, ns: string) => void;
15
+ /** Text content was encountered */
16
+ text: (content: string) => void;
17
+ /** An error occurred during parsing */
18
+ error: (err: Error) => void;
19
+ /** Parsing is complete */
20
+ end: () => void;
21
+ }
22
+ /** Configuration for the XML parser */
23
+ interface XMLParserOptions {
24
+ /** Namespace URI to treat as default (elements in this NS emit bare names) */
25
+ defaultNamespace?: string | undefined;
26
+ /** Additional namespace prefix mappings beyond the built-in ones */
27
+ namespacePrefixes?: Readonly<Record<string, string>> | undefined;
28
+ }
29
+ type EventName = keyof ParserEvents;
30
+ type EventHandler<K extends EventName> = ParserEvents[K];
31
+ /**
32
+ * Streaming XML parser that normalizes namespace-prefixed element names.
33
+ *
34
+ * Elements in the default namespace emit bare names (e.g., "section").
35
+ * Elements in other recognized namespaces emit prefixed names (e.g., "xhtml:table").
36
+ * Elements in unrecognized namespaces emit the full URI-prefixed name.
37
+ */
38
+ declare class XMLParser {
39
+ private readonly saxParser;
40
+ private readonly defaultNs;
41
+ private readonly prefixMap;
42
+ private readonly listeners;
43
+ constructor(options?: XMLParserOptions);
44
+ /**
45
+ * Register an event listener for one of the `ParserEvents` names; the handler type is tied to the event name, and `this` is returned so registrations can be chained.
46
+ */
47
+ on<K extends EventName>(event: K, handler: EventHandler<K>): this;
48
+ /**
49
+ * Parse a complete XML string. Returns nothing; results are presumably delivered through the listeners registered via `on` — TODO confirm against the implementation.
50
+ */
51
+ parseString(xml: string): void;
52
+ /**
53
+ * Parse from a readable stream (e.g., fs.createReadStream).
54
+ * Returns a promise that resolves when parsing is complete. NOTE(review): rejection behavior on stream/parse errors is not visible in this declaration — confirm.
55
+ */
56
+ parseStream(stream: Readable): Promise<void>;
57
+ /**
58
+ * Normalize an element name based on its namespace.
59
+ * Default namespace elements get bare names; others get prefixed.
60
+ */
61
+ private normalizeName;
62
+ /**
63
+ * Normalize saxes namespace-aware attributes to a flat record.
64
+ * Strips namespace prefixes from attribute names for simplicity,
65
+ * except for xmlns declarations which are dropped entirely.
66
+ */
67
+ private normalizeAttributes;
68
+ /**
69
+ * Emit an event to all registered listeners.
70
+ */
71
+ private emit;
72
+ }
73
+
74
+ /**
75
+ * USLM XML namespace constants and element classification utilities.
76
+ */
77
+ /** USLM 1.0 default namespace */
78
+ declare const USLM_NS = "http://xml.house.gov/schemas/uslm/1.0";
79
+ /** XHTML namespace (used for tables) */
80
+ declare const XHTML_NS = "http://www.w3.org/1999/xhtml";
81
+ /** Dublin Core elements namespace */
82
+ declare const DC_NS = "http://purl.org/dc/elements/1.1/";
83
+ /** Dublin Core terms namespace */
84
+ declare const DCTERMS_NS = "http://purl.org/dc/terms/";
85
+ /** XML Schema Instance namespace */
86
+ declare const XSI_NS = "http://www.w3.org/2001/XMLSchema-instance";
87
+ /**
88
+ * Prefix map for recognized non-default namespaces.
89
+ * Elements in these namespaces will be emitted with the prefix (e.g., "dc:title").
90
+ */
91
+ declare const NAMESPACE_PREFIXES: Readonly<Record<string, string>>;
92
+ /** USLM elements that represent hierarchical levels */
93
+ declare const LEVEL_ELEMENTS: Set<string>;
94
+ /** USLM elements that represent content blocks */
95
+ declare const CONTENT_ELEMENTS: Set<string>;
96
+ /** USLM elements that represent inline formatting */
97
+ declare const INLINE_ELEMENTS: Set<string>;
98
+ /** USLM note-related elements */
99
+ declare const NOTE_ELEMENTS: Set<string>;
100
+ /** USLM elements that act as levels in appendix contexts */
101
+ declare const APPENDIX_LEVEL_ELEMENTS: Set<string>;
102
+ /** USLM metadata elements inside <meta> */
103
+ declare const META_ELEMENTS: Set<string>;
104
+ /** Structural container elements (no direct content) */
105
+ declare const CONTAINER_ELEMENTS: Set<string>;
106
+
107
+ /**
108
+ * @lexbuild/core AST node types
109
+ *
110
+ * The intermediate AST is a semantic representation of parsed USLM XML.
111
+ * It is NOT a 1:1 mapping — it has been partially interpreted to simplify rendering.
112
+ */
113
+ /** All hierarchical levels in the USLM schema, ordered big → small */
114
+ declare const LEVEL_TYPES: readonly ["title", "appendix", "subtitle", "chapter", "subchapter", "compiledAct", "reorganizationPlans", "reorganizationPlan", "courtRules", "courtRule", "article", "subarticle", "part", "subpart", "division", "subdivision", "preliminary", "section", "subsection", "paragraph", "subparagraph", "clause", "subclause", "item", "subitem", "subsubitem"];
115
+ /** A USLM hierarchical level type */
116
+ type LevelType = (typeof LEVEL_TYPES)[number];
117
+ /** Big levels: above section in the hierarchy. NOTE: the declared element type is the full level union (every `LEVEL_TYPES` member), so the big/small split is not expressed in the type — test membership with `.has()` at runtime. */
118
+ declare const BIG_LEVELS: Set<"title" | "subtitle" | "chapter" | "subchapter" | "article" | "subarticle" | "part" | "subpart" | "division" | "subdivision" | "preliminary" | "section" | "subsection" | "paragraph" | "subparagraph" | "clause" | "subclause" | "item" | "subitem" | "subsubitem" | "appendix" | "compiledAct" | "reorganizationPlans" | "reorganizationPlan" | "courtRules" | "courtRule">;
119
+ /** Small levels: below section in the hierarchy. NOTE: the declared element type is the full level union (every `LEVEL_TYPES` member), so the actual members are only known at runtime; which set (if any) contains "section" itself is not visible here. */
120
+ declare const SMALL_LEVELS: Set<"title" | "subtitle" | "chapter" | "subchapter" | "article" | "subarticle" | "part" | "subpart" | "division" | "subdivision" | "preliminary" | "section" | "subsection" | "paragraph" | "subparagraph" | "clause" | "subclause" | "item" | "subitem" | "subsubitem" | "appendix" | "compiledAct" | "reorganizationPlans" | "reorganizationPlan" | "courtRules" | "courtRule">;
121
+ /** Base node all AST nodes extend */
122
+ interface BaseNode {
123
+ /** Discriminator for the node type */
124
+ readonly type: string;
125
+ /** USLM identifier if present (e.g., "/us/usc/t1/s1") */
126
+ identifier?: string | undefined;
127
+ /** Source XML element name for debugging */
128
+ sourceElement?: string | undefined;
129
+ }
130
+ /** A hierarchical level (title, chapter, section, subsection, etc.) */
131
+ interface LevelNode extends BaseNode {
132
+ readonly type: "level";
133
+ /** Which level in the USLM hierarchy */
134
+ levelType: LevelType;
135
+ /** Display text of the number (e.g., "§ 1.", "(a)", "CHAPTER 1—") */
136
+ num?: string | undefined;
137
+ /** Normalized value of the number (e.g., "1", "a") */
138
+ numValue?: string | undefined;
139
+ /** Heading text (e.g., "Words denoting number, gender, and so forth") */
140
+ heading?: string | undefined;
141
+ /** Legal status of this element (e.g., "repealed", "transferred") */
142
+ status?: string | undefined;
143
+ /** Child nodes */
144
+ children: ASTNode[];
145
+ }
146
+ /** Variant of a content block */
147
+ type ContentVariant = "content" | "chapeau" | "continuation" | "proviso";
148
+ /** A block of text content */
149
+ interface ContentNode extends BaseNode {
150
+ readonly type: "content";
151
+ /** What kind of content block this is */
152
+ variant: ContentVariant;
153
+ /** Inline children (text, formatting, refs) */
154
+ children: InlineNode[];
155
+ }
156
+ /** Discriminator for inline node types */
157
+ type InlineType = "text" | "bold" | "italic" | "ref" | "date" | "term" | "quoted" | "sup" | "sub" | "footnoteRef";
158
+ /** Inline text or formatting */
159
+ interface InlineNode extends BaseNode {
160
+ readonly type: "inline";
161
+ /** What kind of inline this is */
162
+ inlineType: InlineType;
163
+ /** Text content (for leaf text nodes) */
164
+ text?: string | undefined;
165
+ /** Link target (for ref nodes) */
166
+ href?: string | undefined;
167
+ /** Footnote target ID (for footnoteRef nodes) */
168
+ idref?: string | undefined;
169
+ /** Nested inline children */
170
+ children?: InlineNode[] | undefined;
171
+ }
172
+ /** A note (editorial, statutory, amendment, etc.) */
173
+ interface NoteNode extends BaseNode {
174
+ readonly type: "note";
175
+ /** Semantic category (e.g., "amendments", "codification") */
176
+ topic?: string | undefined;
177
+ /** Role refinement (e.g., "crossHeading") */
178
+ role?: string | undefined;
179
+ /** Note placement type (e.g., "uscNote", "footnote") */
180
+ noteType?: string | undefined;
181
+ /** Heading text of the note */
182
+ heading?: string | undefined;
183
+ /** Child nodes */
184
+ children: ASTNode[];
185
+ }
186
+ /** Source credit annotation */
187
+ interface SourceCreditNode extends BaseNode {
188
+ readonly type: "sourceCredit";
189
+ /** The full source credit text, including inline formatting */
190
+ children: InlineNode[];
191
+ }
192
+ /** Table (either XHTML or USLM layout-based) */
193
+ interface TableNode extends BaseNode {
194
+ readonly type: "table";
195
+ /** Which table model */
196
+ variant: "xhtml" | "layout";
197
+ /** Header rows (each row is an array of cell strings) */
198
+ headers: string[][];
199
+ /** Body rows */
200
+ rows: string[][];
201
+ /** Raw HTML for complex tables that can't be simplified to rows/columns */
202
+ rawHtml?: string | undefined;
203
+ }
204
+ /** A single TOC entry */
205
+ interface TOCItemNode extends BaseNode {
206
+ readonly type: "tocItem";
207
+ /** Section/chapter number */
208
+ number?: string | undefined;
209
+ /** Title or heading text */
210
+ title?: string | undefined;
211
+ /** Link target identifier */
212
+ href?: string | undefined;
213
+ }
214
+ /** Table of contents */
215
+ interface TOCNode extends BaseNode {
216
+ readonly type: "toc";
217
+ /** TOC entries */
218
+ items: TOCItemNode[];
219
+ }
220
+ /** Container for notes (wraps <notes type="uscNote">) */
221
+ interface NotesContainerNode extends BaseNode {
222
+ readonly type: "notesContainer";
223
+ /** The notes type attribute (e.g., "uscNote") */
224
+ notesType?: string | undefined;
225
+ /** Child nodes, typically notes. `ASTNode` already includes `NoteNode`, so this element type is equivalent to `ASTNode[]` — children are not restricted to notes at the type level. */
226
+ children: (NoteNode | ASTNode)[];
227
+ }
228
+ /** Quoted content (blockquote) */
229
+ interface QuotedContentNode extends BaseNode {
230
+ readonly type: "quotedContent";
231
+ /** Where the quote originates from */
232
+ origin?: string | undefined;
233
+ /** Content of the quotation */
234
+ children: ASTNode[];
235
+ }
236
+ /** Union of all AST node types */
237
+ type ASTNode = LevelNode | ContentNode | InlineNode | NoteNode | SourceCreditNode | TableNode | TOCNode | TOCItemNode | NotesContainerNode | QuotedContentNode;
238
+ /** Info about an ancestor level in the hierarchy */
239
+ interface AncestorInfo {
240
+ /** The level type (e.g., "title", "chapter") */
241
+ levelType: LevelType;
242
+ /** Normalized number value */
243
+ numValue?: string | undefined;
244
+ /** Heading text */
245
+ heading?: string | undefined;
246
+ /** USLM identifier */
247
+ identifier?: string | undefined;
248
+ }
249
+ /** Document-level metadata extracted from the <meta> block */
250
+ interface DocumentMeta {
251
+ /** dc:title — display title (e.g., "Title 1") */
252
+ dcTitle?: string | undefined;
253
+ /** dc:type — document type (e.g., "USCTitle") */
254
+ dcType?: string | undefined;
255
+ /** docNumber — numeric designation (e.g., "1") */
256
+ docNumber?: string | undefined;
257
+ /** docPublicationName — publication name */
258
+ docPublicationName?: string | undefined;
259
+ /** Release point identifier (e.g., "119-73") */
260
+ releasePoint?: string | undefined;
261
+ /** Whether this is positive law */
262
+ positivelaw?: boolean | undefined;
263
+ /** dc:publisher */
264
+ publisher?: string | undefined;
265
+ /** dcterms:created — ISO timestamp */
266
+ created?: string | undefined;
267
+ /** dc:creator — generator tool name */
268
+ creator?: string | undefined;
269
+ /** The root document identifier (e.g., "/us/usc/t1") */
270
+ identifier?: string | undefined;
271
+ }
272
+ /** Context provided when a completed section/chapter is emitted */
273
+ interface EmitContext {
274
+ /** Ancestor chain from document root to the emitted node's parent */
275
+ ancestors: AncestorInfo[];
276
+ /** Document-level metadata from the <meta> block */
277
+ documentMeta: DocumentMeta;
278
+ }
279
+ /** Data used to generate YAML frontmatter for a section file */
280
+ interface FrontmatterData {
281
+ /** USLM canonical identifier (e.g., "/us/usc/t1/s1") */
282
+ identifier: string;
283
+ /** Human-readable display title (e.g., "1 USC § 1 - Words denoting...") */
284
+ title: string;
285
+ /** Title number (integer) */
286
+ title_number: number;
287
+ /** Title name (e.g., "General Provisions") */
288
+ title_name: string;
289
+ /** Chapter number (integer, omitted if not applicable) */
290
+ chapter_number?: number | undefined;
291
+ /** Chapter name */
292
+ chapter_name?: string | undefined;
293
+ /** Subchapter identifier (often Roman numerals) */
294
+ subchapter_number?: string | undefined;
295
+ /** Subchapter name */
296
+ subchapter_name?: string | undefined;
297
+ /** Part identifier */
298
+ part_number?: string | undefined;
299
+ /** Part name */
300
+ part_name?: string | undefined;
301
+ /** Section number (string — can be alphanumeric like "106a") */
302
+ section_number: string;
303
+ /** Section name */
304
+ section_name: string;
305
+ /** Whether this title is positive law */
306
+ positive_law: boolean;
307
+ /** Full source credit text */
308
+ source_credit?: string | undefined;
309
+ /** Release point identifier (e.g., "119-73") */
310
+ currency: string;
311
+ /** ISO date from XML generation timestamp */
312
+ last_updated: string;
313
+ /** Section status (e.g., "repealed", "transferred") */
314
+ status?: string | undefined;
315
+ }
316
+
317
+ /**
318
+ * AST Builder — converts XML parser events into an AST tree.
319
+ *
320
+ * Implements the section-emit pattern: when a section (or other configured level)
321
+ * close tag is encountered, the completed LevelNode is emitted via callback
322
+ * and its subtree is released from memory.
323
+ */
324
+
325
+ /** Options for configuring the AST builder */
326
+ interface ASTBuilderOptions {
327
+ /** Emit completed nodes at this level instead of accumulating */
328
+ emitAt: LevelType;
329
+ /** Callback when a completed node is ready; receives the node plus its ancestor chain and document metadata. May return a promise. NOTE(review): the builder's public event handlers return `void`, so whether a returned promise is awaited (or its rejection handled) is not visible in this declaration — confirm before relying on async ordering. */
330
+ onEmit: (node: LevelNode, context: EmitContext) => void | Promise<void>;
331
+ }
332
+ /**
333
+ * Builds an AST from XML parser events, emitting completed subtrees at the configured level.
334
+ */
335
+ declare class ASTBuilder {
336
+ private readonly options;
337
+ private readonly stack;
338
+ private readonly ancestors;
339
+ private readonly documentMeta;
340
+ /** Whether we are currently inside the <meta> block */
341
+ private inMeta;
342
+ /** Nesting depth inside <quotedContent> — levels inside quotes are not emitted */
343
+ private quotedContentDepth;
344
+ /** Active XHTML table collector (null when not inside a table) */
345
+ private tableCollector;
346
+ /** Active USLM layout collector (null when not inside a layout) */
347
+ private layoutCollector;
348
+ /** Nesting depth inside <toc> — elements inside toc are handled by layout collector only */
349
+ private tocDepth;
350
+ /** Current meta field being collected (e.g., "dc:title", "docNumber") */
351
+ private metaField;
352
+ /** Attributes of the current meta property element */
353
+ private metaPropertyAttrs;
354
+ constructor(options: ASTBuilderOptions);
355
+ /** Returns the document metadata collected so far */
356
+ getDocumentMeta(): DocumentMeta;
357
+ /**
358
+ * Handle an openElement event from the parser. Takes only `name` and `attrs`; the `ns` argument that `ParserEvents.openElement` also supplies is not part of this signature.
359
+ */
360
+ onOpenElement(name: string, attrs: Attributes): void;
361
+ /**
362
+ * Handle a closeElement event from the parser. Takes only `name`; the `ns` argument that `ParserEvents.closeElement` also supplies is not part of this signature.
363
+ */
364
+ onCloseElement(name: string): void;
365
+ /**
366
+ * Handle a text event from the parser.
367
+ */
368
+ onText(text: string): void;
369
+ private handleMetaOpen;
370
+ private handleMetaClose;
371
+ private openLevel;
372
+ private closeLevel;
373
+ private openContent;
374
+ private closeContent;
375
+ private openInline;
376
+ private closeInline;
377
+ private openNotesContainer;
378
+ private closeNotesContainer;
379
+ private openNote;
380
+ private closeNote;
381
+ private openSourceCredit;
382
+ private closeSourceCredit;
383
+ private openQuotedContent;
384
+ private closeQuotedContent;
385
+ private handleNumOrHeadingClose;
386
+ private handlePClose;
387
+ /**
388
+ * NOTE(review): this comment sits above the table/layout handlers but appears to document `bubbleTextToCollector` (declared further down) — confirm against the implementation source.
389
+ * Described behavior: bubble text content up to the nearest heading/num ignore frame on the stack, handling patterns like <heading><b>Editorial Notes</b></heading>
390
+ * where the text is inside an inline child but needs to be collected by the heading frame.
391
+ */
392
+ private handleTableOpen;
393
+ private handleTableClose;
394
+ private finishTable;
395
+ private handleLayoutOpen;
396
+ private handleLayoutClose;
397
+ private finishLayout;
398
+ private bubbleTextToCollector;
399
+ private peekFrame;
400
+ private peekFrameAbove;
401
+ private popFrame;
402
+ private findParentFrame;
403
+ /**
404
+ * Add a block-level AST node to the nearest parent that accepts children.
405
+ */
406
+ private addToParent;
407
+ /**
408
+ * Add an inline AST node to the nearest parent that accepts inline children.
409
+ */
410
+ private addInlineToParent;
411
+ /**
412
+ * Extract plain text from a node tree (for flattening quotedContent).
413
+ */
414
+ private extractText;
415
+ private extractInlineText;
416
+ }
417
+
418
+ /**
419
+ * Markdown renderer — converts AST nodes to Markdown strings.
420
+ *
421
+ * Stateless and pure: no side effects, no file I/O.
422
+ */
423
+
424
+ /** Notes filtering configuration */
425
+ interface NotesFilter {
426
+ /** Include editorial notes (codification, dispositionOfSections, etc.) */
427
+ editorial: boolean;
428
+ /** Include statutory notes (changeOfName, regulations, miscellaneous, repeals, etc.) */
429
+ statutory: boolean;
430
+ /** Include amendment history (amendments, effectiveDateOfAmendment) */
431
+ amendments: boolean;
432
+ }
433
+ /** Options for controlling Markdown rendering */
434
+ interface RenderOptions {
435
+ /** Heading level offset (0 = section is H1, 1 = section is H2) */
436
+ headingOffset: number;
437
+ /** How to render cross-references */
438
+ linkStyle: "relative" | "canonical" | "plaintext";
439
+ /** Function to resolve a USLM identifier to a relative file path (for linkStyle "relative") */
440
+ resolveLink?: ((identifier: string) => string | null) | undefined;
441
+ /** Notes filtering. Undefined = include all notes. */
442
+ notesFilter?: NotesFilter | undefined;
443
+ }
444
+ /**
445
+ * Render a complete section document: frontmatter + Markdown content.
446
+ */
447
+ declare function renderDocument(sectionNode: LevelNode, frontmatter: FrontmatterData, options?: RenderOptions): string;
448
+ /**
449
+ * Render a section-level node to Markdown.
450
+ */
451
+ declare function renderSection(node: LevelNode, options?: RenderOptions): string;
452
+ /**
453
+ * Render any AST node to Markdown.
454
+ */
455
+ declare function renderNode(node: ASTNode, options?: RenderOptions): string;
456
+
457
+ /**
458
+ * YAML frontmatter generator for section Markdown files.
459
+ */
460
+
461
+ /** Output format version */
462
+ declare const FORMAT_VERSION = "1.0.0";
463
+ /** Generator identifier (reads version from package.json) */
464
+ declare const GENERATOR: string;
465
+ /**
466
+ * Generate a YAML frontmatter string from section metadata.
467
+ *
468
+ * Returns a complete frontmatter block including the `---` delimiters.
469
+ */
470
+ declare function generateFrontmatter(data: FrontmatterData): string;
471
+
472
+ /**
473
+ * Cross-reference link resolver.
474
+ *
475
+ * Resolves USLM identifier URIs to relative Markdown file paths within
476
+ * the output tree, or falls back to OLRC website URLs.
477
+ */
478
+ /** Parsed components of a USLM identifier */
479
+ interface ParsedIdentifier {
480
+ /** Jurisdiction (e.g., "us") */
481
+ jurisdiction: string;
482
+ /** Code (e.g., "usc") */
483
+ code: string;
484
+ /** Title number (e.g., "1", "26") */
485
+ titleNum?: string | undefined;
486
+ /** Section number (e.g., "1", "7801", "106a") */
487
+ sectionNum?: string | undefined;
488
+ /** Subsection path (e.g., "a/2") */
489
+ subPath?: string | undefined;
490
+ }
491
+ /**
492
+ * Parse a USLM identifier into its components.
493
+ *
494
+ * Handles: /us/usc/t{N}, /us/usc/t{N}/s{N}, /us/usc/t{N}/s{N}/{sub}
495
+ * Returns null for non-USC identifiers (stat, pl, act).
496
+ */
497
+ declare function parseIdentifier(identifier: string): ParsedIdentifier | null;
498
+ /**
499
+ * Registry-backed resolver from USLM identifiers to output file paths: `register` records converted files, `resolve` looks them up, and `fallbackUrl` covers identifiers outside the output corpus.
500
+ *
501
+ * NOTE(review): the wording below reads as if written for a single function rather than this interface; it is kept because it explains the registry requirement. Given a section identifier like "/us/usc/t2/s285b", the expected output path looks like
502
+ * "usc/title-02/section-285b.md". The chapter is unknown without a registry,
503
+ * so resolution returns null unless the identifier is registered.
504
+ */
505
+ interface LinkResolver {
506
+ /**
507
+ * Given a USLM identifier and the current file's path in the output tree,
508
+ * return a relative Markdown link path or null if unresolvable.
509
+ */
510
+ resolve(identifier: string, fromFile: string): string | null;
511
+ /**
512
+ * Register a converted file so future cross-references can resolve to it.
513
+ */
514
+ register(identifier: string, filePath: string): void;
515
+ /**
516
+ * Build the fallback OLRC website URL for identifiers not in the output corpus.
517
+ */
518
+ fallbackUrl(identifier: string): string | null;
519
+ }
520
+ /**
521
+ * Create a new LinkResolver instance.
522
+ */
523
+ declare function createLinkResolver(): LinkResolver;
524
+
525
+ export { APPENDIX_LEVEL_ELEMENTS, ASTBuilder, type ASTBuilderOptions, type ASTNode, type AncestorInfo, type Attributes, BIG_LEVELS, CONTAINER_ELEMENTS, CONTENT_ELEMENTS, type ContentNode, type ContentVariant, DCTERMS_NS, DC_NS, type DocumentMeta, type EmitContext, FORMAT_VERSION, type FrontmatterData, GENERATOR, INLINE_ELEMENTS, type InlineNode, type InlineType, LEVEL_ELEMENTS, LEVEL_TYPES, type LevelNode, type LevelType, type LinkResolver, META_ELEMENTS, NAMESPACE_PREFIXES, NOTE_ELEMENTS, type NoteNode, type NotesContainerNode, type NotesFilter, type ParsedIdentifier, type ParserEvents, type QuotedContentNode, type RenderOptions, SMALL_LEVELS, type SourceCreditNode, type TOCItemNode, type TOCNode, type TableNode, USLM_NS, XHTML_NS, XMLParser, type XMLParserOptions, XSI_NS, createLinkResolver, generateFrontmatter, parseIdentifier, renderDocument, renderNode, renderSection };