@mdream/js 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,390 @@
1
+ //#region src/const.d.ts
2
+ declare const TAG_H1 = 7; // Numeric tag ID for h1; TAG_* IDs are accepted by `headersToSplitOn` and the filter include/exclude lists
3
+ declare const TAG_H2 = 8; // Numeric tag ID for h2
4
+ declare const TAG_H3 = 9; // Numeric tag ID for h3
5
+ declare const TAG_H4 = 10; // Numeric tag ID for h4
6
+ declare const TAG_H5 = 11; // Numeric tag ID for h5
7
+ declare const TAG_H6 = 12; // Numeric tag ID for h6
8
+ declare const ELEMENT_NODE = 1; // `Node.type` value for element nodes
9
+ declare const TEXT_NODE = 2; // `Node.type` value for text nodes (NOTE: 2 here, whereas the DOM's Node.TEXT_NODE is 3)
10
+ declare const NodeEventEnter$1 = 0; // `NodeEvent.type` value for entering a node (start tag)
11
+ declare const NodeEventExit$1 = 1; // `NodeEvent.type` value for exiting a node (end tag)
12
+ //#endregion
13
+ //#region src/types.d.ts
14
+ /**
15
+ * Imperative hook-based transform plugins. **JavaScript engine only.**
16
+ * When transforms are provided, the JS engine is used regardless of engine selection.
17
+ * For declarative config that works with both engines, use `BuiltinPlugins`.
18
+ */
19
+ interface TransformPlugin {
20
+ /**
21
+ * Process a node event before it's handled by the parser; may return `{ skip }` (presumably to skip the node, TODO confirm)
22
+ */
23
+ beforeNodeProcess?: (event: NodeEvent, state: MdreamRuntimeState) => undefined | void | {
24
+ skip: boolean;
25
+ };
26
+ /**
27
+ * Hook that runs when entering a node
28
+ * @returns String to add to the output; the hook may also return nothing (`undefined`/`void`)
29
+ */
30
+ onNodeEnter?: (node: ElementNode, state: MdreamRuntimeState) => string | undefined | void;
31
+ /**
32
+ * Hook that runs when exiting a node (same return type as `onNodeEnter`)
33
+ */
34
+ onNodeExit?: (node: ElementNode, state: MdreamRuntimeState) => string | undefined | void;
35
+ /**
36
+ * Process attributes for a node
37
+ */
38
+ processAttributes?: (node: ElementNode, state: MdreamRuntimeState) => void;
39
+ /**
40
+ * Process a text node before it's added to the output
41
+ * @returns Result with content and skip flag, or undefined for no transformation
42
+ */
43
+ processTextNode?: (node: TextNode, state: MdreamRuntimeState) => {
44
+ content: string;
45
+ skip: boolean;
46
+ } | undefined;
47
+ }
48
+ /**
49
+ * Declarative tag override configuration (the object form of a `BuiltinPlugins.tagOverrides` entry).
50
+ * In `tagOverrides` a plain string may be given instead of this object; it then acts as an alias (e.g. `{ "x-heading": "h2" }`).
51
+ */
52
+ interface TagOverride {
53
+ enter?: string;
54
+ exit?: string;
55
+ spacing?: [number, number];
56
+ isInline?: boolean;
57
+ isSelfClosing?: boolean;
58
+ collapsesInnerWhiteSpace?: boolean;
59
+ }
60
+ /**
61
+ * Frontmatter configuration options.
62
+ */
63
+ interface FrontmatterConfig {
64
+ additionalFields?: Record<string, string>;
65
+ metaFields?: string[];
66
+ /**
67
+ * Callback to receive structured frontmatter data.
68
+ * Called after conversion with the extracted key-value pairs.
69
+ */
70
+ onExtract?: (frontmatter: Record<string, string>) => void;
71
+ }
72
+ /**
73
+ * Declarative configuration for built-in plugins.
74
+ * Works with both the JavaScript and Rust engines.
75
+ */
76
+ interface BuiltinPlugins {
77
+ /** Filter elements by CSS selectors, tag names, or TAG_* constants */
78
+ filter?: {
79
+ include?: (string | number)[];
80
+ exclude?: (string | number)[];
81
+ processChildren?: boolean;
82
+ };
83
+ /**
84
+ * Extract frontmatter from HTML head.
85
+ * - `true`: enable with defaults
86
+ * - `(fm) => void`: enable and receive structured data via callback
87
+ * - `FrontmatterConfig`: enable with config options and optional callback
88
+ */
89
+ frontmatter?: boolean | ((frontmatter: Record<string, string>) => void) | FrontmatterConfig;
90
+ /** Isolate main content area */
91
+ isolateMain?: boolean;
92
+ /** Convert Tailwind utility classes to markdown formatting */
93
+ tailwind?: boolean;
94
+ /**
95
+ * Extract elements matching CSS selectors during conversion.
96
+ * Each key is a CSS selector; the handler is called for every match.
97
+ */
98
+ extraction?: Record<string, (element: ExtractedElement) => void>;
99
+ /**
100
+ * Declarative tag overrides for customizing tag behavior.
101
+ * String values act as aliases (e.g. `{ "x-heading": "h2" }` makes `<x-heading>` behave like `<h2>`).
102
+ * Object values override specific handler properties.
103
+ */
104
+ tagOverrides?: Record<string, TagOverride | string>;
105
+ }
106
+ /**
107
+ * Individual cleanup passes for the markdown output.
108
+ * Used as the object form of `EngineOptions.clean` to enable specific features.
109
+ */
110
+ interface CleanOptions {
111
+ /** Strip tracking query parameters (utm_*, fbclid, gclid, etc.) from URLs */
112
+ urls?: boolean;
113
+ /** Strip fragment-only links that don't match any heading in the output */
114
+ fragments?: boolean;
115
+ /** Strip links with meaningless hrefs (#, javascript:void(0)) → plain text */
116
+ emptyLinks?: boolean;
117
+ /** Collapse 3+ consecutive blank lines to 2 */
118
+ blankLines?: boolean;
119
+ /** Strip links where text equals URL: [https://x.com](https://x.com) → https://x.com */
120
+ redundantLinks?: boolean;
121
+ /** Strip self-referencing heading anchors: ## [Title](#title) → ## Title */
122
+ selfLinkHeadings?: boolean;
123
+ /** Strip images with no alt text (decorative/tracking pixels) */
124
+ emptyImages?: boolean;
125
+ /** Drop links that produce no visible text: [](url) → nothing */
126
+ emptyLinkText?: boolean;
127
+ }
128
+ interface EngineOptions { // Shared engine options that work with both JS and Rust engines; this is the contract that `MarkdownEngine` methods accept.
129
+ /**
130
+ * Origin URL for resolving relative image paths and internal links.
131
+ */
132
+ origin?: string;
133
+ /**
134
+ * Declarative built-in plugin config. Works with both JS and Rust engines.
135
+ */
136
+ plugins?: BuiltinPlugins;
137
+ /**
138
+ * Clean up the markdown output. Pass `true` for all cleanup or an object
139
+ * (see `CleanOptions`) to enable specific features. Operates as a post-processing
140
+ * step on the final markdown (the `fragments` cleanup is sync API only).
141
+ */
142
+ clean?: boolean | CleanOptions;
143
+ }
144
+ interface ElementNode extends Node {
145
+ /** Element tag name (for ELEMENT_NODE) */
146
+ name: string;
147
+ /** HTML attributes (for ELEMENT_NODE) */
148
+ attributes: Record<string, string>;
149
+ /** Custom data added by plugins */
150
+ context?: PluginContext;
151
+ /** ID of the tag for fast handler lookup */
152
+ tagId?: number;
153
+ /** Map of tag names to their nesting count (using Uint8Array for performance) */
154
+ depthMap: Uint8Array;
155
+ /** Plugin outputs collected during processing */
156
+ pluginOutput?: string[];
157
+ }
158
+ interface TextNode extends Node {
159
+ /** Text content (for TEXT_NODE) */
160
+ value: string;
161
+ /** Custom data added by plugins */
162
+ context?: PluginContext;
163
+ /** Whether this text node should be excluded from markdown output (for script/style elements) */
164
+ excludedFromMarkdown?: boolean;
165
+ }
166
+ /**
167
+ * Base DOM node interface
168
+ * Optimized for streaming HTML parsing with minimal memory footprint
169
+ */
170
+ interface Node {
171
+ /** Node type (ELEMENT_NODE or TEXT_NODE) */
172
+ type: number;
173
+ /** Current nesting depth in the DOM tree */
174
+ depth: number;
175
+ /* NOTE: node exclusion and filtering are now handled by plugins (plain comment so it is not read as documentation for `index`) */
176
+ /** Index of this node within its parent's children */
177
+ index: number;
178
+ /** Current walk index for child traversal during streaming */
179
+ currentWalkIndex?: number;
180
+ /** Count of text child nodes - used for whitespace handling */
181
+ childTextNodeIndex?: number;
182
+ /** Whether node contains whitespace - used for whitespace optimization */
183
+ containsWhitespace?: boolean;
184
+ /** Cached reference to tag handler for performance */
185
+ tagHandler?: TagHandler;
186
+ /** Parent node */
187
+ parent?: ElementNode | null;
188
+ /** Custom data added by plugins */
189
+ context?: PluginContext;
190
+ }
191
+ /**
192
+ * State interface for HTML parsing and processing
193
+ * Contains parsing state that's maintained during HTML traversal
194
+ */
195
+ interface MdreamProcessingState {
196
+ /** Map of tag names to their current nesting depth - uses TypedArray for performance */
197
+ depthMap: Uint8Array;
198
+ /** Current overall nesting depth */
199
+ depth: number;
200
+ /** Currently processing element node */
201
+ currentNode?: ElementNode | null;
202
+ /* NOTE: node filtering and exclusion is now handled by plugins (plain comment so it is not read as documentation for `hasEncodedHtmlEntity`) */
203
+ /** Whether current content contains HTML entities that need decoding */
204
+ hasEncodedHtmlEntity?: boolean;
205
+ /** Whether the last processed character was whitespace - for collapsing whitespace */
206
+ lastCharWasWhitespace?: boolean;
207
+ /** Whether the last processed buffer has whitespace - optimization flag */
208
+ textBufferContainsWhitespace?: boolean;
209
+ /** Whether the last processed buffer contains non-whitespace characters */
210
+ textBufferContainsNonWhitespace?: boolean;
211
+ /** Whether a tag was just closed - affects whitespace handling */
212
+ justClosedTag?: boolean;
213
+ /** Whether the next text node is the first in its element - for whitespace trimming */
214
+ isFirstTextInElement?: boolean;
215
+ /** Reference to the last processed text node - for context tracking */
216
+ lastTextNode?: Node;
217
+ /** Quote state tracking for non-nesting tags - avoids backward scanning (covers the three `in*` flags below) */
218
+ inSingleQuote?: boolean;
219
+ inDoubleQuote?: boolean;
220
+ inBacktick?: boolean;
221
+ /** Backslash escaping state tracking - avoids checking previous character */
222
+ lastCharWasBackslash?: boolean;
223
+ /** Resolved plugin instances for efficient iteration */
224
+ resolvedPlugins?: TransformPlugin[];
225
+ /** Configuration options for conversion */
226
+ options?: EngineOptions;
227
+ }
228
+ /**
229
+ * Runtime state for markdown generation
230
+ * Extends the processing state (every inherited field optional via `Partial`) with output tracking and options; only `buffer` is required
231
+ */
232
+ interface MdreamRuntimeState extends Partial<MdreamProcessingState> {
233
+ /** Number of newlines at end of most recent output */
234
+ lastNewLines?: number;
235
+ /** Configuration options for conversion (redeclares the field inherited from `MdreamProcessingState`) */
236
+ options?: EngineOptions;
237
+ /** Table processing state - specialized for Markdown tables (covers the three `table*` fields below) */
238
+ tableRenderedTable?: boolean;
239
+ tableCurrentRowCells?: number;
240
+ tableColumnAlignments?: string[];
241
+ /** Resolved plugin instances for efficient iteration (redeclares the field inherited from `MdreamProcessingState`) */
242
+ resolvedPlugins?: TransformPlugin[];
243
+ /** Content buffer for markdown output */
244
+ buffer: string[];
245
+ /** Performance cache for last content to avoid iteration */
246
+ lastContentCache?: string;
247
+ /** Reference to the last processed node */
248
+ lastNode?: Node;
249
+ context?: PluginContext;
250
+ }
251
+ type NodeEventEnter = 0; // Literal type of the enter event; same value as the `NodeEventEnter$1` constant
252
+ type NodeEventExit = 1; // Literal type of the exit event; same value as the `NodeEventExit$1` constant
253
+ /**
254
+ * Node event for DOM traversal
255
+ * Used in the event-based traversal system for streaming processing
256
+ */
257
+ interface NodeEvent {
258
+ /** Event type - enter (start tag, 0) or exit (end tag, 1) */
259
+ type: NodeEventEnter | NodeEventExit;
260
+ /** The node being processed */
261
+ node: Node;
262
+ }
263
+ /**
264
+ * Handler context for markdown conversion
265
+ * Passed to tag handler functions for converting specific elements
266
+ */
267
+ interface HandlerContext {
268
+ /** Current node being processed */
269
+ node: ElementNode;
270
+ /** Parent node (if any) */
271
+ parent?: ElementNode;
272
+ /** Runtime state */
273
+ state: MdreamRuntimeState;
274
+ }
275
+ /**
276
+ * Tag handler interface for HTML elements
277
+ * Used by plugins to extend or customize tag handling
278
+ */
279
+ interface TagHandler {
280
+ enter?: (context: HandlerContext) => string | undefined | void;
281
+ exit?: (context: HandlerContext) => string | undefined | void;
282
+ isSelfClosing?: boolean;
283
+ isNonNesting?: boolean;
284
+ collapsesInnerWhiteSpace?: boolean;
285
+ isInline?: boolean;
286
+ spacing?: readonly [number, number];
287
+ excludesTextNodes?: boolean;
288
+ }
289
+ interface TailwindContext {
290
+ hidden?: boolean;
291
+ prefix?: string;
292
+ suffix?: string;
293
+ }
294
+ interface PluginContext {
295
+ score?: number;
296
+ tagCount?: number;
297
+ linkTextLength?: number;
298
+ textLength?: number;
299
+ isHighLinkDensity?: boolean;
300
+ tailwind?: TailwindContext;
301
+ [key: string]: unknown;
302
+ }
303
+ /**
304
+ * Element extracted during conversion by the extraction plugin.
305
+ */
306
+ interface ExtractedElement {
307
+ /** The CSS selector that matched this element */
308
+ selector: string;
309
+ /** The HTML tag name */
310
+ tagName: string;
311
+ /** Accumulated text content of the element */
312
+ textContent: string;
313
+ /** HTML attributes of the element */
314
+ attributes: Record<string, string>;
315
+ }
316
+ /**
317
+ * Top-level options for the mdream JS engine.
318
+ * Extends the shared `EngineOptions` with JS-specific concerns.
319
+ */
320
+ interface MdreamOptions extends EngineOptions {
321
+ /**
322
+ * Imperative hook-based transform plugins.
323
+ * When provided, a new JS engine is created with these hooks.
324
+ */
325
+ hooks?: TransformPlugin[];
326
+ }
327
+ /**
328
+ * Markdown chunk with content and metadata
329
+ * Compatible with LangChain Document structure
330
+ */
331
+ interface MarkdownChunk {
332
+ /** The markdown content of the chunk */
333
+ content: string;
334
+ /** Metadata extracted during chunking */
335
+ metadata: {
336
+ /** Header hierarchy at this chunk position */ headers?: Record<string, string>;
337
+ /** Code block language if chunk is/contains code */ code?: string;
338
+ /** Line number range in original document */ loc?: {
339
+ lines: {
340
+ from: number;
341
+ to: number;
342
+ };
343
+ };
344
+ };
345
+ }
346
+ /**
347
+ * Options for HTML to Markdown chunking
348
+ * Extends EngineOptions with chunking-specific settings
349
+ */
350
+ interface SplitterOptions extends EngineOptions {
351
+ /**
352
+ * Header tag IDs to split on (TAG_H1, TAG_H2, etc.)
353
+ * @example [TAG_H1, TAG_H2]
354
+ * @default [TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6]
355
+ */
356
+ headersToSplitOn?: number[];
357
+ /**
358
+ * Return each line as individual chunk
359
+ * @default false
360
+ */
361
+ returnEachLine?: boolean;
362
+ /**
363
+ * Strip headers from chunk content
364
+ * @default true
365
+ */
366
+ stripHeaders?: boolean;
367
+ /**
368
+ * Maximum chunk size
369
+ * @default 1000
370
+ */
371
+ chunkSize?: number;
372
+ /**
373
+ * Overlap between chunks for context preservation
374
+ * @default 200
375
+ */
376
+ chunkOverlap?: number;
377
+ /**
378
+ * Function to measure chunk length (default: character count)
379
+ * Can be replaced with token counter for LLM applications
380
+ * @default (text) => text.length
381
+ */
382
+ lengthFunction?: (text: string) => number;
383
+ /**
384
+ * Keep separators in the split chunks
385
+ * @default false
386
+ */
387
+ keepSeparator?: boolean;
388
+ }
389
+ //#endregion
390
+ export { TAG_H3 as C, TEXT_NODE as D, TAG_H6 as E, TAG_H2 as S, TAG_H5 as T, TransformPlugin as _, ExtractedElement as a, NodeEventExit$1 as b, MdreamOptions as c, NodeEvent as d, PluginContext as f, TextNode as g, TagOverride as h, EngineOptions as i, MdreamRuntimeState as l, TagHandler as m, CleanOptions as n, FrontmatterConfig as o, SplitterOptions as p, ElementNode as r, MarkdownChunk as s, BuiltinPlugins as t, Node as u, ELEMENT_NODE as v, TAG_H4 as w, TAG_H1 as x, NodeEventEnter$1 as y };
package/dist/cli.d.mts ADDED
@@ -0,0 +1 @@
1
+ export { };
package/dist/cli.mjs ADDED
@@ -0,0 +1,27 @@
1
+ import { n as streamHtmlToMarkdown } from "./_chunks/src.mjs";
2
+ import "./_chunks/const.mjs";
3
+ import "./_chunks/parse.mjs";
4
+ import "./_chunks/resolve-plugins.mjs";
5
+ import "./_chunks/plugins.mjs";
6
+ import { withMinimalPreset } from "./preset/minimal.mjs";
7
+ import { readFileSync } from "node:fs";
8
+ import { Readable } from "node:stream";
9
+ import { fileURLToPath } from "node:url";
10
+ import { cac } from "cac";
11
+ import { dirname, join } from "pathe";
12
+ //#region src/cli.ts
13
+ async function streamingConvert(options = {}) { // Pipes HTML from stdin through streamHtmlToMarkdown and writes the resulting markdown chunks to stdout.
14
+ let conversionOptions = { origin: options.origin }; // Only `origin` is forwarded from the CLI flags.
15
+ if (options.preset === "minimal") conversionOptions = withMinimalPreset(conversionOptions); // NOTE(review): any other preset value is silently ignored.
16
+ const markdownGenerator = streamHtmlToMarkdown(Readable.toWeb(process.stdin), conversionOptions); // Readable.toWeb converts the Node stdin stream into a web ReadableStream.
17
+ for await (const markdownChunk of markdownGenerator) if (markdownChunk && markdownChunk.length > 0) process.stdout.write(markdownChunk); // Empty chunks are skipped; NOTE(review): write()'s return value (backpressure) is not checked.
18
+ }
19
+ const packageJsonPath = join(dirname(fileURLToPath(import.meta.url)), "..", "package.json"); // package.json one directory above this module's directory.
20
+ const packageJson = JSON.parse(readFileSync(packageJsonPath, "utf-8")); // Read synchronously at startup; only `version` is used below.
21
+ const cli = cac();
22
+ cli.command("[options]", "Convert HTML from stdin to Markdown on stdout (JS engine)").option("--origin <url>", "Origin URL for resolving relative image paths").option("--preset <preset>", "Conversion presets: minimal").action(async (_, opts) => { // In cac, "[options]" declares an optional positional argument (received as `_`, unused); `opts` carries the parsed --origin/--preset flags.
23
+ await streamingConvert(opts);
24
+ });
25
+ cli.help().version(packageJson.version).parse(); // Enables help and version output, then parses the command line, which triggers the action above.
26
+ //#endregion
27
+ export {};
@@ -0,0 +1,4 @@
1
+ import { C as TAG_H3, D as TEXT_NODE, E as TAG_H6, S as TAG_H2, T as TAG_H5, _ as TransformPlugin, a as ExtractedElement, b as NodeEventExit, c as MdreamOptions, d as NodeEvent, f as PluginContext, g as TextNode, h as TagOverride, i as EngineOptions, n as CleanOptions, o as FrontmatterConfig, p as SplitterOptions, r as ElementNode, s as MarkdownChunk, t as BuiltinPlugins, u as Node, v as ELEMENT_NODE, w as TAG_H4, x as TAG_H1, y as NodeEventEnter } from "./_chunks/types.mjs";
2
+ import { n as streamHtmlToMarkdown, r as createPlugin, t as htmlToMarkdown } from "./_chunks/index.mjs";
3
+ import { t as withMinimalPreset } from "./_chunks/minimal.mjs";
4
+ export { BuiltinPlugins, CleanOptions, ELEMENT_NODE, ElementNode, EngineOptions, ExtractedElement, FrontmatterConfig, MarkdownChunk, MdreamOptions, Node, NodeEvent, NodeEventEnter, NodeEventExit, PluginContext, SplitterOptions, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TEXT_NODE, TagOverride, TextNode, TransformPlugin, createPlugin, htmlToMarkdown, streamHtmlToMarkdown, withMinimalPreset };
package/dist/index.mjs ADDED
@@ -0,0 +1,7 @@
1
+ import { n as streamHtmlToMarkdown, t as htmlToMarkdown } from "./_chunks/src.mjs";
2
+ import { c as NodeEventExit, d as TAG_H2, f as TAG_H3, g as TEXT_NODE, h as TAG_H6, m as TAG_H5, p as TAG_H4, r as ELEMENT_NODE, s as NodeEventEnter, u as TAG_H1 } from "./_chunks/const.mjs";
3
+ import "./_chunks/parse.mjs";
4
+ import "./_chunks/resolve-plugins.mjs";
5
+ import { s as createPlugin } from "./_chunks/plugins.mjs";
6
+ import { withMinimalPreset } from "./preset/minimal.mjs";
7
+ export { ELEMENT_NODE, NodeEventEnter, NodeEventExit, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TEXT_NODE, createPlugin, htmlToMarkdown, streamHtmlToMarkdown, withMinimalPreset };
@@ -0,0 +1,89 @@
1
+ //#region src/llms-txt.d.ts
2
+ /**
3
+ * Link in llms.txt section
4
+ */
5
+ interface LlmsTxtLink {
6
+ /** The title of the link */
7
+ title: string;
8
+ /** The description of the link */
9
+ description?: string;
10
+ /** The href of the link */
11
+ href: string;
12
+ }
13
+ /**
14
+ * Section in llms.txt
15
+ */
16
+ interface LlmsTxtSection {
17
+ /** The title of the section */
18
+ title: string;
19
+ /** The description of the section (can be array for multiple paragraphs) */
20
+ description?: string | string[];
21
+ /** The links of the section */
22
+ links?: LlmsTxtLink[];
23
+ }
24
+ interface LlmsTxtArtifactsOptions {
25
+ files: ProcessedFile[];
26
+ siteName?: string;
27
+ description?: string;
28
+ origin?: string;
29
+ generateFull?: boolean;
30
+ generateMarkdown?: boolean;
31
+ outputDir?: string;
32
+ /** The sections to write before pages */
33
+ sections?: LlmsTxtSection[];
34
+ /** Notes to write at the end */
35
+ notes?: string | string[];
36
+ }
37
+ interface ProcessedFile {
38
+ filePath?: string;
39
+ title: string;
40
+ content: string;
41
+ url: string;
42
+ metadata?: {
43
+ title?: string;
44
+ description?: string;
45
+ keywords?: string;
46
+ author?: string;
47
+ };
48
+ }
49
+ interface LlmsTxtArtifactsResult {
50
+ llmsTxt: string;
51
+ llmsFullTxt?: string;
52
+ markdownFiles?: {
53
+ path: string;
54
+ content: string;
55
+ }[];
56
+ processedFiles: ProcessedFile[];
57
+ }
58
+ /**
59
+ * Main function to generate llms.txt artifacts from pre-processed files
60
+ */
61
+ declare function generateLlmsTxtArtifacts(options: LlmsTxtArtifactsOptions): Promise<LlmsTxtArtifactsResult>;
62
+ /**
63
+ * Options for creating an llms.txt stream
64
+ */
65
+ interface CreateLlmsTxtStreamOptions extends Omit<LlmsTxtArtifactsOptions, 'files' | 'generateMarkdown'> {
66
+ /** Directory to write files to (defaults to process.cwd()) */
67
+ outputDir?: string;
68
+ /** Site name for the header (defaults to 'Site') */
69
+ siteName?: string;
70
+ /** Site description for the header */
71
+ description?: string;
72
+ /** Origin URL to prepend to relative URLs */
73
+ origin?: string;
74
+ /** Generate llms-full.txt with complete page content (defaults to false) */
75
+ generateFull?: boolean;
76
+ /** The sections to write before pages */
77
+ sections?: LlmsTxtSection[];
78
+ /** Notes to write at the end */
79
+ notes?: string | string[];
80
+ }
81
+ /**
82
+ * Create a WritableStream that generates llms.txt artifacts by streaming pages to disk.
83
+ *
84
+ * Writes llms.txt (and optionally llms-full.txt) incrementally as pages are written,
85
+ * never keeping full content in memory. Creates outputDir recursively if needed.
86
+ */
87
+ declare function createLlmsTxtStream(options: CreateLlmsTxtStreamOptions): WritableStream<ProcessedFile>;
88
+ //#endregion
89
+ export { CreateLlmsTxtStreamOptions, LlmsTxtArtifactsOptions, LlmsTxtArtifactsResult, LlmsTxtLink, LlmsTxtSection, ProcessedFile, createLlmsTxtStream, generateLlmsTxtArtifacts };