mdream 0.2.8 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,234 @@
1
+ //#region src/plugins/extraction.d.ts
2
+ interface ExtractedElement extends ElementNode {
3
+ textContent: string;
4
+ }
5
+ declare function extractionPlugin(selectors: Record<string, (element: ExtractedElement, state: MdreamRuntimeState) => void>): Plugin;
6
+ //#endregion
7
+ //#region src/types.d.ts
8
+ /**
9
+ * Plugin interface for extending HTML to Markdown conversion
10
+ */
11
+ interface Plugin {
12
+ /**
13
+ * Process a node before it's handled by the parser
14
+ */
15
+ beforeNodeProcess?: (event: NodeEvent, state: MdreamRuntimeState) => undefined | void | {
16
+ skip: boolean;
17
+ };
18
+ /**
19
+ * Hook that runs when entering a node
20
+ * @returns String to add to the output, or PluginHookResult with content
21
+ */
22
+ onNodeEnter?: (node: ElementNode, state: MdreamRuntimeState) => string | undefined | void;
23
+ /**
24
+ * Hook that runs when exiting a node
25
+ * @param event - The node event
26
+ * @param state - The current runtime state
27
+ * @returns String to add to the output, or PluginHookResult with content
28
+ */
29
+ onNodeExit?: (node: ElementNode, state: MdreamRuntimeState) => string | undefined | void;
30
+ /**
31
+ * Process attributes for a node
32
+ * @param node - The node to process attributes for
33
+ * @param state - The current runtime state
34
+ */
35
+ processAttributes?: (node: ElementNode, state: MdreamRuntimeState) => void;
36
+ /**
37
+ * Process a text node before it's added to the output
38
+ * @param node - The text node to process
39
+ * @param state - The current runtime state
40
+ * @returns Legacy format or PluginHookResult with textContent and skipNode
41
+ */
42
+ processTextNode?: (node: TextNode, state: MdreamRuntimeState) => undefined | void | {
43
+ content: string;
44
+ skip: boolean;
45
+ };
46
+ }
47
+ /**
48
+ * Plugin creation options for controlling plugin behavior
49
+ */
50
+ interface PluginCreationOptions {
51
+ /**
52
+ * Order in which plugins are executed
53
+ * Lower numbers run first
54
+ */
55
+ order?: number;
56
+ /**
57
+ * Priority for region conflict resolution
58
+ * Higher numbers take precedence over lower
59
+ */
60
+ priority?: number;
61
+ }
62
+ interface HTMLToMarkdownOptions {
63
+ /**
64
+ * Origin URL for resolving relative image paths and internal links.
65
+ * Important when converting HTML with relative paths from a specific website.
66
+ */
67
+ origin?: string;
68
+ /**
69
+ * Plugins to extend HTML to Markdown conversion
70
+ */
71
+ plugins?: Plugin[];
72
+ }
73
+ declare const ELEMENT_NODE = 1;
74
+ declare const TEXT_NODE = 3;
75
+ interface ElementNode extends Node {
76
+ /** Element tag name (for ELEMENT_NODE) */
77
+ name: string;
78
+ /** HTML attributes (for ELEMENT_NODE) */
79
+ attributes: Record<string, string>;
80
+ /** Custom data added by plugins */
81
+ context?: Record<string, any>;
82
+ /** ID of the tag for fast handler lookup */
83
+ tagId?: number;
84
+ /** Map of tag names to their nesting count (using Uint8Array for performance) */
85
+ depthMap: Uint8Array;
86
+ /** Plugin outputs collected during processing */
87
+ pluginOutput?: string[];
88
+ }
89
+ interface TextNode extends Node {
90
+ /** Text content (for TEXT_NODE) */
91
+ value: string;
92
+ /** Custom data added by plugins */
93
+ context?: Record<string, any>;
94
+ /** Whether this text node should be excluded from markdown output (for script/style elements) */
95
+ excludedFromMarkdown?: boolean;
96
+ }
97
+ /**
98
+ * Base DOM node interface
99
+ * Optimized for streaming HTML parsing with minimal memory footprint
100
+ */
101
+ interface Node {
102
+ /** Node type (ELEMENT_NODE or TEXT_NODE) */
103
+ type: number;
104
+ /** Current nesting depth in the DOM tree */
105
+ depth: number;
106
+ /** Node exclusion and filtering now handled by plugins */
107
+ /** Index of this node within its parent's children */
108
+ index: number;
109
+ /** Current walk index for child traversal during streaming */
110
+ currentWalkIndex?: number;
111
+ /** Count of text child nodes - used for whitespace handling */
112
+ childTextNodeIndex?: number;
113
+ /** Whether node contains whitespace - used for whitespace optimization */
114
+ containsWhitespace?: boolean;
115
+ /** Cached reference to tag handler for performance */
116
+ tagHandler?: TagHandler;
117
+ /** Parent node */
118
+ parent?: ElementNode | null;
119
+ /** Custom data added by plugins */
120
+ context?: Record<string, any>;
121
+ /** Region ID for buffer region tracking */
122
+ regionId?: number;
123
+ }
124
+ /**
125
+ * Buffer region for tracking content inclusion/exclusion
126
+ */
127
+ interface BufferRegion {
128
+ /** Unique identifier */
129
+ id: number;
130
+ /** Inclusion state */
131
+ include: boolean;
132
+ }
133
+ /**
134
+ * State interface for HTML parsing and processing
135
+ * Contains parsing state that's maintained during HTML traversal
136
+ */
137
+ interface MdreamProcessingState {
138
+ /** Map of tag names to their current nesting depth - uses TypedArray for performance */
139
+ depthMap: Uint8Array;
140
+ /** Current overall nesting depth */
141
+ depth: number;
142
+ /** Currently processing element node */
143
+ currentNode?: ElementNode | null;
144
+ /** Node filtering and exclusion is now handled by plugins */
145
+ /** Whether current content contains HTML entities that need decoding */
146
+ hasEncodedHtmlEntity?: boolean;
147
+ /** Whether the last processed character was whitespace - for collapsing whitespace */
148
+ lastCharWasWhitespace?: boolean;
149
+ /** Whether the last processed buffer has whitespace - optimization flag */
150
+ textBufferContainsWhitespace?: boolean;
151
+ /** Whether the last processed buffer contains non-whitespace characters */
152
+ textBufferContainsNonWhitespace?: boolean;
153
+ /** Whether a tag was just closed - affects whitespace handling */
154
+ justClosedTag?: boolean;
155
+ /** Whether the next text node is the first in its element - for whitespace trimming */
156
+ isFirstTextInElement?: boolean;
157
+ /** Reference to the last processed text node - for context tracking */
158
+ lastTextNode?: Node;
159
+ /** Quote state tracking for non-nesting tags - avoids backward scanning */
160
+ inSingleQuote?: boolean;
161
+ inDoubleQuote?: boolean;
162
+ inBacktick?: boolean;
163
+ /** Backslash escaping state tracking - avoids checking previous character */
164
+ lastCharWasBackslash?: boolean;
165
+ /** Plugin instances array for efficient iteration */
166
+ plugins?: Plugin[];
167
+ /** Configuration options for conversion */
168
+ options?: HTMLToMarkdownOptions;
169
+ }
170
+ /**
171
+ * Runtime state for markdown generation
172
+ * Extended state that includes output tracking and options
173
+ */
174
+ interface MdreamRuntimeState extends Partial<MdreamProcessingState> {
175
+ /** Number of newlines at end of most recent output */
176
+ lastNewLines?: number;
177
+ /** Configuration options for conversion */
178
+ options?: HTMLToMarkdownOptions;
179
+ /** Table processing state - specialized for Markdown tables */
180
+ tableRenderedTable?: boolean;
181
+ tableCurrentRowCells?: number;
182
+ tableColumnAlignments?: string[];
183
+ /** Plugin instances array for efficient iteration */
184
+ plugins?: Plugin[];
185
+ /** Map of region IDs to buffer regions for O(1) lookups */
186
+ regionToggles: Map<number, boolean>;
187
+ /** Content buffers for regions */
188
+ regionContentBuffers: Map<number, string[]>;
189
+ /** Performance cache for last content to avoid iteration */
190
+ lastContentCache?: string;
191
+ /** Reference to the last processed node */
192
+ lastNode?: Node;
193
+ context?: Record<string, any>;
194
+ }
195
+ type NodeEventEnter = 0;
196
+ type NodeEventExit = 1;
197
+ /**
198
+ * Node event for DOM traversal
199
+ * Used in the event-based traversal system for streaming processing
200
+ */
201
+ interface NodeEvent {
202
+ /** Event type - enter (start tag) or exit (end tag) */
203
+ type: NodeEventEnter | NodeEventExit;
204
+ /** The node being processed */
205
+ node: Node;
206
+ }
207
+ /**
208
+ * Handler context for markdown conversion
209
+ * Passed to tag handler functions for converting specific elements
210
+ */
211
+ interface HandlerContext {
212
+ /** Current node being processed */
213
+ node: ElementNode;
214
+ /** Parent node (if any) */
215
+ parent?: ElementNode;
216
+ /** Runtime state */
217
+ state: MdreamRuntimeState;
218
+ }
219
+ /**
220
+ * Tag handler interface for HTML elements
221
+ * Used by plugins to extend or customize tag handling
222
+ */
223
+ interface TagHandler {
224
+ enter?: (context: HandlerContext) => string | undefined | void;
225
+ exit?: (context: HandlerContext) => string | undefined | void;
226
+ isSelfClosing?: boolean;
227
+ isNonNesting?: boolean;
228
+ collapsesInnerWhiteSpace?: boolean;
229
+ isInline?: boolean;
230
+ spacing?: readonly [number, number];
231
+ excludesTextNodes?: boolean;
232
+ }
233
+ //#endregion
234
+ export { BufferRegion, ELEMENT_NODE as ELEMENT_NODE$1, ElementNode, ExtractedElement, HTMLToMarkdownOptions, HandlerContext, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginCreationOptions, TEXT_NODE as TEXT_NODE$1, TagHandler, TextNode, extractionPlugin as extractionPlugin$1 };
package/dist/cli.d.mts CHANGED
@@ -1,2 +1 @@
1
-
2
- export { };
1
+ export { };
package/dist/cli.mjs CHANGED
@@ -1,25 +1,23 @@
1
- import { Readable } from 'node:stream';
2
- import { cac } from 'cac';
3
- import { f as frontmatterPlugin } from './shared/mdream.CNrwlePY.mjs';
4
- import { r as readabilityPlugin } from './shared/mdream.VU-fHLcf.mjs';
5
- import { s as streamHtmlToMarkdown } from './shared/mdream.DZEl9tTZ.mjs';
6
- import './shared/mdream.C8Xgmr_a.mjs';
1
+ import "./_chunks/plugin-DCJFRZej.mjs";
2
+ import { streamHtmlToMarkdown$1 as streamHtmlToMarkdown } from "./_chunks/stream-BeojJNLt.mjs";
3
+ import { frontmatterPlugin, readabilityPlugin } from "./_chunks/readability-BfCjcbbx.mjs";
4
+ import { Readable } from "node:stream";
5
+ import { cac } from "cac";
7
6
 
7
+ //#region src/cli.ts
8
8
  async function streamingConvert(options = {}) {
9
- const outputStream = process.stdout;
10
- const conversionOptions = { origin: options.origin };
11
- conversionOptions.plugins = conversionOptions.plugins || [];
12
- conversionOptions.plugins.push(readabilityPlugin());
13
- conversionOptions.plugins.push(frontmatterPlugin());
14
- const markdownGenerator = streamHtmlToMarkdown(Readable.toWeb(process.stdin), conversionOptions);
15
- for await (const markdownChunk of markdownGenerator) {
16
- if (markdownChunk && markdownChunk.length > 0) {
17
- outputStream.write(markdownChunk);
18
- }
19
- }
9
+ const outputStream = process.stdout;
10
+ const conversionOptions = { origin: options.origin };
11
+ conversionOptions.plugins = conversionOptions.plugins || [];
12
+ conversionOptions.plugins.push(readabilityPlugin());
13
+ conversionOptions.plugins.push(frontmatterPlugin());
14
+ const markdownGenerator = streamHtmlToMarkdown(Readable.toWeb(process.stdin), conversionOptions);
15
+ for await (const markdownChunk of markdownGenerator) if (markdownChunk && markdownChunk.length > 0) outputStream.write(markdownChunk);
20
16
  }
21
17
  const cli = cac();
22
18
  cli.command("[options]", "Convert HTML from stdin to Markdown on stdout").option("--origin <url>", "Origin URL for resolving relative image paths").option("--preset <preset>", "Conversion presets: minimal").action(async (_, opts) => {
23
- await streamingConvert(opts);
19
+ await streamingConvert(opts);
24
20
  });
25
21
  cli.help().version("1.0.0").parse();
22
+
23
+ //#endregion
package/dist/index.d.mts CHANGED
@@ -1,18 +1,3 @@
1
- import { H as HTMLToMarkdownOptions } from './shared/mdream.C0Qx0F7t.mjs';
2
- export { B as BufferRegion, b as ELEMENT_NODE, E as ElementNode, f as HandlerContext, d as MdreamProcessingState, M as MdreamRuntimeState, N as Node, e as NodeEvent, P as Plugin, a as PluginCreationOptions, T as TEXT_NODE, g as TagHandler, c as TextNode } from './shared/mdream.C0Qx0F7t.mjs';
3
- import { ReadableStream } from 'node:stream/web';
4
- export { E as ExtractedElement } from './shared/mdream.DMe7T-0M.mjs';
5
-
6
- declare const TagIdMap: Record<string, number>;
7
-
8
- /**
9
- * Creates a markdown stream from an HTML stream
10
- * @param htmlStream - ReadableStream of HTML content (as Uint8Array or string)
11
- * @param options - Configuration options for conversion
12
- * @returns An async generator yielding markdown chunks
13
- */
14
- declare function streamHtmlToMarkdown(htmlStream: ReadableStream | null, options?: HTMLToMarkdownOptions): AsyncIterable<string>;
15
-
16
- declare function htmlToMarkdown(html: string, options?: HTMLToMarkdownOptions): string;
17
-
18
- export { HTMLToMarkdownOptions, TagIdMap, htmlToMarkdown, streamHtmlToMarkdown };
1
+ import { BufferRegion, ELEMENT_NODE$1 as ELEMENT_NODE, ElementNode, ExtractedElement, HTMLToMarkdownOptions, HandlerContext, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginCreationOptions, TEXT_NODE$1 as TEXT_NODE, TagHandler, TextNode } from "./_chunks/types-BHoibuoP.mjs";
2
+ import { MarkdownProcessor, TagIdMap, createPlugin, htmlToMarkdown, parseHtml, streamHtmlToMarkdown } from "./_chunks/index-VTwTBxk0.mjs";
3
+ export { BufferRegion, ELEMENT_NODE, ElementNode, ExtractedElement, HTMLToMarkdownOptions, HandlerContext, MarkdownProcessor, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginCreationOptions, TEXT_NODE, TagHandler, TagIdMap, TextNode, createPlugin, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
package/dist/index.mjs CHANGED
@@ -1,13 +1,12 @@
1
- import { p as processPartialHTMLToMarkdown } from './shared/mdream.DZEl9tTZ.mjs';
2
- export { s as streamHtmlToMarkdown } from './shared/mdream.DZEl9tTZ.mjs';
3
- export { T as TagIdMap } from './shared/mdream.C8Xgmr_a.mjs';
1
+ import { TagIdMap$1 as TagIdMap, createPlugin$1 as createPlugin } from "./_chunks/plugin-DCJFRZej.mjs";
2
+ import { MarkdownProcessor$1 as MarkdownProcessor, createMarkdownProcessor, parseHtml$1 as parseHtml, streamHtmlToMarkdown$1 as streamHtmlToMarkdown } from "./_chunks/stream-BeojJNLt.mjs";
4
3
 
4
+ //#region src/index.ts
5
5
  function htmlToMarkdown(html, options = {}) {
6
- const state = {
7
- options
8
- };
9
- const result = processPartialHTMLToMarkdown(html, state).chunk;
10
- return result.trimEnd();
6
+ const processor = createMarkdownProcessor(options);
7
+ processor.processHtml(html);
8
+ return processor.getMarkdown();
11
9
  }
12
10
 
13
- export { htmlToMarkdown };
11
+ //#endregion
12
+ export { MarkdownProcessor, TagIdMap, createPlugin, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
@@ -1,12 +1,7 @@
1
- import { P as Plugin } from './shared/mdream.C0Qx0F7t.mjs';
2
- export { e as extractionPlugin } from './shared/mdream.DMe7T-0M.mjs';
1
+ import { Plugin, extractionPlugin$1 as extractionPlugin } from "./_chunks/types-BHoibuoP.mjs";
2
+ import { createPlugin } from "./_chunks/index-VTwTBxk0.mjs";
3
3
 
4
- /**
5
- * Create a plugin that implements the Plugin interface with improved type inference
6
- *
7
- * @returns A complete plugin implementation
8
- */
9
- declare function createPlugin<T extends Partial<Plugin>>(plugin: T): Plugin;
4
+ //#region src/plugins/filter.d.ts
10
5
 
11
6
  /**
12
7
  * Plugin that filters nodes based on CSS selectors.
@@ -21,29 +16,31 @@ declare function createPlugin<T extends Partial<Plugin>>(plugin: T): Plugin;
21
16
  * withQuerySelectorPlugin({ exclude: ['nav', '#sidebar', '.footer'] })
22
17
  */
23
18
  declare function filterPlugin(options?: {
24
- /** CSS selectors (or Tag Ids) for elements to include (all others will be excluded) */
25
- include?: (string | number)[];
26
- /** CSS selectors (or Tag Ids) for elements to exclude */
27
- exclude?: (string | number)[];
28
- /** Whether to also process the children of matching elements */
29
- processChildren?: boolean;
30
- keepAbsolute?: boolean;
19
+ /** CSS selectors (or Tag Ids) for elements to include (all others will be excluded) */
20
+ include?: (string | number)[];
21
+ /** CSS selectors (or Tag Ids) for elements to exclude */
22
+ exclude?: (string | number)[];
23
+ /** Whether to also process the children of matching elements */
24
+ processChildren?: boolean;
25
+ keepAbsolute?: boolean;
31
26
  }): Plugin;
32
-
27
+ //#endregion
28
+ //#region src/plugins/frontmatter.d.ts
33
29
  interface FrontmatterPluginOptions {
34
- /** Additional frontmatter fields to include */
35
- additionalFields?: Record<string, string>;
36
- /** Meta tag names to extract (beyond the standard ones) */
37
- metaFields?: string[];
38
- /** Custom formatter for frontmatter values */
39
- formatValue?: (name: string, value: string) => string;
30
+ /** Additional frontmatter fields to include */
31
+ additionalFields?: Record<string, string>;
32
+ /** Meta tag names to extract (beyond the standard ones) */
33
+ metaFields?: string[];
34
+ /** Custom formatter for frontmatter values */
35
+ formatValue?: (name: string, value: string) => string;
40
36
  }
41
37
  /**
42
38
  * A plugin that manages frontmatter generation from HTML head elements
43
39
  * Extracts metadata from meta tags and title and generates YAML frontmatter
44
40
  */
45
41
  declare function frontmatterPlugin(options?: FrontmatterPluginOptions): Plugin;
46
-
42
+ //#endregion
43
+ //#region src/plugins/isolate-main.d.ts
47
44
  /**
48
45
  * Plugin that isolates main content using the following priority order:
49
46
  * 1. If an explicit <main> element exists (within 5 depth levels), use its content exclusively
@@ -74,16 +71,18 @@ declare function frontmatterPlugin(options?: FrontmatterPluginOptions): Plugin;
74
71
  * ```
75
72
  */
76
73
  declare function isolateMainPlugin(): Plugin;
77
-
74
+ //#endregion
75
+ //#region src/plugins/readability.d.ts
78
76
  /**
79
77
  * Creates a plugin that implements readability.js style heuristics for content quality assessment
80
78
  * Controls content inclusion/exclusion using buffer regions
81
79
  */
82
80
  declare function readabilityPlugin(): Plugin;
83
-
81
+ //#endregion
82
+ //#region src/plugins/tailwind.d.ts
84
83
  /**
85
84
  * Creates a plugin that adds Tailwind class processing
86
85
  */
87
86
  declare function tailwindPlugin(): Plugin;
88
-
89
- export { createPlugin, filterPlugin, frontmatterPlugin, isolateMainPlugin, readabilityPlugin, tailwindPlugin };
87
+ //#endregion
88
+ export { createPlugin, extractionPlugin, filterPlugin, frontmatterPlugin, isolateMainPlugin, readabilityPlugin, tailwindPlugin };
package/dist/plugins.mjs CHANGED
@@ -1,46 +1,5 @@
1
- import { c as createPlugin } from './shared/mdream.CNrwlePY.mjs';
2
- export { f as frontmatterPlugin } from './shared/mdream.CNrwlePY.mjs';
3
- import { p as parseSelector } from './shared/mdream.Crxe0Sar.mjs';
4
- export { f as filterPlugin, i as isolateMainPlugin, t as tailwindPlugin } from './shared/mdream.Crxe0Sar.mjs';
5
- export { r as readabilityPlugin } from './shared/mdream.VU-fHLcf.mjs';
6
- import './shared/mdream.C8Xgmr_a.mjs';
1
+ import { createPlugin$1 as createPlugin } from "./_chunks/plugin-DCJFRZej.mjs";
2
+ import { frontmatterPlugin, readabilityPlugin } from "./_chunks/readability-BfCjcbbx.mjs";
3
+ import { extractionPlugin, filterPlugin, isolateMainPlugin, tailwindPlugin } from "./_chunks/plugins-DGakgpSl.mjs";
7
4
 
8
- function extractionPlugin(selectors) {
9
- const matcherCallbacks = Object.entries(selectors).map(([selector, callback]) => ({
10
- matcher: parseSelector(selector),
11
- callback
12
- }));
13
- const trackedElements = /* @__PURE__ */ new Map();
14
- return createPlugin({
15
- onNodeEnter(element) {
16
- matcherCallbacks.forEach(({ matcher, callback }) => {
17
- if (matcher.matches(element)) {
18
- trackedElements.set(element, { textContent: "", callback });
19
- }
20
- });
21
- },
22
- processTextNode(textNode) {
23
- let currentParent = textNode.parent;
24
- while (currentParent) {
25
- const tracked = trackedElements.get(currentParent);
26
- if (tracked) {
27
- tracked.textContent += textNode.value;
28
- }
29
- currentParent = currentParent.parent;
30
- }
31
- },
32
- onNodeExit(element, state) {
33
- const tracked = trackedElements.get(element);
34
- if (tracked) {
35
- const extractedElement = {
36
- ...element,
37
- textContent: tracked.textContent.trim()
38
- };
39
- tracked.callback(extractedElement, state);
40
- trackedElements.delete(element);
41
- }
42
- }
43
- });
44
- }
45
-
46
- export { createPlugin, extractionPlugin };
5
+ export { createPlugin, extractionPlugin, filterPlugin, frontmatterPlugin, isolateMainPlugin, readabilityPlugin, tailwindPlugin };
@@ -1,4 +1,6 @@
1
- import { H as HTMLToMarkdownOptions } from '../shared/mdream.C0Qx0F7t.mjs';
1
+ import { HTMLToMarkdownOptions } from "../_chunks/types-BHoibuoP.mjs";
2
+
3
+ //#region src/preset/minimal.d.ts
2
4
 
3
5
  /**
4
6
  * Creates a configurable minimal preset with advanced options
@@ -7,5 +9,5 @@ import { H as HTMLToMarkdownOptions } from '../shared/mdream.C0Qx0F7t.mjs';
7
9
  * @returns HTML to Markdown options with configured plugins
8
10
  */
9
11
  declare function withMinimalPreset(options?: HTMLToMarkdownOptions): HTMLToMarkdownOptions;
10
-
11
- export { withMinimalPreset };
12
+ //#endregion
13
+ export { withMinimalPreset };
@@ -1,39 +1,41 @@
1
- import { aa as TAG_FORM, W as TAG_FIELDSET, b2 as TAG_OBJECT, a5 as TAG_EMBED, b6 as TAG_FIGURE, ab as TAG_FOOTER, v as TAG_ASIDE, Q as TAG_IFRAME, a4 as TAG_INPUT, Y as TAG_TEXTAREA, Z as TAG_SELECT, ai as TAG_BUTTON, ak as TAG_NAV } from '../shared/mdream.C8Xgmr_a.mjs';
2
- import { i as isolateMainPlugin, t as tailwindPlugin, f as filterPlugin } from '../shared/mdream.Crxe0Sar.mjs';
3
- import { f as frontmatterPlugin } from '../shared/mdream.CNrwlePY.mjs';
1
+ import { TAG_ASIDE, TAG_BUTTON, TAG_EMBED, TAG_FIELDSET, TAG_FIGURE, TAG_FOOTER, TAG_FORM, TAG_IFRAME, TAG_INPUT, TAG_NAV, TAG_OBJECT, TAG_SELECT, TAG_TEXTAREA } from "../_chunks/plugin-DCJFRZej.mjs";
2
+ import { frontmatterPlugin } from "../_chunks/readability-BfCjcbbx.mjs";
3
+ import { filterPlugin, isolateMainPlugin, tailwindPlugin } from "../_chunks/plugins-DGakgpSl.mjs";
4
4
 
5
+ //#region src/preset/minimal.ts
6
+ /**
7
+ * Creates a configurable minimal preset with advanced options
8
+ *
9
+ * @param options HTML to Markdown options
10
+ * @returns HTML to Markdown options with configured plugins
11
+ */
5
12
  function withMinimalPreset(options = {}) {
6
- const plugins = [
7
- isolateMainPlugin(),
8
- frontmatterPlugin(),
9
- tailwindPlugin(),
10
- // First apply readability plugin to extract main content
11
- // Then filter out unwanted tags
12
- filterPlugin({
13
- exclude: [
14
- TAG_FORM,
15
- TAG_FIELDSET,
16
- TAG_OBJECT,
17
- TAG_EMBED,
18
- TAG_FIGURE,
19
- TAG_FOOTER,
20
- TAG_ASIDE,
21
- TAG_IFRAME,
22
- TAG_INPUT,
23
- TAG_TEXTAREA,
24
- TAG_SELECT,
25
- TAG_BUTTON,
26
- TAG_NAV
27
- ]
28
- })
29
- ];
30
- if (options.plugins) {
31
- plugins.push(...options.plugins);
32
- }
33
- return {
34
- ...options,
35
- plugins
36
- };
13
+ const plugins = [
14
+ isolateMainPlugin(),
15
+ frontmatterPlugin(),
16
+ tailwindPlugin(),
17
+ filterPlugin({ exclude: [
18
+ TAG_FORM,
19
+ TAG_FIELDSET,
20
+ TAG_OBJECT,
21
+ TAG_EMBED,
22
+ TAG_FIGURE,
23
+ TAG_FOOTER,
24
+ TAG_ASIDE,
25
+ TAG_IFRAME,
26
+ TAG_INPUT,
27
+ TAG_TEXTAREA,
28
+ TAG_SELECT,
29
+ TAG_BUTTON,
30
+ TAG_NAV
31
+ ] })
32
+ ];
33
+ if (options.plugins) plugins.push(...options.plugins);
34
+ return {
35
+ ...options,
36
+ plugins
37
+ };
37
38
  }
38
39
 
39
- export { withMinimalPreset };
40
+ //#endregion
41
+ export { withMinimalPreset };
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "mdream",
3
3
  "type": "module",
4
- "version": "0.2.8",
4
+ "version": "0.3.0",
5
5
  "description": "Ultra-performant JavaScript HTML to Markdown converter optimized for LLMs.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -29,13 +29,14 @@
29
29
  "cac": "^6.7.14"
30
30
  },
31
31
  "devDependencies": {
32
- "@antfu/eslint-config": "^4.16.1",
33
- "@types/node": "^22.15.34",
32
+ "@antfu/eslint-config": "^4.16.2",
33
+ "@types/node": "^24.0.10",
34
34
  "bumpp": "^10.2.0",
35
- "crawlee": "^3.13.8",
36
- "eslint": "^9.30.0",
35
+ "crawlee": "^3.13.9",
36
+ "eslint": "^9.30.1",
37
37
  "llm-cost": "^1.0.5",
38
- "playwright": "^1.53.1",
38
+ "obuild": "^0.2.1",
39
+ "playwright": "^1.53.2",
39
40
  "typescript": "5.8.3",
40
41
  "unbuild": "^3.5.0",
41
42
  "vitest": "^3.2.4"
@@ -53,9 +54,9 @@
53
54
  "test:github:file": "cat test/fixtures/github-markdown-complete.html | node ./bin/mdream.mjs --origin https://docs.github.com | tee test/github-markdown.md",
54
55
  "test:wiki:file": "pnpm build && cat test/fixtures/wikipedia-largest.html | node ./bin/mdream.mjs --origin https://en.wikipedia.org | tee test/wiki-markdown.md",
55
56
  "test:wiki-small:file": "cat test/fixtures/wikipedia-small.html | node ./bin/mdream.mjs --origin https://en.wikipedia.org | tee test/wiki-markdown.md",
56
- "build": "unbuild",
57
+ "build": "obuild",
57
58
  "typecheck": "tsc --noEmit src/index.ts",
58
- "dev:prepare": "unbuild --stub",
59
+ "dev:prepare": "obuild --stub",
59
60
  "test": "vitest test",
60
61
  "release": "pnpm build && bumpp && pnpm -r publish"
61
62
  }
package/dist/cli.d.ts DELETED
@@ -1,2 +0,0 @@
1
-
2
- export { };
package/dist/index.d.ts DELETED
@@ -1,18 +0,0 @@
1
- import { H as HTMLToMarkdownOptions } from './shared/mdream.C0Qx0F7t.js';
2
- export { B as BufferRegion, b as ELEMENT_NODE, E as ElementNode, f as HandlerContext, d as MdreamProcessingState, M as MdreamRuntimeState, N as Node, e as NodeEvent, P as Plugin, a as PluginCreationOptions, T as TEXT_NODE, g as TagHandler, c as TextNode } from './shared/mdream.C0Qx0F7t.js';
3
- import { ReadableStream } from 'node:stream/web';
4
- export { E as ExtractedElement } from './shared/mdream.BFdDSM96.js';
5
-
6
- declare const TagIdMap: Record<string, number>;
7
-
8
- /**
9
- * Creates a markdown stream from an HTML stream
10
- * @param htmlStream - ReadableStream of HTML content (as Uint8Array or string)
11
- * @param options - Configuration options for conversion
12
- * @returns An async generator yielding markdown chunks
13
- */
14
- declare function streamHtmlToMarkdown(htmlStream: ReadableStream | null, options?: HTMLToMarkdownOptions): AsyncIterable<string>;
15
-
16
- declare function htmlToMarkdown(html: string, options?: HTMLToMarkdownOptions): string;
17
-
18
- export { HTMLToMarkdownOptions, TagIdMap, htmlToMarkdown, streamHtmlToMarkdown };