mdream 0.2.8 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/_chunks/index-VTwTBxk0.d.mts +58 -0
- package/dist/_chunks/plugin-DCJFRZej.mjs +299 -0
- package/dist/_chunks/plugins-DGakgpSl.mjs +582 -0
- package/dist/_chunks/readability-BfCjcbbx.mjs +271 -0
- package/dist/_chunks/stream-BeojJNLt.mjs +1409 -0
- package/dist/_chunks/types-BHoibuoP.d.mts +234 -0
- package/dist/cli.d.mts +1 -2
- package/dist/cli.mjs +16 -18
- package/dist/index.d.mts +3 -18
- package/dist/index.mjs +8 -9
- package/dist/plugins.d.mts +26 -27
- package/dist/plugins.mjs +4 -45
- package/dist/preset/minimal.d.mts +5 -3
- package/dist/preset/minimal.mjs +37 -35
- package/package.json +9 -8
- package/dist/cli.d.ts +0 -2
- package/dist/index.d.ts +0 -18
- package/dist/plugins.d.ts +0 -89
- package/dist/preset/minimal.d.ts +0 -11
- package/dist/shared/mdream.BFdDSM96.d.ts +0 -9
- package/dist/shared/mdream.C0Qx0F7t.d.mts +0 -226
- package/dist/shared/mdream.C0Qx0F7t.d.ts +0 -226
- package/dist/shared/mdream.C8Xgmr_a.mjs +0 -280
- package/dist/shared/mdream.CNrwlePY.mjs +0 -105
- package/dist/shared/mdream.Crxe0Sar.mjs +0 -501
- package/dist/shared/mdream.DMe7T-0M.d.mts +0 -9
- package/dist/shared/mdream.DZEl9tTZ.mjs +0 -1475
- package/dist/shared/mdream.VU-fHLcf.mjs +0 -291
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
//#region src/plugins/extraction.d.ts
|
|
2
|
+
interface ExtractedElement extends ElementNode {
|
|
3
|
+
textContent: string;
|
|
4
|
+
}
|
|
5
|
+
declare function extractionPlugin(selectors: Record<string, (element: ExtractedElement, state: MdreamRuntimeState) => void>): Plugin;
|
|
6
|
+
//#endregion
|
|
7
|
+
//#region src/types.d.ts
|
|
8
|
+
/**
|
|
9
|
+
* Plugin interface for extending HTML to Markdown conversion
|
|
10
|
+
*/
|
|
11
|
+
interface Plugin {
|
|
12
|
+
/**
|
|
13
|
+
* Process a node before it's handled by the parser
|
|
14
|
+
*/
|
|
15
|
+
beforeNodeProcess?: (event: NodeEvent, state: MdreamRuntimeState) => undefined | void | {
|
|
16
|
+
skip: boolean;
|
|
17
|
+
};
|
|
18
|
+
/**
|
|
19
|
+
* Hook that runs when entering a node
|
|
20
|
+
* @returns String to add to the output, or undefined
|
|
21
|
+
*/
|
|
22
|
+
onNodeEnter?: (node: ElementNode, state: MdreamRuntimeState) => string | undefined | void;
|
|
23
|
+
/**
|
|
24
|
+
* Hook that runs when exiting a node
|
|
25
|
+
* @param node - The element node being exited
|
|
26
|
+
* @param state - The current runtime state
|
|
27
|
+
* @returns String to add to the output, or undefined
|
|
28
|
+
*/
|
|
29
|
+
onNodeExit?: (node: ElementNode, state: MdreamRuntimeState) => string | undefined | void;
|
|
30
|
+
/**
|
|
31
|
+
* Process attributes for a node
|
|
32
|
+
* @param node - The node to process attributes for
|
|
33
|
+
* @param state - The current runtime state
|
|
34
|
+
*/
|
|
35
|
+
processAttributes?: (node: ElementNode, state: MdreamRuntimeState) => void;
|
|
36
|
+
/**
|
|
37
|
+
* Process a text node before it's added to the output
|
|
38
|
+
* @param node - The text node to process
|
|
39
|
+
* @param state - The current runtime state
|
|
40
|
+
* @returns Nothing, or an object with the text `content` and a `skip` flag
|
|
41
|
+
*/
|
|
42
|
+
processTextNode?: (node: TextNode, state: MdreamRuntimeState) => undefined | void | {
|
|
43
|
+
content: string;
|
|
44
|
+
skip: boolean;
|
|
45
|
+
};
|
|
46
|
+
}
|
|
47
|
+
/**
|
|
48
|
+
* Plugin creation options for controlling plugin behavior
|
|
49
|
+
*/
|
|
50
|
+
interface PluginCreationOptions {
|
|
51
|
+
/**
|
|
52
|
+
* Order in which plugins are executed
|
|
53
|
+
* Lower numbers run first
|
|
54
|
+
*/
|
|
55
|
+
order?: number;
|
|
56
|
+
/**
|
|
57
|
+
* Priority for region conflict resolution
|
|
58
|
+
* Higher numbers take precedence over lower
|
|
59
|
+
*/
|
|
60
|
+
priority?: number;
|
|
61
|
+
}
|
|
62
|
+
interface HTMLToMarkdownOptions {
|
|
63
|
+
/**
|
|
64
|
+
* Origin URL for resolving relative image paths and internal links.
|
|
65
|
+
* Important when converting HTML with relative paths from a specific website.
|
|
66
|
+
*/
|
|
67
|
+
origin?: string;
|
|
68
|
+
/**
|
|
69
|
+
* Plugins to extend HTML to Markdown conversion
|
|
70
|
+
*/
|
|
71
|
+
plugins?: Plugin[];
|
|
72
|
+
}
|
|
73
|
+
declare const ELEMENT_NODE = 1;
|
|
74
|
+
declare const TEXT_NODE = 3;
|
|
75
|
+
interface ElementNode extends Node {
|
|
76
|
+
/** Element tag name (for ELEMENT_NODE) */
|
|
77
|
+
name: string;
|
|
78
|
+
/** HTML attributes (for ELEMENT_NODE) */
|
|
79
|
+
attributes: Record<string, string>;
|
|
80
|
+
/** Custom data added by plugins */
|
|
81
|
+
context?: Record<string, any>;
|
|
82
|
+
/** ID of the tag for fast handler lookup */
|
|
83
|
+
tagId?: number;
|
|
84
|
+
/** Map of tag names to their nesting count (using Uint8Array for performance) */
|
|
85
|
+
depthMap: Uint8Array;
|
|
86
|
+
/** Plugin outputs collected during processing */
|
|
87
|
+
pluginOutput?: string[];
|
|
88
|
+
}
|
|
89
|
+
interface TextNode extends Node {
|
|
90
|
+
/** Text content (for TEXT_NODE) */
|
|
91
|
+
value: string;
|
|
92
|
+
/** Custom data added by plugins */
|
|
93
|
+
context?: Record<string, any>;
|
|
94
|
+
/** Whether this text node should be excluded from markdown output (for script/style elements) */
|
|
95
|
+
excludedFromMarkdown?: boolean;
|
|
96
|
+
}
|
|
97
|
+
/**
|
|
98
|
+
* Base DOM node interface
|
|
99
|
+
* Optimized for streaming HTML parsing with minimal memory footprint
|
|
100
|
+
*/
|
|
101
|
+
interface Node {
|
|
102
|
+
/** Node type (ELEMENT_NODE or TEXT_NODE) */
|
|
103
|
+
type: number;
|
|
104
|
+
/** Current nesting depth in the DOM tree */
|
|
105
|
+
depth: number;
|
|
106
|
+
/** Node exclusion and filtering now handled by plugins */
|
|
107
|
+
/** Index of this node within its parent's children */
|
|
108
|
+
index: number;
|
|
109
|
+
/** Current walk index for child traversal during streaming */
|
|
110
|
+
currentWalkIndex?: number;
|
|
111
|
+
/** Count of text child nodes - used for whitespace handling */
|
|
112
|
+
childTextNodeIndex?: number;
|
|
113
|
+
/** Whether node contains whitespace - used for whitespace optimization */
|
|
114
|
+
containsWhitespace?: boolean;
|
|
115
|
+
/** Cached reference to tag handler for performance */
|
|
116
|
+
tagHandler?: TagHandler;
|
|
117
|
+
/** Parent node */
|
|
118
|
+
parent?: ElementNode | null;
|
|
119
|
+
/** Custom data added by plugins */
|
|
120
|
+
context?: Record<string, any>;
|
|
121
|
+
/** Region ID for buffer region tracking */
|
|
122
|
+
regionId?: number;
|
|
123
|
+
}
|
|
124
|
+
/**
|
|
125
|
+
* Buffer region for tracking content inclusion/exclusion
|
|
126
|
+
*/
|
|
127
|
+
interface BufferRegion {
|
|
128
|
+
/** Unique identifier */
|
|
129
|
+
id: number;
|
|
130
|
+
/** Inclusion state */
|
|
131
|
+
include: boolean;
|
|
132
|
+
}
|
|
133
|
+
/**
|
|
134
|
+
* State interface for HTML parsing and processing
|
|
135
|
+
* Contains parsing state that's maintained during HTML traversal
|
|
136
|
+
*/
|
|
137
|
+
interface MdreamProcessingState {
|
|
138
|
+
/** Map of tag names to their current nesting depth - uses TypedArray for performance */
|
|
139
|
+
depthMap: Uint8Array;
|
|
140
|
+
/** Current overall nesting depth */
|
|
141
|
+
depth: number;
|
|
142
|
+
/** Currently processing element node */
|
|
143
|
+
currentNode?: ElementNode | null;
|
|
144
|
+
/** Node filtering and exclusion is now handled by plugins */
|
|
145
|
+
/** Whether current content contains HTML entities that need decoding */
|
|
146
|
+
hasEncodedHtmlEntity?: boolean;
|
|
147
|
+
/** Whether the last processed character was whitespace - for collapsing whitespace */
|
|
148
|
+
lastCharWasWhitespace?: boolean;
|
|
149
|
+
/** Whether the last processed buffer has whitespace - optimization flag */
|
|
150
|
+
textBufferContainsWhitespace?: boolean;
|
|
151
|
+
/** Whether the last processed buffer contains non-whitespace characters */
|
|
152
|
+
textBufferContainsNonWhitespace?: boolean;
|
|
153
|
+
/** Whether a tag was just closed - affects whitespace handling */
|
|
154
|
+
justClosedTag?: boolean;
|
|
155
|
+
/** Whether the next text node is the first in its element - for whitespace trimming */
|
|
156
|
+
isFirstTextInElement?: boolean;
|
|
157
|
+
/** Reference to the last processed text node - for context tracking */
|
|
158
|
+
lastTextNode?: Node;
|
|
159
|
+
/** Quote state tracking for non-nesting tags - avoids backward scanning */
|
|
160
|
+
inSingleQuote?: boolean;
|
|
161
|
+
inDoubleQuote?: boolean;
|
|
162
|
+
inBacktick?: boolean;
|
|
163
|
+
/** Backslash escaping state tracking - avoids checking previous character */
|
|
164
|
+
lastCharWasBackslash?: boolean;
|
|
165
|
+
/** Plugin instances array for efficient iteration */
|
|
166
|
+
plugins?: Plugin[];
|
|
167
|
+
/** Configuration options for conversion */
|
|
168
|
+
options?: HTMLToMarkdownOptions;
|
|
169
|
+
}
|
|
170
|
+
/**
|
|
171
|
+
* Runtime state for markdown generation
|
|
172
|
+
* Extended state that includes output tracking and options
|
|
173
|
+
*/
|
|
174
|
+
interface MdreamRuntimeState extends Partial<MdreamProcessingState> {
|
|
175
|
+
/** Number of newlines at end of most recent output */
|
|
176
|
+
lastNewLines?: number;
|
|
177
|
+
/** Configuration options for conversion */
|
|
178
|
+
options?: HTMLToMarkdownOptions;
|
|
179
|
+
/** Table processing state - specialized for Markdown tables */
|
|
180
|
+
tableRenderedTable?: boolean;
|
|
181
|
+
tableCurrentRowCells?: number;
|
|
182
|
+
tableColumnAlignments?: string[];
|
|
183
|
+
/** Plugin instances array for efficient iteration */
|
|
184
|
+
plugins?: Plugin[];
|
|
185
|
+
/** Map of region IDs to their boolean inclusion toggle for O(1) lookups */
|
|
186
|
+
regionToggles: Map<number, boolean>;
|
|
187
|
+
/** Content buffers for regions */
|
|
188
|
+
regionContentBuffers: Map<number, string[]>;
|
|
189
|
+
/** Performance cache for last content to avoid iteration */
|
|
190
|
+
lastContentCache?: string;
|
|
191
|
+
/** Reference to the last processed node */
|
|
192
|
+
lastNode?: Node;
|
|
193
|
+
context?: Record<string, any>;
|
|
194
|
+
}
|
|
195
|
+
type NodeEventEnter = 0;
|
|
196
|
+
type NodeEventExit = 1;
|
|
197
|
+
/**
|
|
198
|
+
* Node event for DOM traversal
|
|
199
|
+
* Used in the event-based traversal system for streaming processing
|
|
200
|
+
*/
|
|
201
|
+
interface NodeEvent {
|
|
202
|
+
/** Event type - enter (start tag) or exit (end tag) */
|
|
203
|
+
type: NodeEventEnter | NodeEventExit;
|
|
204
|
+
/** The node being processed */
|
|
205
|
+
node: Node;
|
|
206
|
+
}
|
|
207
|
+
/**
|
|
208
|
+
* Handler context for markdown conversion
|
|
209
|
+
* Passed to tag handler functions for converting specific elements
|
|
210
|
+
*/
|
|
211
|
+
interface HandlerContext {
|
|
212
|
+
/** Current node being processed */
|
|
213
|
+
node: ElementNode;
|
|
214
|
+
/** Parent node (if any) */
|
|
215
|
+
parent?: ElementNode;
|
|
216
|
+
/** Runtime state */
|
|
217
|
+
state: MdreamRuntimeState;
|
|
218
|
+
}
|
|
219
|
+
/**
|
|
220
|
+
* Tag handler interface for HTML elements
|
|
221
|
+
* Used by plugins to extend or customize tag handling
|
|
222
|
+
*/
|
|
223
|
+
interface TagHandler {
|
|
224
|
+
enter?: (context: HandlerContext) => string | undefined | void;
|
|
225
|
+
exit?: (context: HandlerContext) => string | undefined | void;
|
|
226
|
+
isSelfClosing?: boolean;
|
|
227
|
+
isNonNesting?: boolean;
|
|
228
|
+
collapsesInnerWhiteSpace?: boolean;
|
|
229
|
+
isInline?: boolean;
|
|
230
|
+
spacing?: readonly [number, number];
|
|
231
|
+
excludesTextNodes?: boolean;
|
|
232
|
+
}
|
|
233
|
+
//#endregion
|
|
234
|
+
export { BufferRegion, ELEMENT_NODE as ELEMENT_NODE$1, ElementNode, ExtractedElement, HTMLToMarkdownOptions, HandlerContext, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginCreationOptions, TEXT_NODE as TEXT_NODE$1, TagHandler, TextNode, extractionPlugin as extractionPlugin$1 };
|
package/dist/cli.d.mts
CHANGED
|
@@ -1,2 +1 @@
|
|
|
1
|
-
|
|
2
|
-
export { };
|
|
1
|
+
export { };
|
package/dist/cli.mjs
CHANGED
|
@@ -1,25 +1,23 @@
|
|
|
1
|
-
import
|
|
2
|
-
import {
|
|
3
|
-
import {
|
|
4
|
-
import {
|
|
5
|
-
import {
|
|
6
|
-
import './shared/mdream.C8Xgmr_a.mjs';
|
|
1
|
+
import "./_chunks/plugin-DCJFRZej.mjs";
|
|
2
|
+
import { streamHtmlToMarkdown$1 as streamHtmlToMarkdown } from "./_chunks/stream-BeojJNLt.mjs";
|
|
3
|
+
import { frontmatterPlugin, readabilityPlugin } from "./_chunks/readability-BfCjcbbx.mjs";
|
|
4
|
+
import { Readable } from "node:stream";
|
|
5
|
+
import { cac } from "cac";
|
|
7
6
|
|
|
7
|
+
//#region src/cli.ts
|
|
8
8
|
async function streamingConvert(options = {}) {
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
if (markdownChunk && markdownChunk.length > 0) {
|
|
17
|
-
outputStream.write(markdownChunk);
|
|
18
|
-
}
|
|
19
|
-
}
|
|
9
|
+
const outputStream = process.stdout;
|
|
10
|
+
const conversionOptions = { origin: options.origin };
|
|
11
|
+
conversionOptions.plugins = conversionOptions.plugins || [];
|
|
12
|
+
conversionOptions.plugins.push(readabilityPlugin());
|
|
13
|
+
conversionOptions.plugins.push(frontmatterPlugin());
|
|
14
|
+
const markdownGenerator = streamHtmlToMarkdown(Readable.toWeb(process.stdin), conversionOptions);
|
|
15
|
+
for await (const markdownChunk of markdownGenerator) if (markdownChunk && markdownChunk.length > 0) outputStream.write(markdownChunk);
|
|
20
16
|
}
|
|
21
17
|
const cli = cac();
|
|
22
18
|
cli.command("[options]", "Convert HTML from stdin to Markdown on stdout").option("--origin <url>", "Origin URL for resolving relative image paths").option("--preset <preset>", "Conversion presets: minimal").action(async (_, opts) => {
|
|
23
|
-
|
|
19
|
+
await streamingConvert(opts);
|
|
24
20
|
});
|
|
25
21
|
cli.help().version("1.0.0").parse();
|
|
22
|
+
|
|
23
|
+
//#endregion
|
package/dist/index.d.mts
CHANGED
|
@@ -1,18 +1,3 @@
|
|
|
1
|
-
import {
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
export { E as ExtractedElement } from './shared/mdream.DMe7T-0M.mjs';
|
|
5
|
-
|
|
6
|
-
declare const TagIdMap: Record<string, number>;
|
|
7
|
-
|
|
8
|
-
/**
|
|
9
|
-
* Creates a markdown stream from an HTML stream
|
|
10
|
-
* @param htmlStream - ReadableStream of HTML content (as Uint8Array or string)
|
|
11
|
-
* @param options - Configuration options for conversion
|
|
12
|
-
* @returns An async generator yielding markdown chunks
|
|
13
|
-
*/
|
|
14
|
-
declare function streamHtmlToMarkdown(htmlStream: ReadableStream | null, options?: HTMLToMarkdownOptions): AsyncIterable<string>;
|
|
15
|
-
|
|
16
|
-
declare function htmlToMarkdown(html: string, options?: HTMLToMarkdownOptions): string;
|
|
17
|
-
|
|
18
|
-
export { HTMLToMarkdownOptions, TagIdMap, htmlToMarkdown, streamHtmlToMarkdown };
|
|
1
|
+
import { BufferRegion, ELEMENT_NODE$1 as ELEMENT_NODE, ElementNode, ExtractedElement, HTMLToMarkdownOptions, HandlerContext, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginCreationOptions, TEXT_NODE$1 as TEXT_NODE, TagHandler, TextNode } from "./_chunks/types-BHoibuoP.mjs";
|
|
2
|
+
import { MarkdownProcessor, TagIdMap, createPlugin, htmlToMarkdown, parseHtml, streamHtmlToMarkdown } from "./_chunks/index-VTwTBxk0.mjs";
|
|
3
|
+
export { BufferRegion, ELEMENT_NODE, ElementNode, ExtractedElement, HTMLToMarkdownOptions, HandlerContext, MarkdownProcessor, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginCreationOptions, TEXT_NODE, TagHandler, TagIdMap, TextNode, createPlugin, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
|
package/dist/index.mjs
CHANGED
|
@@ -1,13 +1,12 @@
|
|
|
1
|
-
import {
|
|
2
|
-
|
|
3
|
-
export { T as TagIdMap } from './shared/mdream.C8Xgmr_a.mjs';
|
|
1
|
+
import { TagIdMap$1 as TagIdMap, createPlugin$1 as createPlugin } from "./_chunks/plugin-DCJFRZej.mjs";
|
|
2
|
+
import { MarkdownProcessor$1 as MarkdownProcessor, createMarkdownProcessor, parseHtml$1 as parseHtml, streamHtmlToMarkdown$1 as streamHtmlToMarkdown } from "./_chunks/stream-BeojJNLt.mjs";
|
|
4
3
|
|
|
4
|
+
//#region src/index.ts
|
|
5
5
|
function htmlToMarkdown(html, options = {}) {
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
const result = processPartialHTMLToMarkdown(html, state).chunk;
|
|
10
|
-
return result.trimEnd();
|
|
6
|
+
const processor = createMarkdownProcessor(options);
|
|
7
|
+
processor.processHtml(html);
|
|
8
|
+
return processor.getMarkdown();
|
|
11
9
|
}
|
|
12
10
|
|
|
13
|
-
|
|
11
|
+
//#endregion
|
|
12
|
+
export { MarkdownProcessor, TagIdMap, createPlugin, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
|
package/dist/plugins.d.mts
CHANGED
|
@@ -1,12 +1,7 @@
|
|
|
1
|
-
import {
|
|
2
|
-
|
|
1
|
+
import { Plugin, extractionPlugin$1 as extractionPlugin } from "./_chunks/types-BHoibuoP.mjs";
|
|
2
|
+
import { createPlugin } from "./_chunks/index-VTwTBxk0.mjs";
|
|
3
3
|
|
|
4
|
-
|
|
5
|
-
* Create a plugin that implements the Plugin interface with improved type inference
|
|
6
|
-
*
|
|
7
|
-
* @returns A complete plugin implementation
|
|
8
|
-
*/
|
|
9
|
-
declare function createPlugin<T extends Partial<Plugin>>(plugin: T): Plugin;
|
|
4
|
+
//#region src/plugins/filter.d.ts
|
|
10
5
|
|
|
11
6
|
/**
|
|
12
7
|
* Plugin that filters nodes based on CSS selectors.
|
|
@@ -21,29 +16,31 @@ declare function createPlugin<T extends Partial<Plugin>>(plugin: T): Plugin;
|
|
|
21
16
|
* filterPlugin({ exclude: ['nav', '#sidebar', '.footer'] })
|
|
22
17
|
*/
|
|
23
18
|
declare function filterPlugin(options?: {
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
19
|
+
/** CSS selectors (or Tag Ids) for elements to include (all others will be excluded) */
|
|
20
|
+
include?: (string | number)[];
|
|
21
|
+
/** CSS selectors (or Tag Ids) for elements to exclude */
|
|
22
|
+
exclude?: (string | number)[];
|
|
23
|
+
/** Whether to also process the children of matching elements */
|
|
24
|
+
processChildren?: boolean;
|
|
25
|
+
keepAbsolute?: boolean;
|
|
31
26
|
}): Plugin;
|
|
32
|
-
|
|
27
|
+
//#endregion
|
|
28
|
+
//#region src/plugins/frontmatter.d.ts
|
|
33
29
|
interface FrontmatterPluginOptions {
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
30
|
+
/** Additional frontmatter fields to include */
|
|
31
|
+
additionalFields?: Record<string, string>;
|
|
32
|
+
/** Meta tag names to extract (beyond the standard ones) */
|
|
33
|
+
metaFields?: string[];
|
|
34
|
+
/** Custom formatter for frontmatter values */
|
|
35
|
+
formatValue?: (name: string, value: string) => string;
|
|
40
36
|
}
|
|
41
37
|
/**
|
|
42
38
|
* A plugin that manages frontmatter generation from HTML head elements
|
|
43
39
|
* Extracts metadata from meta tags and title and generates YAML frontmatter
|
|
44
40
|
*/
|
|
45
41
|
declare function frontmatterPlugin(options?: FrontmatterPluginOptions): Plugin;
|
|
46
|
-
|
|
42
|
+
//#endregion
|
|
43
|
+
//#region src/plugins/isolate-main.d.ts
|
|
47
44
|
/**
|
|
48
45
|
* Plugin that isolates main content using the following priority order:
|
|
49
46
|
* 1. If an explicit <main> element exists (within 5 depth levels), use its content exclusively
|
|
@@ -74,16 +71,18 @@ declare function frontmatterPlugin(options?: FrontmatterPluginOptions): Plugin;
|
|
|
74
71
|
* ```
|
|
75
72
|
*/
|
|
76
73
|
declare function isolateMainPlugin(): Plugin;
|
|
77
|
-
|
|
74
|
+
//#endregion
|
|
75
|
+
//#region src/plugins/readability.d.ts
|
|
78
76
|
/**
|
|
79
77
|
* Creates a plugin that implements readability.js style heuristics for content quality assessment
|
|
80
78
|
* Controls content inclusion/exclusion using buffer regions
|
|
81
79
|
*/
|
|
82
80
|
declare function readabilityPlugin(): Plugin;
|
|
83
|
-
|
|
81
|
+
//#endregion
|
|
82
|
+
//#region src/plugins/tailwind.d.ts
|
|
84
83
|
/**
|
|
85
84
|
* Creates a plugin that adds Tailwind class processing
|
|
86
85
|
*/
|
|
87
86
|
declare function tailwindPlugin(): Plugin;
|
|
88
|
-
|
|
89
|
-
export { createPlugin, filterPlugin, frontmatterPlugin, isolateMainPlugin, readabilityPlugin, tailwindPlugin };
|
|
87
|
+
//#endregion
|
|
88
|
+
export { createPlugin, extractionPlugin, filterPlugin, frontmatterPlugin, isolateMainPlugin, readabilityPlugin, tailwindPlugin };
|
package/dist/plugins.mjs
CHANGED
|
@@ -1,46 +1,5 @@
|
|
|
1
|
-
import {
|
|
2
|
-
|
|
3
|
-
import {
|
|
4
|
-
export { f as filterPlugin, i as isolateMainPlugin, t as tailwindPlugin } from './shared/mdream.Crxe0Sar.mjs';
|
|
5
|
-
export { r as readabilityPlugin } from './shared/mdream.VU-fHLcf.mjs';
|
|
6
|
-
import './shared/mdream.C8Xgmr_a.mjs';
|
|
1
|
+
import { createPlugin$1 as createPlugin } from "./_chunks/plugin-DCJFRZej.mjs";
|
|
2
|
+
import { frontmatterPlugin, readabilityPlugin } from "./_chunks/readability-BfCjcbbx.mjs";
|
|
3
|
+
import { extractionPlugin, filterPlugin, isolateMainPlugin, tailwindPlugin } from "./_chunks/plugins-DGakgpSl.mjs";
|
|
7
4
|
|
|
8
|
-
|
|
9
|
-
const matcherCallbacks = Object.entries(selectors).map(([selector, callback]) => ({
|
|
10
|
-
matcher: parseSelector(selector),
|
|
11
|
-
callback
|
|
12
|
-
}));
|
|
13
|
-
const trackedElements = /* @__PURE__ */ new Map();
|
|
14
|
-
return createPlugin({
|
|
15
|
-
onNodeEnter(element) {
|
|
16
|
-
matcherCallbacks.forEach(({ matcher, callback }) => {
|
|
17
|
-
if (matcher.matches(element)) {
|
|
18
|
-
trackedElements.set(element, { textContent: "", callback });
|
|
19
|
-
}
|
|
20
|
-
});
|
|
21
|
-
},
|
|
22
|
-
processTextNode(textNode) {
|
|
23
|
-
let currentParent = textNode.parent;
|
|
24
|
-
while (currentParent) {
|
|
25
|
-
const tracked = trackedElements.get(currentParent);
|
|
26
|
-
if (tracked) {
|
|
27
|
-
tracked.textContent += textNode.value;
|
|
28
|
-
}
|
|
29
|
-
currentParent = currentParent.parent;
|
|
30
|
-
}
|
|
31
|
-
},
|
|
32
|
-
onNodeExit(element, state) {
|
|
33
|
-
const tracked = trackedElements.get(element);
|
|
34
|
-
if (tracked) {
|
|
35
|
-
const extractedElement = {
|
|
36
|
-
...element,
|
|
37
|
-
textContent: tracked.textContent.trim()
|
|
38
|
-
};
|
|
39
|
-
tracked.callback(extractedElement, state);
|
|
40
|
-
trackedElements.delete(element);
|
|
41
|
-
}
|
|
42
|
-
}
|
|
43
|
-
});
|
|
44
|
-
}
|
|
45
|
-
|
|
46
|
-
export { createPlugin, extractionPlugin };
|
|
5
|
+
export { createPlugin, extractionPlugin, filterPlugin, frontmatterPlugin, isolateMainPlugin, readabilityPlugin, tailwindPlugin };
|
|
@@ -1,4 +1,6 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { HTMLToMarkdownOptions } from "../_chunks/types-BHoibuoP.mjs";
|
|
2
|
+
|
|
3
|
+
//#region src/preset/minimal.d.ts
|
|
2
4
|
|
|
3
5
|
/**
|
|
4
6
|
* Creates a configurable minimal preset with advanced options
|
|
@@ -7,5 +9,5 @@ import { H as HTMLToMarkdownOptions } from '../shared/mdream.C0Qx0F7t.mjs';
|
|
|
7
9
|
* @returns HTML to Markdown options with configured plugins
|
|
8
10
|
*/
|
|
9
11
|
declare function withMinimalPreset(options?: HTMLToMarkdownOptions): HTMLToMarkdownOptions;
|
|
10
|
-
|
|
11
|
-
export { withMinimalPreset };
|
|
12
|
+
//#endregion
|
|
13
|
+
export { withMinimalPreset };
|
package/dist/preset/minimal.mjs
CHANGED
|
@@ -1,39 +1,41 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import {
|
|
3
|
-
import {
|
|
1
|
+
import { TAG_ASIDE, TAG_BUTTON, TAG_EMBED, TAG_FIELDSET, TAG_FIGURE, TAG_FOOTER, TAG_FORM, TAG_IFRAME, TAG_INPUT, TAG_NAV, TAG_OBJECT, TAG_SELECT, TAG_TEXTAREA } from "../_chunks/plugin-DCJFRZej.mjs";
|
|
2
|
+
import { frontmatterPlugin } from "../_chunks/readability-BfCjcbbx.mjs";
|
|
3
|
+
import { filterPlugin, isolateMainPlugin, tailwindPlugin } from "../_chunks/plugins-DGakgpSl.mjs";
|
|
4
4
|
|
|
5
|
+
//#region src/preset/minimal.ts
|
|
6
|
+
/**
|
|
7
|
+
* Creates a configurable minimal preset with advanced options
|
|
8
|
+
*
|
|
9
|
+
* @param options HTML to Markdown options
|
|
10
|
+
* @returns HTML to Markdown options with configured plugins
|
|
11
|
+
*/
|
|
5
12
|
function withMinimalPreset(options = {}) {
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
plugins.push(...options.plugins);
|
|
32
|
-
}
|
|
33
|
-
return {
|
|
34
|
-
...options,
|
|
35
|
-
plugins
|
|
36
|
-
};
|
|
13
|
+
const plugins = [
|
|
14
|
+
isolateMainPlugin(),
|
|
15
|
+
frontmatterPlugin(),
|
|
16
|
+
tailwindPlugin(),
|
|
17
|
+
filterPlugin({ exclude: [
|
|
18
|
+
TAG_FORM,
|
|
19
|
+
TAG_FIELDSET,
|
|
20
|
+
TAG_OBJECT,
|
|
21
|
+
TAG_EMBED,
|
|
22
|
+
TAG_FIGURE,
|
|
23
|
+
TAG_FOOTER,
|
|
24
|
+
TAG_ASIDE,
|
|
25
|
+
TAG_IFRAME,
|
|
26
|
+
TAG_INPUT,
|
|
27
|
+
TAG_TEXTAREA,
|
|
28
|
+
TAG_SELECT,
|
|
29
|
+
TAG_BUTTON,
|
|
30
|
+
TAG_NAV
|
|
31
|
+
] })
|
|
32
|
+
];
|
|
33
|
+
if (options.plugins) plugins.push(...options.plugins);
|
|
34
|
+
return {
|
|
35
|
+
...options,
|
|
36
|
+
plugins
|
|
37
|
+
};
|
|
37
38
|
}
|
|
38
39
|
|
|
39
|
-
|
|
40
|
+
//#endregion
|
|
41
|
+
export { withMinimalPreset };
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "mdream",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.2.8",
|
|
4
|
+
"version": "0.3.0",
|
|
5
5
|
"description": "Ultra-performant JavaScript HTML to Markdown converter optimized for LLMs.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -29,13 +29,14 @@
|
|
|
29
29
|
"cac": "^6.7.14"
|
|
30
30
|
},
|
|
31
31
|
"devDependencies": {
|
|
32
|
-
"@antfu/eslint-config": "^4.16.
|
|
33
|
-
"@types/node": "^
|
|
32
|
+
"@antfu/eslint-config": "^4.16.2",
|
|
33
|
+
"@types/node": "^24.0.10",
|
|
34
34
|
"bumpp": "^10.2.0",
|
|
35
|
-
"crawlee": "^3.13.
|
|
36
|
-
"eslint": "^9.30.
|
|
35
|
+
"crawlee": "^3.13.9",
|
|
36
|
+
"eslint": "^9.30.1",
|
|
37
37
|
"llm-cost": "^1.0.5",
|
|
38
|
-
"
|
|
38
|
+
"obuild": "^0.2.1",
|
|
39
|
+
"playwright": "^1.53.2",
|
|
39
40
|
"typescript": "5.8.3",
|
|
40
41
|
"unbuild": "^3.5.0",
|
|
41
42
|
"vitest": "^3.2.4"
|
|
@@ -53,9 +54,9 @@
|
|
|
53
54
|
"test:github:file": "cat test/fixtures/github-markdown-complete.html | node ./bin/mdream.mjs --origin https://docs.github.com | tee test/github-markdown.md",
|
|
54
55
|
"test:wiki:file": "pnpm build && cat test/fixtures/wikipedia-largest.html | node ./bin/mdream.mjs --origin https://en.wikipedia.org | tee test/wiki-markdown.md",
|
|
55
56
|
"test:wiki-small:file": "cat test/fixtures/wikipedia-small.html | node ./bin/mdream.mjs --origin https://en.wikipedia.org | tee test/wiki-markdown.md",
|
|
56
|
-
"build": "
|
|
57
|
+
"build": "obuild",
|
|
57
58
|
"typecheck": "tsc --noEmit src/index.ts",
|
|
58
|
-
"dev:prepare": "
|
|
59
|
+
"dev:prepare": "obuild --stub",
|
|
59
60
|
"test": "vitest test",
|
|
60
61
|
"release": "pnpm build && bumpp && pnpm -r publish"
|
|
61
62
|
}
|
package/dist/cli.d.ts
DELETED
package/dist/index.d.ts
DELETED
|
@@ -1,18 +0,0 @@
|
|
|
1
|
-
import { H as HTMLToMarkdownOptions } from './shared/mdream.C0Qx0F7t.js';
|
|
2
|
-
export { B as BufferRegion, b as ELEMENT_NODE, E as ElementNode, f as HandlerContext, d as MdreamProcessingState, M as MdreamRuntimeState, N as Node, e as NodeEvent, P as Plugin, a as PluginCreationOptions, T as TEXT_NODE, g as TagHandler, c as TextNode } from './shared/mdream.C0Qx0F7t.js';
|
|
3
|
-
import { ReadableStream } from 'node:stream/web';
|
|
4
|
-
export { E as ExtractedElement } from './shared/mdream.BFdDSM96.js';
|
|
5
|
-
|
|
6
|
-
declare const TagIdMap: Record<string, number>;
|
|
7
|
-
|
|
8
|
-
/**
|
|
9
|
-
* Creates a markdown stream from an HTML stream
|
|
10
|
-
* @param htmlStream - ReadableStream of HTML content (as Uint8Array or string)
|
|
11
|
-
* @param options - Configuration options for conversion
|
|
12
|
-
* @returns An async generator yielding markdown chunks
|
|
13
|
-
*/
|
|
14
|
-
declare function streamHtmlToMarkdown(htmlStream: ReadableStream | null, options?: HTMLToMarkdownOptions): AsyncIterable<string>;
|
|
15
|
-
|
|
16
|
-
declare function htmlToMarkdown(html: string, options?: HTMLToMarkdownOptions): string;
|
|
17
|
-
|
|
18
|
-
export { HTMLToMarkdownOptions, TagIdMap, htmlToMarkdown, streamHtmlToMarkdown };
|