mdream 0.2.8 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_chunks/minimal-Ru8PBNVI.mjs +40 -0
- package/dist/_chunks/plugin-Bqz9GKOA.mjs +299 -0
- package/dist/_chunks/plugin-D45YAMmt.d.mts +12 -0
- package/dist/_chunks/plugins-D305pIpW.mjs +844 -0
- package/dist/_chunks/stream-IeCVDuTy.mjs +1427 -0
- package/dist/_chunks/types-D9VKEbix.d.mts +255 -0
- package/dist/cli.d.mts +1 -2
- package/dist/cli.mjs +23 -19
- package/dist/index.d.mts +65 -8
- package/dist/index.mjs +8 -9
- package/dist/plugins.d.mts +26 -27
- package/dist/plugins.mjs +3 -45
- package/dist/preset/minimal.d.mts +5 -3
- package/dist/preset/minimal.mjs +4 -38
- package/package.json +5 -18
- package/README.md +0 -252
- package/dist/cli.d.ts +0 -2
- package/dist/index.d.ts +0 -18
- package/dist/plugins.d.ts +0 -89
- package/dist/preset/minimal.d.ts +0 -11
- package/dist/shared/mdream.BFdDSM96.d.ts +0 -9
- package/dist/shared/mdream.C0Qx0F7t.d.mts +0 -226
- package/dist/shared/mdream.C0Qx0F7t.d.ts +0 -226
- package/dist/shared/mdream.C8Xgmr_a.mjs +0 -280
- package/dist/shared/mdream.CNrwlePY.mjs +0 -105
- package/dist/shared/mdream.Crxe0Sar.mjs +0 -501
- package/dist/shared/mdream.DMe7T-0M.d.mts +0 -9
- package/dist/shared/mdream.DZEl9tTZ.mjs +0 -1475
- package/dist/shared/mdream.VU-fHLcf.mjs +0 -291
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
//#region src/plugins/extraction.d.ts
|
|
2
|
+
interface ExtractedElement extends ElementNode {
|
|
3
|
+
textContent: string;
|
|
4
|
+
}
|
|
5
|
+
declare function extractionPlugin(selectors: Record<string, (element: ExtractedElement, state: MdreamRuntimeState) => void>): Plugin;
|
|
6
|
+
//#endregion
|
|
7
|
+
//#region src/types.d.ts
|
|
8
|
+
/**
|
|
9
|
+
* Plugin interface for extending HTML to Markdown conversion
|
|
10
|
+
*/
|
|
11
|
+
interface Plugin {
|
|
12
|
+
/**
|
|
13
|
+
* Process a node before it's handled by the parser
|
|
14
|
+
*/
|
|
15
|
+
beforeNodeProcess?: (event: NodeEvent, state: MdreamRuntimeState) => undefined | void | {
|
|
16
|
+
skip: boolean;
|
|
17
|
+
};
|
|
18
|
+
/**
|
|
19
|
+
* Hook that runs when entering a node
|
|
20
|
+
* @returns String to add to the output, or undefined/void
|
|
21
|
+
*/
|
|
22
|
+
onNodeEnter?: (node: ElementNode, state: MdreamRuntimeState) => string | undefined | void;
|
|
23
|
+
/**
|
|
24
|
+
* Hook that runs when exiting a node
|
|
25
|
+
* @param node - The element node being exited
|
|
26
|
+
* @param state - The current runtime state
|
|
27
|
+
* @returns String to add to the output, or undefined/void
|
|
28
|
+
*/
|
|
29
|
+
onNodeExit?: (node: ElementNode, state: MdreamRuntimeState) => string | undefined | void;
|
|
30
|
+
/**
|
|
31
|
+
* Process attributes for a node
|
|
32
|
+
* @param node - The node to process attributes for
|
|
33
|
+
* @param state - The current runtime state
|
|
34
|
+
*/
|
|
35
|
+
processAttributes?: (node: ElementNode, state: MdreamRuntimeState) => void;
|
|
36
|
+
/**
|
|
37
|
+
* Process a text node before it's added to the output
|
|
38
|
+
* @param node - The text node to process
|
|
39
|
+
* @param state - The current runtime state
|
|
40
|
+
* @returns Result with content and skip flag, or undefined for no transformation
|
|
41
|
+
*/
|
|
42
|
+
processTextNode?: (node: TextNode, state: MdreamRuntimeState) => {
|
|
43
|
+
content: string;
|
|
44
|
+
skip: boolean;
|
|
45
|
+
} | undefined;
|
|
46
|
+
}
|
|
47
|
+
/**
|
|
48
|
+
* Plugin creation options for controlling plugin behavior
|
|
49
|
+
*/
|
|
50
|
+
interface PluginCreationOptions {
|
|
51
|
+
/**
|
|
52
|
+
* Order in which plugins are executed
|
|
53
|
+
* Lower numbers run first
|
|
54
|
+
*/
|
|
55
|
+
order?: number;
|
|
56
|
+
/**
|
|
57
|
+
* Priority for region conflict resolution
|
|
58
|
+
* Higher numbers take precedence over lower
|
|
59
|
+
*/
|
|
60
|
+
priority?: number;
|
|
61
|
+
}
|
|
62
|
+
interface HTMLToMarkdownOptions {
|
|
63
|
+
/**
|
|
64
|
+
* Origin URL for resolving relative image paths and internal links.
|
|
65
|
+
* Important when converting HTML with relative paths from a specific website.
|
|
66
|
+
*/
|
|
67
|
+
origin?: string;
|
|
68
|
+
/**
|
|
69
|
+
* Plugins to extend HTML to Markdown conversion
|
|
70
|
+
*/
|
|
71
|
+
plugins?: Plugin[];
|
|
72
|
+
}
|
|
73
|
+
declare const ELEMENT_NODE = 1;
|
|
74
|
+
declare const TEXT_NODE = 3;
|
|
75
|
+
interface ElementNode extends Node {
|
|
76
|
+
/** Element tag name (for ELEMENT_NODE) */
|
|
77
|
+
name: string;
|
|
78
|
+
/** HTML attributes (for ELEMENT_NODE) */
|
|
79
|
+
attributes: Record<string, string>;
|
|
80
|
+
/** Custom data added by plugins */
|
|
81
|
+
context?: PluginContext;
|
|
82
|
+
/** ID of the tag for fast handler lookup */
|
|
83
|
+
tagId?: number;
|
|
84
|
+
/** Map of tag names to their nesting count (using Uint8Array for performance) */
|
|
85
|
+
depthMap: Uint8Array;
|
|
86
|
+
/** Plugin outputs collected during processing */
|
|
87
|
+
pluginOutput?: string[];
|
|
88
|
+
}
|
|
89
|
+
interface TextNode extends Node {
|
|
90
|
+
/** Text content (for TEXT_NODE) */
|
|
91
|
+
value: string;
|
|
92
|
+
/** Custom data added by plugins */
|
|
93
|
+
context?: PluginContext;
|
|
94
|
+
/** Whether this text node should be excluded from markdown output (for script/style elements) */
|
|
95
|
+
excludedFromMarkdown?: boolean;
|
|
96
|
+
}
|
|
97
|
+
/**
|
|
98
|
+
* Base DOM node interface
|
|
99
|
+
* Optimized for streaming HTML parsing with minimal memory footprint
|
|
100
|
+
*/
|
|
101
|
+
interface Node {
|
|
102
|
+
/** Node type (ELEMENT_NODE or TEXT_NODE) */
|
|
103
|
+
type: number;
|
|
104
|
+
/** Current nesting depth in the DOM tree */
|
|
105
|
+
depth: number;
|
|
106
|
+
/** Node exclusion and filtering now handled by plugins */
|
|
107
|
+
/** Index of this node within its parent's children */
|
|
108
|
+
index: number;
|
|
109
|
+
/** Current walk index for child traversal during streaming */
|
|
110
|
+
currentWalkIndex?: number;
|
|
111
|
+
/** Count of text child nodes - used for whitespace handling */
|
|
112
|
+
childTextNodeIndex?: number;
|
|
113
|
+
/** Whether node contains whitespace - used for whitespace optimization */
|
|
114
|
+
containsWhitespace?: boolean;
|
|
115
|
+
/** Cached reference to tag handler for performance */
|
|
116
|
+
tagHandler?: TagHandler;
|
|
117
|
+
/** Parent node */
|
|
118
|
+
parent?: ElementNode | null;
|
|
119
|
+
/** Custom data added by plugins */
|
|
120
|
+
context?: PluginContext;
|
|
121
|
+
/** Region ID for buffer region tracking */
|
|
122
|
+
regionId?: number;
|
|
123
|
+
}
|
|
124
|
+
/**
|
|
125
|
+
* Buffer region for tracking content inclusion/exclusion
|
|
126
|
+
*/
|
|
127
|
+
interface BufferRegion {
|
|
128
|
+
/** Unique identifier */
|
|
129
|
+
id: number;
|
|
130
|
+
/** Inclusion state */
|
|
131
|
+
include: boolean;
|
|
132
|
+
}
|
|
133
|
+
/**
|
|
134
|
+
* State interface for HTML parsing and processing
|
|
135
|
+
* Contains parsing state that's maintained during HTML traversal
|
|
136
|
+
*/
|
|
137
|
+
interface MdreamProcessingState {
|
|
138
|
+
/** Map of tag names to their current nesting depth - uses TypedArray for performance */
|
|
139
|
+
depthMap: Uint8Array;
|
|
140
|
+
/** Current overall nesting depth */
|
|
141
|
+
depth: number;
|
|
142
|
+
/** Currently processing element node */
|
|
143
|
+
currentNode?: ElementNode | null;
|
|
144
|
+
/** Node filtering and exclusion is now handled by plugins */
|
|
145
|
+
/** Whether current content contains HTML entities that need decoding */
|
|
146
|
+
hasEncodedHtmlEntity?: boolean;
|
|
147
|
+
/** Whether the last processed character was whitespace - for collapsing whitespace */
|
|
148
|
+
lastCharWasWhitespace?: boolean;
|
|
149
|
+
/** Whether the last processed buffer has whitespace - optimization flag */
|
|
150
|
+
textBufferContainsWhitespace?: boolean;
|
|
151
|
+
/** Whether the last processed buffer contains non-whitespace characters */
|
|
152
|
+
textBufferContainsNonWhitespace?: boolean;
|
|
153
|
+
/** Whether a tag was just closed - affects whitespace handling */
|
|
154
|
+
justClosedTag?: boolean;
|
|
155
|
+
/** Whether the next text node is the first in its element - for whitespace trimming */
|
|
156
|
+
isFirstTextInElement?: boolean;
|
|
157
|
+
/** Reference to the last processed text node - for context tracking */
|
|
158
|
+
lastTextNode?: Node;
|
|
159
|
+
/** Quote state tracking for non-nesting tags - avoids backward scanning */
|
|
160
|
+
inSingleQuote?: boolean;
|
|
161
|
+
inDoubleQuote?: boolean;
|
|
162
|
+
inBacktick?: boolean;
|
|
163
|
+
/** Backslash escaping state tracking - avoids checking previous character */
|
|
164
|
+
lastCharWasBackslash?: boolean;
|
|
165
|
+
/** Plugin instances array for efficient iteration */
|
|
166
|
+
plugins?: Plugin[];
|
|
167
|
+
/** Configuration options for conversion */
|
|
168
|
+
options?: HTMLToMarkdownOptions;
|
|
169
|
+
}
|
|
170
|
+
/**
|
|
171
|
+
* Runtime state for markdown generation
|
|
172
|
+
* Extended state that includes output tracking and options
|
|
173
|
+
*/
|
|
174
|
+
interface MdreamRuntimeState extends Partial<MdreamProcessingState> {
|
|
175
|
+
/** Number of newlines at end of most recent output */
|
|
176
|
+
lastNewLines?: number;
|
|
177
|
+
/** Configuration options for conversion */
|
|
178
|
+
options?: HTMLToMarkdownOptions;
|
|
179
|
+
/** Table processing state - specialized for Markdown tables */
|
|
180
|
+
tableRenderedTable?: boolean;
|
|
181
|
+
tableCurrentRowCells?: number;
|
|
182
|
+
tableColumnAlignments?: string[];
|
|
183
|
+
/** Plugin instances array for efficient iteration */
|
|
184
|
+
plugins?: Plugin[];
|
|
185
|
+
/** Map of region IDs to boolean toggles for O(1) lookups */
|
|
186
|
+
regionToggles: Map<number, boolean>;
|
|
187
|
+
/** Content buffers for regions */
|
|
188
|
+
regionContentBuffers: Map<number, string[]>;
|
|
189
|
+
/** Performance cache for last content to avoid iteration */
|
|
190
|
+
lastContentCache?: string;
|
|
191
|
+
/** Reference to the last processed node */
|
|
192
|
+
lastNode?: Node;
|
|
193
|
+
context?: PluginContext;
|
|
194
|
+
}
|
|
195
|
+
type NodeEventEnter = 0;
|
|
196
|
+
type NodeEventExit = 1;
|
|
197
|
+
/**
|
|
198
|
+
* Node event for DOM traversal
|
|
199
|
+
* Used in the event-based traversal system for streaming processing
|
|
200
|
+
*/
|
|
201
|
+
interface NodeEvent {
|
|
202
|
+
/** Event type - enter (start tag) or exit (end tag) */
|
|
203
|
+
type: NodeEventEnter | NodeEventExit;
|
|
204
|
+
/** The node being processed */
|
|
205
|
+
node: Node;
|
|
206
|
+
}
|
|
207
|
+
/**
|
|
208
|
+
* Handler context for markdown conversion
|
|
209
|
+
* Passed to tag handler functions for converting specific elements
|
|
210
|
+
*/
|
|
211
|
+
interface HandlerContext {
|
|
212
|
+
/** Current node being processed */
|
|
213
|
+
node: ElementNode;
|
|
214
|
+
/** Parent node (if any) */
|
|
215
|
+
parent?: ElementNode;
|
|
216
|
+
/** Runtime state */
|
|
217
|
+
state: MdreamRuntimeState;
|
|
218
|
+
}
|
|
219
|
+
/**
|
|
220
|
+
* Tag handler interface for HTML elements
|
|
221
|
+
* Used by plugins to extend or customize tag handling
|
|
222
|
+
*/
|
|
223
|
+
interface TagHandler {
|
|
224
|
+
enter?: (context: HandlerContext) => string | undefined | void;
|
|
225
|
+
exit?: (context: HandlerContext) => string | undefined | void;
|
|
226
|
+
isSelfClosing?: boolean;
|
|
227
|
+
isNonNesting?: boolean;
|
|
228
|
+
collapsesInnerWhiteSpace?: boolean;
|
|
229
|
+
isInline?: boolean;
|
|
230
|
+
spacing?: readonly [number, number];
|
|
231
|
+
excludesTextNodes?: boolean;
|
|
232
|
+
}
|
|
233
|
+
interface ReadabilityContext {
|
|
234
|
+
score?: number;
|
|
235
|
+
tagCount?: number;
|
|
236
|
+
linkTextLength?: number;
|
|
237
|
+
textLength?: number;
|
|
238
|
+
isHighLinkDensity?: boolean;
|
|
239
|
+
}
|
|
240
|
+
interface TailwindContext {
|
|
241
|
+
hidden?: boolean;
|
|
242
|
+
prefix?: string;
|
|
243
|
+
suffix?: string;
|
|
244
|
+
}
|
|
245
|
+
interface PluginContext {
|
|
246
|
+
score?: number;
|
|
247
|
+
tagCount?: number;
|
|
248
|
+
linkTextLength?: number;
|
|
249
|
+
textLength?: number;
|
|
250
|
+
isHighLinkDensity?: boolean;
|
|
251
|
+
tailwind?: TailwindContext;
|
|
252
|
+
[key: string]: unknown;
|
|
253
|
+
}
|
|
254
|
+
//#endregion
|
|
255
|
+
export { BufferRegion, ELEMENT_NODE as ELEMENT_NODE$1, ElementNode, ExtractedElement, HTMLToMarkdownOptions, HandlerContext, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginContext, PluginCreationOptions, ReadabilityContext, TEXT_NODE as TEXT_NODE$1, TagHandler, TailwindContext, TextNode, extractionPlugin as extractionPlugin$1 };
|
package/dist/cli.d.mts
CHANGED
|
@@ -1,2 +1 @@
|
|
|
1
|
-
|
|
2
|
-
export { };
|
|
1
|
+
export { };
|
package/dist/cli.mjs
CHANGED
|
@@ -1,25 +1,29 @@
|
|
|
1
|
-
import
|
|
2
|
-
import {
|
|
3
|
-
import
|
|
4
|
-
import {
|
|
5
|
-
import {
|
|
6
|
-
import
|
|
1
|
+
import "./_chunks/plugin-Bqz9GKOA.mjs";
|
|
2
|
+
import { streamHtmlToMarkdown } from "./_chunks/stream-IeCVDuTy.mjs";
|
|
3
|
+
import "./_chunks/plugins-D305pIpW.mjs";
|
|
4
|
+
import { withMinimalPreset } from "./_chunks/minimal-Ru8PBNVI.mjs";
|
|
5
|
+
import { readFileSync } from "node:fs";
|
|
6
|
+
import { dirname, join } from "node:path";
|
|
7
|
+
import { Readable } from "node:stream";
|
|
8
|
+
import { fileURLToPath } from "node:url";
|
|
9
|
+
import { cac } from "cac";
|
|
7
10
|
|
|
11
|
+
//#region src/cli.ts
|
|
8
12
|
async function streamingConvert(options = {}) {
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
const markdownGenerator = streamHtmlToMarkdown(Readable.toWeb(process.stdin), conversionOptions);
|
|
15
|
-
for await (const markdownChunk of markdownGenerator) {
|
|
16
|
-
if (markdownChunk && markdownChunk.length > 0) {
|
|
17
|
-
outputStream.write(markdownChunk);
|
|
18
|
-
}
|
|
19
|
-
}
|
|
13
|
+
const outputStream = process.stdout;
|
|
14
|
+
let conversionOptions = { origin: options.origin };
|
|
15
|
+
if (options.preset === "minimal") conversionOptions = withMinimalPreset(conversionOptions);
|
|
16
|
+
const markdownGenerator = streamHtmlToMarkdown(Readable.toWeb(process.stdin), conversionOptions);
|
|
17
|
+
for await (const markdownChunk of markdownGenerator) if (markdownChunk && markdownChunk.length > 0) outputStream.write(markdownChunk);
|
|
20
18
|
}
|
|
19
|
+
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
20
|
+
const packageJsonPath = join(__dirname, "..", "package.json");
|
|
21
|
+
const packageJson = JSON.parse(readFileSync(packageJsonPath, "utf-8"));
|
|
22
|
+
const version = packageJson.version;
|
|
21
23
|
const cli = cac();
|
|
22
24
|
cli.command("[options]", "Convert HTML from stdin to Markdown on stdout").option("--origin <url>", "Origin URL for resolving relative image paths").option("--preset <preset>", "Conversion presets: minimal").action(async (_, opts) => {
|
|
23
|
-
|
|
25
|
+
await streamingConvert(opts);
|
|
24
26
|
});
|
|
25
|
-
cli.help().version(
|
|
27
|
+
cli.help().version(version).parse();
|
|
28
|
+
|
|
29
|
+
//#endregion
|
package/dist/index.d.mts
CHANGED
|
@@ -1,10 +1,66 @@
|
|
|
1
|
-
import {
|
|
2
|
-
|
|
3
|
-
import { ReadableStream } from
|
|
4
|
-
export { E as ExtractedElement } from './shared/mdream.DMe7T-0M.mjs';
|
|
1
|
+
import { BufferRegion, ELEMENT_NODE$1 as ELEMENT_NODE, ElementNode, ExtractedElement, HTMLToMarkdownOptions, HandlerContext, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginContext, PluginCreationOptions, ReadabilityContext, TEXT_NODE$1 as TEXT_NODE, TagHandler, TailwindContext, TextNode } from "./_chunks/types-D9VKEbix.mjs";
|
|
2
|
+
import { createPlugin$1 as createPlugin } from "./_chunks/plugin-D45YAMmt.mjs";
|
|
3
|
+
import { ReadableStream } from "node:stream/web";
|
|
5
4
|
|
|
6
|
-
|
|
5
|
+
//#region src/const.d.ts
|
|
7
6
|
|
|
7
|
+
declare const TagIdMap: Record<string, number>;
|
|
8
|
+
//#endregion
|
|
9
|
+
//#region src/markdown-processor.d.ts
|
|
10
|
+
interface MarkdownState {
|
|
11
|
+
/** Configuration options for conversion */
|
|
12
|
+
options?: HTMLToMarkdownOptions;
|
|
13
|
+
/** Map of region IDs to boolean toggles for O(1) lookups */
|
|
14
|
+
regionToggles: Map<number, boolean>;
|
|
15
|
+
/** Content buffers for regions */
|
|
16
|
+
regionContentBuffers: Map<number, string[]>;
|
|
17
|
+
/** Performance cache for last content to avoid iteration */
|
|
18
|
+
lastContentCache?: string;
|
|
19
|
+
/** Reference to the last processed node */
|
|
20
|
+
lastNode?: ElementNode | TextNode;
|
|
21
|
+
/** Reference to the last processed text node - for context tracking */
|
|
22
|
+
lastTextNode?: TextNode;
|
|
23
|
+
/** Table processing state - specialized for Markdown tables */
|
|
24
|
+
tableRenderedTable?: boolean;
|
|
25
|
+
tableCurrentRowCells?: number;
|
|
26
|
+
tableColumnAlignments?: string[];
|
|
27
|
+
/** Map of tag names to their current nesting depth */
|
|
28
|
+
depthMap: Uint8Array;
|
|
29
|
+
/** Current depth for plugin access */
|
|
30
|
+
depth?: number;
|
|
31
|
+
/** Context for additional data */
|
|
32
|
+
context?: PluginContext;
|
|
33
|
+
}
|
|
34
|
+
/**
|
|
35
|
+
* Creates a markdown processor that consumes DOM events and generates markdown
|
|
36
|
+
*/
|
|
37
|
+
declare function createMarkdownProcessor(options?: HTMLToMarkdownOptions): {
|
|
38
|
+
processEvent: (event: NodeEvent) => void;
|
|
39
|
+
processHtml: (html: string) => void;
|
|
40
|
+
getMarkdown: () => string;
|
|
41
|
+
getMarkdownChunk: () => string;
|
|
42
|
+
state: MarkdownState;
|
|
43
|
+
};
|
|
44
|
+
declare const MarkdownProcessor: typeof createMarkdownProcessor;
|
|
45
|
+
//#endregion
|
|
46
|
+
//#region src/parse.d.ts
|
|
47
|
+
interface ParseOptions {
|
|
48
|
+
plugins?: Plugin[];
|
|
49
|
+
}
|
|
50
|
+
interface ParseResult {
|
|
51
|
+
events: NodeEvent[];
|
|
52
|
+
remainingHtml: string;
|
|
53
|
+
}
|
|
54
|
+
/**
|
|
55
|
+
* Pure HTML parser that emits DOM events
|
|
56
|
+
* Completely decoupled from markdown generation
|
|
57
|
+
*/
|
|
58
|
+
declare function parseHtml(html: string, options?: ParseOptions): ParseResult;
|
|
59
|
+
/**
|
|
60
|
+
* Streaming HTML parser - calls onEvent for each DOM event
|
|
61
|
+
*/
|
|
62
|
+
//#endregion
|
|
63
|
+
//#region src/stream.d.ts
|
|
8
64
|
/**
|
|
9
65
|
* Creates a markdown stream from an HTML stream
|
|
10
66
|
* @param htmlStream - ReadableStream of HTML content (as Uint8Array or string)
|
|
@@ -12,7 +68,8 @@ declare const TagIdMap: Record<string, number>;
|
|
|
12
68
|
* @returns An async generator yielding markdown chunks
|
|
13
69
|
*/
|
|
14
70
|
declare function streamHtmlToMarkdown(htmlStream: ReadableStream | null, options?: HTMLToMarkdownOptions): AsyncIterable<string>;
|
|
15
|
-
|
|
71
|
+
//#endregion
|
|
72
|
+
//#region src/index.d.ts
|
|
16
73
|
declare function htmlToMarkdown(html: string, options?: HTMLToMarkdownOptions): string;
|
|
17
|
-
|
|
18
|
-
export { HTMLToMarkdownOptions, TagIdMap, htmlToMarkdown, streamHtmlToMarkdown };
|
|
74
|
+
//#endregion
|
|
75
|
+
export { BufferRegion, ELEMENT_NODE, ElementNode, ExtractedElement, HTMLToMarkdownOptions, HandlerContext, MarkdownProcessor, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginContext, PluginCreationOptions, ReadabilityContext, TEXT_NODE, TagHandler, TagIdMap, TailwindContext, TextNode, createPlugin, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
|
package/dist/index.mjs
CHANGED
|
@@ -1,13 +1,12 @@
|
|
|
1
|
-
import {
|
|
2
|
-
|
|
3
|
-
export { T as TagIdMap } from './shared/mdream.C8Xgmr_a.mjs';
|
|
1
|
+
import { TagIdMap, createPlugin } from "./_chunks/plugin-Bqz9GKOA.mjs";
|
|
2
|
+
import { MarkdownProcessor, createMarkdownProcessor, parseHtml, streamHtmlToMarkdown } from "./_chunks/stream-IeCVDuTy.mjs";
|
|
4
3
|
|
|
4
|
+
//#region src/index.ts
|
|
5
5
|
function htmlToMarkdown(html, options = {}) {
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
const result = processPartialHTMLToMarkdown(html, state).chunk;
|
|
10
|
-
return result.trimEnd();
|
|
6
|
+
const processor = createMarkdownProcessor(options);
|
|
7
|
+
processor.processHtml(html);
|
|
8
|
+
return processor.getMarkdown();
|
|
11
9
|
}
|
|
12
10
|
|
|
13
|
-
|
|
11
|
+
//#endregion
|
|
12
|
+
export { MarkdownProcessor, TagIdMap, createPlugin, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
|
package/dist/plugins.d.mts
CHANGED
|
@@ -1,12 +1,7 @@
|
|
|
1
|
-
import {
|
|
2
|
-
|
|
1
|
+
import { Plugin, extractionPlugin$1 as extractionPlugin } from "./_chunks/types-D9VKEbix.mjs";
|
|
2
|
+
import { createPlugin$1 as createPlugin } from "./_chunks/plugin-D45YAMmt.mjs";
|
|
3
3
|
|
|
4
|
-
|
|
5
|
-
* Create a plugin that implements the Plugin interface with improved type inference
|
|
6
|
-
*
|
|
7
|
-
* @returns A complete plugin implementation
|
|
8
|
-
*/
|
|
9
|
-
declare function createPlugin<T extends Partial<Plugin>>(plugin: T): Plugin;
|
|
4
|
+
//#region src/plugins/filter.d.ts
|
|
10
5
|
|
|
11
6
|
/**
|
|
12
7
|
* Plugin that filters nodes based on CSS selectors.
|
|
@@ -21,29 +16,31 @@ declare function createPlugin<T extends Partial<Plugin>>(plugin: T): Plugin;
|
|
|
21
16
|
* filterPlugin({ exclude: ['nav', '#sidebar', '.footer'] })
|
|
22
17
|
*/
|
|
23
18
|
declare function filterPlugin(options?: {
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
19
|
+
/** CSS selectors (or Tag Ids) for elements to include (all others will be excluded) */
|
|
20
|
+
include?: (string | number)[];
|
|
21
|
+
/** CSS selectors (or Tag Ids) for elements to exclude */
|
|
22
|
+
exclude?: (string | number)[];
|
|
23
|
+
/** Whether to also process the children of matching elements */
|
|
24
|
+
processChildren?: boolean;
|
|
25
|
+
keepAbsolute?: boolean;
|
|
31
26
|
}): Plugin;
|
|
32
|
-
|
|
27
|
+
//#endregion
|
|
28
|
+
//#region src/plugins/frontmatter.d.ts
|
|
33
29
|
interface FrontmatterPluginOptions {
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
30
|
+
/** Additional frontmatter fields to include */
|
|
31
|
+
additionalFields?: Record<string, string>;
|
|
32
|
+
/** Meta tag names to extract (beyond the standard ones) */
|
|
33
|
+
metaFields?: string[];
|
|
34
|
+
/** Custom formatter for frontmatter values */
|
|
35
|
+
formatValue?: (name: string, value: string) => string;
|
|
40
36
|
}
|
|
41
37
|
/**
|
|
42
38
|
* A plugin that manages frontmatter generation from HTML head elements
|
|
43
39
|
* Extracts metadata from meta tags and title and generates YAML frontmatter
|
|
44
40
|
*/
|
|
45
41
|
declare function frontmatterPlugin(options?: FrontmatterPluginOptions): Plugin;
|
|
46
|
-
|
|
42
|
+
//#endregion
|
|
43
|
+
//#region src/plugins/isolate-main.d.ts
|
|
47
44
|
/**
|
|
48
45
|
* Plugin that isolates main content using the following priority order:
|
|
49
46
|
* 1. If an explicit <main> element exists (within 5 depth levels), use its content exclusively
|
|
@@ -74,16 +71,18 @@ declare function frontmatterPlugin(options?: FrontmatterPluginOptions): Plugin;
|
|
|
74
71
|
* ```
|
|
75
72
|
*/
|
|
76
73
|
declare function isolateMainPlugin(): Plugin;
|
|
77
|
-
|
|
74
|
+
//#endregion
|
|
75
|
+
//#region src/plugins/readability.d.ts
|
|
78
76
|
/**
|
|
79
77
|
* Creates a plugin that implements readability.js style heuristics for content quality assessment
|
|
80
78
|
* Controls content inclusion/exclusion using buffer regions
|
|
81
79
|
*/
|
|
82
80
|
declare function readabilityPlugin(): Plugin;
|
|
83
|
-
|
|
81
|
+
//#endregion
|
|
82
|
+
//#region src/plugins/tailwind.d.ts
|
|
84
83
|
/**
|
|
85
84
|
* Creates a plugin that adds Tailwind class processing
|
|
86
85
|
*/
|
|
87
86
|
declare function tailwindPlugin(): Plugin;
|
|
88
|
-
|
|
89
|
-
export { createPlugin, filterPlugin, frontmatterPlugin, isolateMainPlugin, readabilityPlugin, tailwindPlugin };
|
|
87
|
+
//#endregion
|
|
88
|
+
export { createPlugin, extractionPlugin, filterPlugin, frontmatterPlugin, isolateMainPlugin, readabilityPlugin, tailwindPlugin };
|
package/dist/plugins.mjs
CHANGED
|
@@ -1,46 +1,4 @@
|
|
|
1
|
-
import {
|
|
2
|
-
|
|
3
|
-
import { p as parseSelector } from './shared/mdream.Crxe0Sar.mjs';
|
|
4
|
-
export { f as filterPlugin, i as isolateMainPlugin, t as tailwindPlugin } from './shared/mdream.Crxe0Sar.mjs';
|
|
5
|
-
export { r as readabilityPlugin } from './shared/mdream.VU-fHLcf.mjs';
|
|
6
|
-
import './shared/mdream.C8Xgmr_a.mjs';
|
|
1
|
+
import { createPlugin } from "./_chunks/plugin-Bqz9GKOA.mjs";
|
|
2
|
+
import { extractionPlugin, filterPlugin, frontmatterPlugin, isolateMainPlugin, readabilityPlugin, tailwindPlugin } from "./_chunks/plugins-D305pIpW.mjs";
|
|
7
3
|
|
|
8
|
-
|
|
9
|
-
const matcherCallbacks = Object.entries(selectors).map(([selector, callback]) => ({
|
|
10
|
-
matcher: parseSelector(selector),
|
|
11
|
-
callback
|
|
12
|
-
}));
|
|
13
|
-
const trackedElements = /* @__PURE__ */ new Map();
|
|
14
|
-
return createPlugin({
|
|
15
|
-
onNodeEnter(element) {
|
|
16
|
-
matcherCallbacks.forEach(({ matcher, callback }) => {
|
|
17
|
-
if (matcher.matches(element)) {
|
|
18
|
-
trackedElements.set(element, { textContent: "", callback });
|
|
19
|
-
}
|
|
20
|
-
});
|
|
21
|
-
},
|
|
22
|
-
processTextNode(textNode) {
|
|
23
|
-
let currentParent = textNode.parent;
|
|
24
|
-
while (currentParent) {
|
|
25
|
-
const tracked = trackedElements.get(currentParent);
|
|
26
|
-
if (tracked) {
|
|
27
|
-
tracked.textContent += textNode.value;
|
|
28
|
-
}
|
|
29
|
-
currentParent = currentParent.parent;
|
|
30
|
-
}
|
|
31
|
-
},
|
|
32
|
-
onNodeExit(element, state) {
|
|
33
|
-
const tracked = trackedElements.get(element);
|
|
34
|
-
if (tracked) {
|
|
35
|
-
const extractedElement = {
|
|
36
|
-
...element,
|
|
37
|
-
textContent: tracked.textContent.trim()
|
|
38
|
-
};
|
|
39
|
-
tracked.callback(extractedElement, state);
|
|
40
|
-
trackedElements.delete(element);
|
|
41
|
-
}
|
|
42
|
-
}
|
|
43
|
-
});
|
|
44
|
-
}
|
|
45
|
-
|
|
46
|
-
export { createPlugin, extractionPlugin };
|
|
4
|
+
export { createPlugin, extractionPlugin, filterPlugin, frontmatterPlugin, isolateMainPlugin, readabilityPlugin, tailwindPlugin };
|
|
@@ -1,4 +1,6 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { HTMLToMarkdownOptions } from "../_chunks/types-D9VKEbix.mjs";
|
|
2
|
+
|
|
3
|
+
//#region src/preset/minimal.d.ts
|
|
2
4
|
|
|
3
5
|
/**
|
|
4
6
|
* Creates a configurable minimal preset with advanced options
|
|
@@ -7,5 +9,5 @@ import { H as HTMLToMarkdownOptions } from '../shared/mdream.C0Qx0F7t.mjs';
|
|
|
7
9
|
* @returns HTML to Markdown options with configured plugins
|
|
8
10
|
*/
|
|
9
11
|
declare function withMinimalPreset(options?: HTMLToMarkdownOptions): HTMLToMarkdownOptions;
|
|
10
|
-
|
|
11
|
-
export { withMinimalPreset };
|
|
12
|
+
//#endregion
|
|
13
|
+
export { withMinimalPreset };
|
package/dist/preset/minimal.mjs
CHANGED
|
@@ -1,39 +1,5 @@
|
|
|
1
|
-
import
|
|
2
|
-
import
|
|
3
|
-
import {
|
|
1
|
+
import "../_chunks/plugin-Bqz9GKOA.mjs";
|
|
2
|
+
import "../_chunks/plugins-D305pIpW.mjs";
|
|
3
|
+
import { withMinimalPreset } from "../_chunks/minimal-Ru8PBNVI.mjs";
|
|
4
4
|
|
|
5
|
-
|
|
6
|
-
const plugins = [
|
|
7
|
-
isolateMainPlugin(),
|
|
8
|
-
frontmatterPlugin(),
|
|
9
|
-
tailwindPlugin(),
|
|
10
|
-
// First apply readability plugin to extract main content
|
|
11
|
-
// Then filter out unwanted tags
|
|
12
|
-
filterPlugin({
|
|
13
|
-
exclude: [
|
|
14
|
-
TAG_FORM,
|
|
15
|
-
TAG_FIELDSET,
|
|
16
|
-
TAG_OBJECT,
|
|
17
|
-
TAG_EMBED,
|
|
18
|
-
TAG_FIGURE,
|
|
19
|
-
TAG_FOOTER,
|
|
20
|
-
TAG_ASIDE,
|
|
21
|
-
TAG_IFRAME,
|
|
22
|
-
TAG_INPUT,
|
|
23
|
-
TAG_TEXTAREA,
|
|
24
|
-
TAG_SELECT,
|
|
25
|
-
TAG_BUTTON,
|
|
26
|
-
TAG_NAV
|
|
27
|
-
]
|
|
28
|
-
})
|
|
29
|
-
];
|
|
30
|
-
if (options.plugins) {
|
|
31
|
-
plugins.push(...options.plugins);
|
|
32
|
-
}
|
|
33
|
-
return {
|
|
34
|
-
...options,
|
|
35
|
-
plugins
|
|
36
|
-
};
|
|
37
|
-
}
|
|
38
|
-
|
|
39
|
-
export { withMinimalPreset };
|
|
5
|
+
export { withMinimalPreset };
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "mdream",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.
|
|
4
|
+
"version": "0.4.0",
|
|
5
5
|
"description": "Ultra-performant JavaScript HTML to Markdown converter optimized for LLMs.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -28,18 +28,6 @@
|
|
|
28
28
|
"dependencies": {
|
|
29
29
|
"cac": "^6.7.14"
|
|
30
30
|
},
|
|
31
|
-
"devDependencies": {
|
|
32
|
-
"@antfu/eslint-config": "^4.16.1",
|
|
33
|
-
"@types/node": "^22.15.34",
|
|
34
|
-
"bumpp": "^10.2.0",
|
|
35
|
-
"crawlee": "^3.13.8",
|
|
36
|
-
"eslint": "^9.30.0",
|
|
37
|
-
"llm-cost": "^1.0.5",
|
|
38
|
-
"playwright": "^1.53.1",
|
|
39
|
-
"typescript": "5.8.3",
|
|
40
|
-
"unbuild": "^3.5.0",
|
|
41
|
-
"vitest": "^3.2.4"
|
|
42
|
-
},
|
|
43
31
|
"scripts": {
|
|
44
32
|
"flame": "pnpm build && unbuild bench/bundle && clinic flame -- node bench/bundle/dist/string.mjs 10",
|
|
45
33
|
"bench:build": "pnpm build && unbuild bench/bundle",
|
|
@@ -53,10 +41,9 @@
|
|
|
53
41
|
"test:github:file": "cat test/fixtures/github-markdown-complete.html | node ./bin/mdream.mjs --origin https://docs.github.com | tee test/github-markdown.md",
|
|
54
42
|
"test:wiki:file": "pnpm build && cat test/fixtures/wikipedia-largest.html | node ./bin/mdream.mjs --origin https://en.wikipedia.org | tee test/wiki-markdown.md",
|
|
55
43
|
"test:wiki-small:file": "cat test/fixtures/wikipedia-small.html | node ./bin/mdream.mjs --origin https://en.wikipedia.org | tee test/wiki-markdown.md",
|
|
56
|
-
"build": "
|
|
57
|
-
"typecheck": "tsc --noEmit
|
|
58
|
-
"dev:prepare": "
|
|
59
|
-
"test": "vitest test"
|
|
60
|
-
"release": "pnpm build && bumpp && pnpm -r publish"
|
|
44
|
+
"build": "obuild",
|
|
45
|
+
"typecheck": "tsc --noEmit",
|
|
46
|
+
"dev:prepare": "obuild --stub",
|
|
47
|
+
"test": "vitest test"
|
|
61
48
|
}
|
|
62
49
|
}
|