mdream 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE.md +9 -0
- package/README.md +185 -0
- package/bin/mdream.mjs +2 -0
- package/dist/cli.d.mts +2 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.mjs +25 -0
- package/dist/index.d.mts +15 -0
- package/dist/index.d.ts +15 -0
- package/dist/index.mjs +13 -0
- package/dist/plugins.d.mts +88 -0
- package/dist/plugins.d.ts +88 -0
- package/dist/plugins.mjs +4 -0
- package/dist/preset/minimal.d.mts +11 -0
- package/dist/preset/minimal.d.ts +11 -0
- package/dist/preset/minimal.mjs +39 -0
- package/dist/shared/mdream.-hdaPj9a.mjs +280 -0
- package/dist/shared/mdream.5zaIXVJz.mjs +508 -0
- package/dist/shared/mdream.C8ruysN5.mjs +291 -0
- package/dist/shared/mdream.DUeWbUFG.mjs +1432 -0
- package/dist/shared/mdream.a2AvjJLp.d.mts +218 -0
- package/dist/shared/mdream.a2AvjJLp.d.ts +218 -0
- package/dist/shared/mdream.cpEmpxyh.mjs +105 -0
- package/package.json +62 -0
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Plugin interface for extending HTML to Markdown conversion
|
|
3
|
+
*/
|
|
4
|
+
interface Plugin {
|
|
5
|
+
/**
|
|
6
|
+
* Process a node before it's handled by the parser
|
|
7
|
+
*/
|
|
8
|
+
beforeNodeProcess?: (event: NodeEvent, state: MdreamRuntimeState) => undefined | void | {
|
|
9
|
+
skip: boolean;
|
|
10
|
+
};
|
|
11
|
+
/**
|
|
12
|
+
* Hook that runs when entering a node
|
|
13
|
+
* @returns String to add to the output, or undefined when the hook adds nothing
|
|
14
|
+
*/
|
|
15
|
+
onNodeEnter?: (node: ElementNode, state: MdreamRuntimeState) => string | undefined | void;
|
|
16
|
+
/**
|
|
17
|
+
* Hook that runs when exiting a node
|
|
18
|
+
* @param node - The element node being exited
|
|
19
|
+
* @param state - The current runtime state
|
|
20
|
+
* @returns String to add to the output, or undefined when the hook adds nothing
|
|
21
|
+
*/
|
|
22
|
+
onNodeExit?: (node: ElementNode, state: MdreamRuntimeState) => string | undefined | void;
|
|
23
|
+
/**
|
|
24
|
+
* Process attributes for a node
|
|
25
|
+
* @param node - The node to process attributes for
|
|
26
|
+
* @param state - The current runtime state
|
|
27
|
+
*/
|
|
28
|
+
processAttributes?: (node: ElementNode, state: MdreamRuntimeState) => void;
|
|
29
|
+
/**
|
|
30
|
+
* Process a text node before it's added to the output
|
|
31
|
+
* @param node - The text node to process
|
|
32
|
+
* @param state - The current runtime state
|
|
33
|
+
* @returns An object with `content` (string) and `skip` (boolean) fields, or undefined
|
|
34
|
+
*/
|
|
35
|
+
processTextNode?: (node: TextNode, state: MdreamRuntimeState) => undefined | void | {
|
|
36
|
+
content: string;
|
|
37
|
+
skip: boolean;
|
|
38
|
+
};
|
|
39
|
+
}
|
|
40
|
+
/**
|
|
41
|
+
* Plugin creation options for controlling plugin behavior
|
|
42
|
+
*/
|
|
43
|
+
interface PluginCreationOptions {
|
|
44
|
+
/**
|
|
45
|
+
* Order in which plugins are executed
|
|
46
|
+
* Lower numbers run first
|
|
47
|
+
*/
|
|
48
|
+
order?: number;
|
|
49
|
+
/**
|
|
50
|
+
* Priority for region conflict resolution
|
|
51
|
+
* Higher numbers take precedence over lower
|
|
52
|
+
*/
|
|
53
|
+
priority?: number;
|
|
54
|
+
}
|
|
55
|
+
interface HTMLToMarkdownOptions {
|
|
56
|
+
/**
|
|
57
|
+
* Origin URL for resolving relative image paths and internal links.
|
|
58
|
+
* Important when converting HTML with relative paths from a specific website.
|
|
59
|
+
*/
|
|
60
|
+
origin?: string;
|
|
61
|
+
/**
|
|
62
|
+
* Plugins to extend HTML to Markdown conversion
|
|
63
|
+
*/
|
|
64
|
+
plugins?: Plugin[];
|
|
65
|
+
}
|
|
66
|
+
declare const ELEMENT_NODE = 1;
|
|
67
|
+
declare const TEXT_NODE = 3;
|
|
68
|
+
interface ElementNode extends Node {
|
|
69
|
+
/** Element tag name (for ELEMENT_NODE) */
|
|
70
|
+
name: string;
|
|
71
|
+
/** HTML attributes (for ELEMENT_NODE) */
|
|
72
|
+
attributes: Record<string, string>;
|
|
73
|
+
/** Custom data added by plugins */
|
|
74
|
+
context?: Record<string, any>;
|
|
75
|
+
/** ID of the tag for fast handler lookup */
|
|
76
|
+
tagId?: number;
|
|
77
|
+
/** Map of tag names to their nesting count (using Uint8Array for performance) */
|
|
78
|
+
depthMap: Uint8Array;
|
|
79
|
+
}
|
|
80
|
+
interface TextNode extends Node {
|
|
81
|
+
/** Text content (for TEXT_NODE) */
|
|
82
|
+
value: string;
|
|
83
|
+
/** Custom data added by plugins */
|
|
84
|
+
context?: Record<string, any>;
|
|
85
|
+
}
|
|
86
|
+
/**
|
|
87
|
+
* Base DOM node interface
|
|
88
|
+
* Optimized for streaming HTML parsing with minimal memory footprint
|
|
89
|
+
*/
|
|
90
|
+
interface Node {
|
|
91
|
+
/** Node type (ELEMENT_NODE or TEXT_NODE) */
|
|
92
|
+
type: number;
|
|
93
|
+
/** Current nesting depth in the DOM tree */
|
|
94
|
+
depth: number;
|
|
95
|
+
// Node exclusion and filtering are now handled by plugins.
|
|
96
|
+
/** Index of this node within its parent's children */
|
|
97
|
+
index: number;
|
|
98
|
+
/** Current walk index for child traversal during streaming */
|
|
99
|
+
currentWalkIndex?: number;
|
|
100
|
+
/** Count of text child nodes - used for whitespace handling */
|
|
101
|
+
childTextNodeIndex?: number;
|
|
102
|
+
/** Whether node contains whitespace - used for whitespace optimization */
|
|
103
|
+
containsWhitespace?: boolean;
|
|
104
|
+
/** Cached reference to tag handler for performance */
|
|
105
|
+
tagHandler?: TagHandler;
|
|
106
|
+
/** Parent node */
|
|
107
|
+
parent?: ElementNode | null;
|
|
108
|
+
/** Custom data added by plugins */
|
|
109
|
+
context?: Record<string, any>;
|
|
110
|
+
/** Region ID for buffer region tracking */
|
|
111
|
+
regionId?: number;
|
|
112
|
+
}
|
|
113
|
+
/**
|
|
114
|
+
* Buffer region for tracking content inclusion/exclusion
|
|
115
|
+
*/
|
|
116
|
+
interface BufferRegion {
|
|
117
|
+
/** Unique identifier */
|
|
118
|
+
id: number;
|
|
119
|
+
/** Inclusion state */
|
|
120
|
+
include: boolean;
|
|
121
|
+
}
|
|
122
|
+
/**
|
|
123
|
+
* State interface for HTML parsing and processing
|
|
124
|
+
* Contains parsing state that's maintained during HTML traversal
|
|
125
|
+
*/
|
|
126
|
+
interface MdreamProcessingState {
|
|
127
|
+
/** Map of tag names to their current nesting depth - uses TypedArray for performance */
|
|
128
|
+
depthMap: Uint8Array;
|
|
129
|
+
/** Current overall nesting depth */
|
|
130
|
+
depth: number;
|
|
131
|
+
/** Currently processing element node */
|
|
132
|
+
currentNode?: ElementNode | null;
|
|
133
|
+
// Node filtering and exclusion are now handled by plugins.
|
|
134
|
+
/** Whether current content contains HTML entities that need decoding */
|
|
135
|
+
hasEncodedHtmlEntity?: boolean;
|
|
136
|
+
/** Whether the last processed character was whitespace - for collapsing whitespace */
|
|
137
|
+
lastCharWasWhitespace?: boolean;
|
|
138
|
+
/** Whether the last processed buffer has whitespace - optimization flag */
|
|
139
|
+
textBufferContainsWhitespace?: boolean;
|
|
140
|
+
/** Whether the last processed buffer contains non-whitespace characters */
|
|
141
|
+
textBufferContainsNonWhitespace?: boolean;
|
|
142
|
+
/** Whether a tag was just closed - affects whitespace handling */
|
|
143
|
+
justClosedTag?: boolean;
|
|
144
|
+
/** Whether the next text node is the first in its element - for whitespace trimming */
|
|
145
|
+
isFirstTextInElement?: boolean;
|
|
146
|
+
/** Reference to the last processed text node - for context tracking */
|
|
147
|
+
lastTextNode?: Node;
|
|
148
|
+
/** Plugin instances array for efficient iteration */
|
|
149
|
+
plugins?: Plugin[];
|
|
150
|
+
/** Configuration options for conversion */
|
|
151
|
+
options?: HTMLToMarkdownOptions;
|
|
152
|
+
}
|
|
153
|
+
/**
|
|
154
|
+
* Runtime state for markdown generation
|
|
155
|
+
* Extended state that includes output tracking and options
|
|
156
|
+
*/
|
|
157
|
+
interface MdreamRuntimeState extends Partial<MdreamProcessingState> {
|
|
158
|
+
/** Number of newlines at end of most recent output */
|
|
159
|
+
lastNewLines?: number;
|
|
160
|
+
/** Configuration options for conversion */
|
|
161
|
+
options?: HTMLToMarkdownOptions;
|
|
162
|
+
/** Table processing state - specialized for Markdown tables */
|
|
163
|
+
tableRenderedTable?: boolean;
|
|
164
|
+
tableCurrentRowCells?: number;
|
|
165
|
+
tableColumnAlignments?: string[];
|
|
166
|
+
/** Plugin instances array for efficient iteration */
|
|
167
|
+
plugins?: Plugin[];
|
|
168
|
+
/** Map of region IDs to a boolean toggle for each region */
|
|
169
|
+
regionToggles: Map<number, boolean>;
|
|
170
|
+
/** Content buffers for regions */
|
|
171
|
+
regionContentBuffers: Map<number, string[]>;
|
|
172
|
+
/** Performance cache for last content to avoid iteration */
|
|
173
|
+
lastContentCache?: string;
|
|
174
|
+
/** Reference to the last processed node */
|
|
175
|
+
lastNode?: Node;
|
|
176
|
+
context?: Record<string, any>;
|
|
177
|
+
}
|
|
178
|
+
type NodeEventEnter = 0;
|
|
179
|
+
type NodeEventExit = 1;
|
|
180
|
+
/**
|
|
181
|
+
* Node event for DOM traversal
|
|
182
|
+
* Used in the event-based traversal system for streaming processing
|
|
183
|
+
*/
|
|
184
|
+
interface NodeEvent {
|
|
185
|
+
/** Event type - enter (start tag) or exit (end tag) */
|
|
186
|
+
type: NodeEventEnter | NodeEventExit;
|
|
187
|
+
/** The node being processed */
|
|
188
|
+
node: Node;
|
|
189
|
+
}
|
|
190
|
+
/**
|
|
191
|
+
* Handler context for markdown conversion
|
|
192
|
+
* Passed to tag handler functions for converting specific elements
|
|
193
|
+
*/
|
|
194
|
+
interface HandlerContext {
|
|
195
|
+
/** Current node being processed */
|
|
196
|
+
node: ElementNode;
|
|
197
|
+
/** Parent node (if any) */
|
|
198
|
+
parent?: ElementNode;
|
|
199
|
+
/** Runtime state */
|
|
200
|
+
state: MdreamRuntimeState;
|
|
201
|
+
}
|
|
202
|
+
/**
|
|
203
|
+
* Tag handler interface for HTML elements
|
|
204
|
+
* Used by plugins to extend or customize tag handling
|
|
205
|
+
*/
|
|
206
|
+
interface TagHandler {
|
|
207
|
+
enter?: (context: HandlerContext) => string | undefined | void;
|
|
208
|
+
exit?: (context: HandlerContext) => string | undefined | void;
|
|
209
|
+
isSelfClosing?: boolean;
|
|
210
|
+
isNonNesting?: boolean;
|
|
211
|
+
collapsesInnerWhiteSpace?: boolean;
|
|
212
|
+
isInline?: boolean;
|
|
213
|
+
spacing?: readonly [number, number];
|
|
214
|
+
excludesTextNodes?: boolean;
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
export { ELEMENT_NODE as E, TEXT_NODE as T };
|
|
218
|
+
export type { BufferRegion as B, HTMLToMarkdownOptions as H, MdreamProcessingState as M, Node as N, Plugin as P, PluginCreationOptions as a, ElementNode as b, TextNode as c, MdreamRuntimeState as d, NodeEvent as e, HandlerContext as f, TagHandler as g };
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Plugin interface for extending HTML to Markdown conversion
|
|
3
|
+
*/
|
|
4
|
+
interface Plugin {
|
|
5
|
+
/**
|
|
6
|
+
* Process a node before it's handled by the parser
|
|
7
|
+
*/
|
|
8
|
+
beforeNodeProcess?: (event: NodeEvent, state: MdreamRuntimeState) => undefined | void | {
|
|
9
|
+
skip: boolean;
|
|
10
|
+
};
|
|
11
|
+
/**
|
|
12
|
+
* Hook that runs when entering a node
|
|
13
|
+
* @returns String to add to the output, or undefined when the hook adds nothing
|
|
14
|
+
*/
|
|
15
|
+
onNodeEnter?: (node: ElementNode, state: MdreamRuntimeState) => string | undefined | void;
|
|
16
|
+
/**
|
|
17
|
+
* Hook that runs when exiting a node
|
|
18
|
+
* @param node - The element node being exited
|
|
19
|
+
* @param state - The current runtime state
|
|
20
|
+
* @returns String to add to the output, or undefined when the hook adds nothing
|
|
21
|
+
*/
|
|
22
|
+
onNodeExit?: (node: ElementNode, state: MdreamRuntimeState) => string | undefined | void;
|
|
23
|
+
/**
|
|
24
|
+
* Process attributes for a node
|
|
25
|
+
* @param node - The node to process attributes for
|
|
26
|
+
* @param state - The current runtime state
|
|
27
|
+
*/
|
|
28
|
+
processAttributes?: (node: ElementNode, state: MdreamRuntimeState) => void;
|
|
29
|
+
/**
|
|
30
|
+
* Process a text node before it's added to the output
|
|
31
|
+
* @param node - The text node to process
|
|
32
|
+
* @param state - The current runtime state
|
|
33
|
+
* @returns An object with `content` (string) and `skip` (boolean) fields, or undefined
|
|
34
|
+
*/
|
|
35
|
+
processTextNode?: (node: TextNode, state: MdreamRuntimeState) => undefined | void | {
|
|
36
|
+
content: string;
|
|
37
|
+
skip: boolean;
|
|
38
|
+
};
|
|
39
|
+
}
|
|
40
|
+
/**
|
|
41
|
+
* Plugin creation options for controlling plugin behavior
|
|
42
|
+
*/
|
|
43
|
+
interface PluginCreationOptions {
|
|
44
|
+
/**
|
|
45
|
+
* Order in which plugins are executed
|
|
46
|
+
* Lower numbers run first
|
|
47
|
+
*/
|
|
48
|
+
order?: number;
|
|
49
|
+
/**
|
|
50
|
+
* Priority for region conflict resolution
|
|
51
|
+
* Higher numbers take precedence over lower
|
|
52
|
+
*/
|
|
53
|
+
priority?: number;
|
|
54
|
+
}
|
|
55
|
+
interface HTMLToMarkdownOptions {
|
|
56
|
+
/**
|
|
57
|
+
* Origin URL for resolving relative image paths and internal links.
|
|
58
|
+
* Important when converting HTML with relative paths from a specific website.
|
|
59
|
+
*/
|
|
60
|
+
origin?: string;
|
|
61
|
+
/**
|
|
62
|
+
* Plugins to extend HTML to Markdown conversion
|
|
63
|
+
*/
|
|
64
|
+
plugins?: Plugin[];
|
|
65
|
+
}
|
|
66
|
+
declare const ELEMENT_NODE = 1;
|
|
67
|
+
declare const TEXT_NODE = 3;
|
|
68
|
+
interface ElementNode extends Node {
|
|
69
|
+
/** Element tag name (for ELEMENT_NODE) */
|
|
70
|
+
name: string;
|
|
71
|
+
/** HTML attributes (for ELEMENT_NODE) */
|
|
72
|
+
attributes: Record<string, string>;
|
|
73
|
+
/** Custom data added by plugins */
|
|
74
|
+
context?: Record<string, any>;
|
|
75
|
+
/** ID of the tag for fast handler lookup */
|
|
76
|
+
tagId?: number;
|
|
77
|
+
/** Map of tag names to their nesting count (using Uint8Array for performance) */
|
|
78
|
+
depthMap: Uint8Array;
|
|
79
|
+
}
|
|
80
|
+
interface TextNode extends Node {
|
|
81
|
+
/** Text content (for TEXT_NODE) */
|
|
82
|
+
value: string;
|
|
83
|
+
/** Custom data added by plugins */
|
|
84
|
+
context?: Record<string, any>;
|
|
85
|
+
}
|
|
86
|
+
/**
|
|
87
|
+
* Base DOM node interface
|
|
88
|
+
* Optimized for streaming HTML parsing with minimal memory footprint
|
|
89
|
+
*/
|
|
90
|
+
interface Node {
|
|
91
|
+
/** Node type (ELEMENT_NODE or TEXT_NODE) */
|
|
92
|
+
type: number;
|
|
93
|
+
/** Current nesting depth in the DOM tree */
|
|
94
|
+
depth: number;
|
|
95
|
+
// Node exclusion and filtering are now handled by plugins.
|
|
96
|
+
/** Index of this node within its parent's children */
|
|
97
|
+
index: number;
|
|
98
|
+
/** Current walk index for child traversal during streaming */
|
|
99
|
+
currentWalkIndex?: number;
|
|
100
|
+
/** Count of text child nodes - used for whitespace handling */
|
|
101
|
+
childTextNodeIndex?: number;
|
|
102
|
+
/** Whether node contains whitespace - used for whitespace optimization */
|
|
103
|
+
containsWhitespace?: boolean;
|
|
104
|
+
/** Cached reference to tag handler for performance */
|
|
105
|
+
tagHandler?: TagHandler;
|
|
106
|
+
/** Parent node */
|
|
107
|
+
parent?: ElementNode | null;
|
|
108
|
+
/** Custom data added by plugins */
|
|
109
|
+
context?: Record<string, any>;
|
|
110
|
+
/** Region ID for buffer region tracking */
|
|
111
|
+
regionId?: number;
|
|
112
|
+
}
|
|
113
|
+
/**
|
|
114
|
+
* Buffer region for tracking content inclusion/exclusion
|
|
115
|
+
*/
|
|
116
|
+
interface BufferRegion {
|
|
117
|
+
/** Unique identifier */
|
|
118
|
+
id: number;
|
|
119
|
+
/** Inclusion state */
|
|
120
|
+
include: boolean;
|
|
121
|
+
}
|
|
122
|
+
/**
|
|
123
|
+
* State interface for HTML parsing and processing
|
|
124
|
+
* Contains parsing state that's maintained during HTML traversal
|
|
125
|
+
*/
|
|
126
|
+
interface MdreamProcessingState {
|
|
127
|
+
/** Map of tag names to their current nesting depth - uses TypedArray for performance */
|
|
128
|
+
depthMap: Uint8Array;
|
|
129
|
+
/** Current overall nesting depth */
|
|
130
|
+
depth: number;
|
|
131
|
+
/** Currently processing element node */
|
|
132
|
+
currentNode?: ElementNode | null;
|
|
133
|
+
// Node filtering and exclusion are now handled by plugins.
|
|
134
|
+
/** Whether current content contains HTML entities that need decoding */
|
|
135
|
+
hasEncodedHtmlEntity?: boolean;
|
|
136
|
+
/** Whether the last processed character was whitespace - for collapsing whitespace */
|
|
137
|
+
lastCharWasWhitespace?: boolean;
|
|
138
|
+
/** Whether the last processed buffer has whitespace - optimization flag */
|
|
139
|
+
textBufferContainsWhitespace?: boolean;
|
|
140
|
+
/** Whether the last processed buffer contains non-whitespace characters */
|
|
141
|
+
textBufferContainsNonWhitespace?: boolean;
|
|
142
|
+
/** Whether a tag was just closed - affects whitespace handling */
|
|
143
|
+
justClosedTag?: boolean;
|
|
144
|
+
/** Whether the next text node is the first in its element - for whitespace trimming */
|
|
145
|
+
isFirstTextInElement?: boolean;
|
|
146
|
+
/** Reference to the last processed text node - for context tracking */
|
|
147
|
+
lastTextNode?: Node;
|
|
148
|
+
/** Plugin instances array for efficient iteration */
|
|
149
|
+
plugins?: Plugin[];
|
|
150
|
+
/** Configuration options for conversion */
|
|
151
|
+
options?: HTMLToMarkdownOptions;
|
|
152
|
+
}
|
|
153
|
+
/**
|
|
154
|
+
* Runtime state for markdown generation
|
|
155
|
+
* Extended state that includes output tracking and options
|
|
156
|
+
*/
|
|
157
|
+
interface MdreamRuntimeState extends Partial<MdreamProcessingState> {
|
|
158
|
+
/** Number of newlines at end of most recent output */
|
|
159
|
+
lastNewLines?: number;
|
|
160
|
+
/** Configuration options for conversion */
|
|
161
|
+
options?: HTMLToMarkdownOptions;
|
|
162
|
+
/** Table processing state - specialized for Markdown tables */
|
|
163
|
+
tableRenderedTable?: boolean;
|
|
164
|
+
tableCurrentRowCells?: number;
|
|
165
|
+
tableColumnAlignments?: string[];
|
|
166
|
+
/** Plugin instances array for efficient iteration */
|
|
167
|
+
plugins?: Plugin[];
|
|
168
|
+
/** Map of region IDs to a boolean toggle for each region */
|
|
169
|
+
regionToggles: Map<number, boolean>;
|
|
170
|
+
/** Content buffers for regions */
|
|
171
|
+
regionContentBuffers: Map<number, string[]>;
|
|
172
|
+
/** Performance cache for last content to avoid iteration */
|
|
173
|
+
lastContentCache?: string;
|
|
174
|
+
/** Reference to the last processed node */
|
|
175
|
+
lastNode?: Node;
|
|
176
|
+
context?: Record<string, any>;
|
|
177
|
+
}
|
|
178
|
+
type NodeEventEnter = 0;
|
|
179
|
+
type NodeEventExit = 1;
|
|
180
|
+
/**
|
|
181
|
+
* Node event for DOM traversal
|
|
182
|
+
* Used in the event-based traversal system for streaming processing
|
|
183
|
+
*/
|
|
184
|
+
interface NodeEvent {
|
|
185
|
+
/** Event type - enter (start tag) or exit (end tag) */
|
|
186
|
+
type: NodeEventEnter | NodeEventExit;
|
|
187
|
+
/** The node being processed */
|
|
188
|
+
node: Node;
|
|
189
|
+
}
|
|
190
|
+
/**
|
|
191
|
+
* Handler context for markdown conversion
|
|
192
|
+
* Passed to tag handler functions for converting specific elements
|
|
193
|
+
*/
|
|
194
|
+
interface HandlerContext {
|
|
195
|
+
/** Current node being processed */
|
|
196
|
+
node: ElementNode;
|
|
197
|
+
/** Parent node (if any) */
|
|
198
|
+
parent?: ElementNode;
|
|
199
|
+
/** Runtime state */
|
|
200
|
+
state: MdreamRuntimeState;
|
|
201
|
+
}
|
|
202
|
+
/**
|
|
203
|
+
* Tag handler interface for HTML elements
|
|
204
|
+
* Used by plugins to extend or customize tag handling
|
|
205
|
+
*/
|
|
206
|
+
interface TagHandler {
|
|
207
|
+
enter?: (context: HandlerContext) => string | undefined | void;
|
|
208
|
+
exit?: (context: HandlerContext) => string | undefined | void;
|
|
209
|
+
isSelfClosing?: boolean;
|
|
210
|
+
isNonNesting?: boolean;
|
|
211
|
+
collapsesInnerWhiteSpace?: boolean;
|
|
212
|
+
isInline?: boolean;
|
|
213
|
+
spacing?: readonly [number, number];
|
|
214
|
+
excludesTextNodes?: boolean;
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
export { ELEMENT_NODE as E, TEXT_NODE as T };
|
|
218
|
+
export type { BufferRegion as B, HTMLToMarkdownOptions as H, MdreamProcessingState as M, Node as N, Plugin as P, PluginCreationOptions as a, ElementNode as b, TextNode as c, MdreamRuntimeState as d, NodeEvent as e, HandlerContext as f, TagHandler as g };
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
import { aa as ELEMENT_NODE, b as TAG_HEAD, ab as collectNodeContent, ac as TAG_TITLE, ad as TAG_META } from './mdream.-hdaPj9a.mjs';
|
|
2
|
+
|
|
3
|
+
/**
 * Identity helper for declaring a plugin.
 *
 * @param plugin - The plugin object.
 * @returns The same `plugin` object, unchanged.
 */
function createPlugin(plugin) {
|
|
4
|
+
return plugin;
|
|
5
|
+
}
|
|
6
|
+
|
|
7
|
+
/**
 * Create a plugin that turns `<head>` metadata into a YAML frontmatter block.
 *
 * While inside `<head>`, the plugin records the `<title>` text and every
 * `<meta>` tag whose `property`/`name` is in the allow-list. When `<head>`
 * closes, the collected fields are serialised and passed to
 * `collectNodeContent` for region 0. Nothing is emitted when no field was
 * collected.
 *
 * @param options - Optional settings.
 * @param options.additionalFields - Extra top-level key/value pairs; values are written verbatim.
 * @param options.metaFields - Extra meta names to capture in addition to the built-in list.
 * @param options.formatValue - `(name, value) => string` override used to serialise title and meta values.
 * @returns The plugin object with `onNodeEnter`, `onNodeExit` and `processTextNode` hooks.
 */
function frontmatterPlugin(options = {}) {
  const additionalFields = options.additionalFields || {};
  const metaFields = /* @__PURE__ */ new Set([
    "description",
    "keywords",
    "author",
    "date",
    "og:title",
    "og:description",
    "twitter:title",
    "twitter:description",
    ...options.metaFields || []
  ]);

  // Characters that must be backslash-escaped inside a YAML double-quoted scalar.
  const yamlEscapes = { "\\": "\\\\", '"': '\\"', "\n": "\\n", "\r": "\\r", "\t": "\\t" };

  // Default serialiser: leave simple values as plain scalars and double-quote
  // anything a YAML parser could misread. Escaping happens only when the value
  // is actually quoted, so plain values are never polluted with backslashes.
  const formatValue = options.formatValue || ((name, value) => {
    const needsQuotes = /[\s:#"]/.test(value) || /^[,\[\]{}&*!|>'%@`]/.test(value);
    if (!needsQuotes) {
      return value;
    }
    return `"${value.replace(/[\\"\n\r\t]/g, (ch) => yamlEscapes[ch])}"`;
  });

  // Fresh per-document state; `meta` is always present so hooks can write to it.
  const createFrontmatter = () => ({ ...additionalFields, meta: {} });
  let frontmatter = createFrontmatter();
  let inHead = false;

  /**
   * Serialise the collected fields. `title` sorts first, `description` second,
   * the rest alphabetically. Returns "" when there is nothing to write.
   */
  function generateFrontmatter() {
    const entries = Object.entries(frontmatter).sort(([a], [b]) => {
      if (a === "title")
        return -1;
      if (b === "title")
        return 1;
      if (a === "description")
        return -1;
      if (b === "description")
        return 1;
      return a.localeCompare(b);
    });
    const yamlLines = [];
    for (const [key, value] of entries) {
      if (key !== "meta") {
        yamlLines.push(`${key}: ${value}`);
        continue;
      }
      const metaLines = Object.entries(value)
        .sort(([a], [b]) => a.localeCompare(b))
        .map(([metaKey, metaValue]) => ` ${metaKey}: ${metaValue}`);
      // Omit the `meta:` header entirely when no meta tag was captured.
      if (metaLines.length > 0) {
        yamlLines.push("meta:", ...metaLines);
      }
    }
    if (yamlLines.length === 0) {
      return "";
    }
    return `---\n${yamlLines.join("\n")}\n---\n\n`;
  }

  return createPlugin({
    onNodeEnter(node) {
      if (node.tagId === TAG_HEAD) {
        // Start from a clean slate so a reused plugin instance does not carry
        // the previous document's title/meta into this one.
        frontmatter = createFrontmatter();
        inHead = true;
        return;
      }
      if (!inHead || node.type !== ELEMENT_NODE || node.tagId !== TAG_META) {
        return;
      }
      const { name, property, content } = node.attributes || {};
      const metaName = property || name;
      if (metaName && content && metaFields.has(metaName)) {
        // Keys containing ":" (e.g. "og:title") are quoted to stay valid YAML keys.
        const yamlKey = metaName.includes(":") ? `"${metaName}"` : metaName;
        frontmatter.meta[yamlKey] = formatValue(metaName, content);
      }
      return void 0;
    },
    onNodeExit(node, state) {
      if (node.type === ELEMENT_NODE && node.tagId === TAG_HEAD) {
        inHead = false;
        const frontmatterContent = generateFrontmatter();
        if (frontmatterContent) {
          collectNodeContent({ regionId: 0 }, frontmatterContent, state);
        }
      }
      return void 0;
    },
    processTextNode(node) {
      if (!inHead) {
        return;
      }
      const parent = node.parent;
      if (parent && parent.tagId === TAG_TITLE && node.value) {
        const title = node.value.trim();
        // A whitespace-only node must not overwrite a title captured earlier.
        if (title) {
          frontmatter.title = formatValue("title", title);
        }
        // Title text belongs in the frontmatter, not in the markdown body.
        return { content: "", skip: true };
      }
    }
  });
}
|
|
104
|
+
|
|
105
|
+
export { createPlugin as c, frontmatterPlugin as f };
|
package/package.json
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "mdream",
|
|
3
|
+
"type": "module",
|
|
4
|
+
"version": "0.1.1",
|
|
5
|
+
"description": "Ultra-performant JavaScript HTML to Markdown converter optimized for LLMs.",
|
|
6
|
+
"author": {
|
|
7
|
+
"name": "Harlan Wilton",
|
|
8
|
+
"email": "harlan@harlanzw.com",
|
|
9
|
+
"url": "https://harlanzw.com/"
|
|
10
|
+
},
|
|
11
|
+
"license": "MIT",
|
|
12
|
+
"exports": {
|
|
13
|
+
".": {
|
|
14
|
+
"types": "./dist/index.d.ts",
|
|
15
|
+
"import": "./dist/index.mjs"
|
|
16
|
+
},
|
|
17
|
+
"./cli": "./dist/cli.mjs",
|
|
18
|
+
"./plugins": "./dist/plugins.mjs",
|
|
19
|
+
"./preset/minimal": "./dist/preset/minimal.mjs"
|
|
20
|
+
},
|
|
21
|
+
"main": "./dist/index.mjs",
|
|
22
|
+
"types": "./dist/index.d.ts",
|
|
23
|
+
"bin": "./bin/mdream.mjs",
|
|
24
|
+
"files": [
|
|
25
|
+
"bin",
|
|
26
|
+
"dist"
|
|
27
|
+
],
|
|
28
|
+
"dependencies": {
|
|
29
|
+
"cac": "^6.7.14"
|
|
30
|
+
},
|
|
31
|
+
"devDependencies": {
|
|
32
|
+
"@antfu/eslint-config": "^4.13.2",
|
|
33
|
+
"@types/node": "^22.15.29",
|
|
34
|
+
"bumpp": "^10.1.1",
|
|
35
|
+
"crawlee": "^3.13.5",
|
|
36
|
+
"eslint": "^9.28.0",
|
|
37
|
+
"llm-cost": "^1.0.5",
|
|
38
|
+
"playwright": "^1.52.0",
|
|
39
|
+
"typescript": "5.8.3",
|
|
40
|
+
"unbuild": "^3.5.0",
|
|
41
|
+
"vitest": "^3.1.4"
|
|
42
|
+
},
|
|
43
|
+
"scripts": {
|
|
44
|
+
"flame": "pnpm build && unbuild bench/bundle && clinic flame -- node bench/bundle/dist/string.mjs 10",
|
|
45
|
+
"bench:build": "pnpm build && unbuild bench/bundle",
|
|
46
|
+
"bench:stream": "pnpm build && unbuild bench/bundle && hyperfine --runs 100 'node bench/bundle/dist/stream.mjs' --warmup 3",
|
|
47
|
+
"bench:await": "pnpm build && unbuild bench/bundle && hyperfine --runs 100 'node bench/bundle/dist/await.mjs' --warmup 3",
|
|
48
|
+
"bench:string": "pnpm build && unbuild bench/bundle && hyperfine --runs 100 'node bench/bundle/dist/string.mjs' --warmup 3",
|
|
49
|
+
"ts": "node --experimental-strip-types",
|
|
50
|
+
"test:live": "curl -s https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax | node ./bin/mdream.mjs --origin https://docs.github.com | tee test/github-markdown.md",
|
|
51
|
+
"test:images": "cat test/fixtures/test-origin.html | node ./bin/mdream.mjs --origin https://docs.github.com",
|
|
52
|
+
"test:github:live": "curl -s https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax | node ./bin/mdream.mjs --origin https://docs.github.com | tee test/github-markdown.md",
|
|
53
|
+
"test:github:file": "cat test/fixtures/github-markdown-complete.html | node ./bin/mdream.mjs --origin https://docs.github.com | tee test/github-markdown.md",
|
|
54
|
+
"test:wiki:file": "pnpm build && cat test/fixtures/wikipedia-largest.html | node ./bin/mdream.mjs --origin https://en.wikipedia.org | tee test/wiki-markdown.md",
|
|
55
|
+
"test:wiki-small:file": "cat test/fixtures/wikipedia-small.html | node ./bin/mdream.mjs --origin https://en.wikipedia.org | tee test/wiki-markdown.md",
|
|
56
|
+
"build": "unbuild",
|
|
57
|
+
"typecheck": "tsc --noEmit src/index.ts",
|
|
58
|
+
"dev:prepare": "unbuild --stub",
|
|
59
|
+
"test": "vitest test",
|
|
60
|
+
"release": "pnpm build && bumpp && pnpm -r publish"
|
|
61
|
+
}
|
|
62
|
+
}
|