mdream 0.15.2 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -1
- package/dist/_chunks/{const-Bf_XN9U9.mjs → const.mjs} +1 -42
- package/dist/_chunks/{extraction-BA9MDtq3.mjs → extraction.mjs} +2 -27
- package/dist/_chunks/{markdown-processor-D26Uo5td.mjs → markdown-processor.mjs} +22 -104
- package/dist/_chunks/{plugin-D5soyEXm.d.mts → plugin.d.mts} +1 -2
- package/dist/_chunks/plugin.mjs +4 -0
- package/dist/_chunks/{src-BJpipdul.mjs → stream.mjs} +2 -20
- package/dist/_chunks/{plugins-DJnqR2fA.mjs → tailwind.mjs} +15 -275
- package/dist/_chunks/{types-CT4ZxeOH.d.mts → types.d.mts} +6 -22
- package/dist/cli.mjs +5 -9
- package/dist/iife.js +8 -8
- package/dist/index.d.mts +5 -8
- package/dist/index.mjs +10 -6
- package/dist/llms-txt.mjs +383 -5
- package/dist/plugins.d.mts +5 -16
- package/dist/plugins.mjs +4 -5
- package/dist/preset/minimal.d.mts +1 -2
- package/dist/preset/minimal.mjs +30 -4
- package/dist/splitter.d.mts +1 -2
- package/dist/splitter.mjs +7 -24
- package/package.json +1 -1
- package/dist/_chunks/llms-txt-Czb_M48B.mjs +0 -440
- package/dist/_chunks/minimal-BiDhcwif.mjs +0 -40
- package/dist/_chunks/plugin-CjWWQTuL.mjs +0 -12
package/README.md
CHANGED
|
@@ -201,7 +201,6 @@ Mdream includes several built-in plugins that can be used individually or combin
|
|
|
201
201
|
- **[`frontmatterPlugin`](./src/plugins/frontmatter.ts)**: Generate YAML frontmatter from HTML head elements (title, meta tags)
|
|
202
202
|
- **[`isolateMainPlugin`](./src/plugins/isolate-main.ts)**: Isolate main content using `<main>` elements or header-to-footer boundaries
|
|
203
203
|
- **[`tailwindPlugin`](./src/plugins/tailwind.ts)**: Convert Tailwind CSS classes to Markdown formatting (bold, italic, etc.)
|
|
204
|
-
- **[`readabilityPlugin`](./src/plugins/readability.ts)**: Content scoring and extraction (experimental)
|
|
205
204
|
|
|
206
205
|
```ts
|
|
207
206
|
import { filterPlugin, frontmatterPlugin, isolateMainPlugin } from 'mdream/plugins'
|
|
@@ -215,6 +214,26 @@ const markdown = htmlToMarkdown(html, {
|
|
|
215
214
|
})
|
|
216
215
|
```
|
|
217
216
|
|
|
217
|
+
### Content Extraction with Readability
|
|
218
|
+
|
|
219
|
+
For advanced content extraction (article detection, boilerplate removal), use [@mozilla/readability](https://github.com/mozilla/readability) before mdream:
|
|
220
|
+
|
|
221
|
+
```ts
|
|
222
|
+
import { Readability } from '@mozilla/readability'
|
|
223
|
+
import { JSDOM } from 'jsdom'
|
|
224
|
+
import { htmlToMarkdown } from 'mdream'
|
|
225
|
+
|
|
226
|
+
const dom = new JSDOM(html, { url: 'https://example.com' })
|
|
227
|
+
const article = new Readability(dom.window.document).parse()
|
|
228
|
+
|
|
229
|
+
if (article) {
|
|
230
|
+
const markdown = htmlToMarkdown(article.content)
|
|
231
|
+
// article.title, article.excerpt, article.byline also available
|
|
232
|
+
}
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
This pipeline gives you battle-tested content extraction + fast markdown conversion.
|
|
236
|
+
|
|
218
237
|
### Plugin Hooks
|
|
219
238
|
|
|
220
239
|
- `beforeNodeProcess`: Called before any node processing, can skip nodes
|
|
@@ -1,42 +1,3 @@
|
|
|
1
|
-
//#region src/buffer-region.ts
|
|
2
|
-
/**
|
|
3
|
-
* Creates a new buffer region
|
|
4
|
-
* Returns null if node already has a region assigned
|
|
5
|
-
*/
|
|
6
|
-
function createBufferRegion(node, state, include) {
|
|
7
|
-
if (node.regionId) return null;
|
|
8
|
-
const id = state.regionToggles.size + 1;
|
|
9
|
-
node.regionId = id;
|
|
10
|
-
state.regionToggles.set(id, include);
|
|
11
|
-
state.regionContentBuffers.set(id, []);
|
|
12
|
-
return id;
|
|
13
|
-
}
|
|
14
|
-
/**
|
|
15
|
-
* Collects content for a node into appropriate buffer (optimized)
|
|
16
|
-
*/
|
|
17
|
-
function collectNodeContent(node, content, state) {
|
|
18
|
-
if (!content) return;
|
|
19
|
-
const regionId = node.regionId || 0;
|
|
20
|
-
const targetBuffer = state.regionContentBuffers.get(regionId);
|
|
21
|
-
if (targetBuffer) {
|
|
22
|
-
targetBuffer.push(content);
|
|
23
|
-
state.lastContentCache = content;
|
|
24
|
-
}
|
|
25
|
-
}
|
|
26
|
-
/**
|
|
27
|
-
* Assembles final content from buffer regions and clears them after use
|
|
28
|
-
* Ensures frontmatter (regionId -1) appears first, followed by other included regions
|
|
29
|
-
*/
|
|
30
|
-
function assembleBufferedContent(state) {
|
|
31
|
-
const fragments = [];
|
|
32
|
-
for (const [regionId, content] of Array.from(state.regionContentBuffers.entries())) if (state.regionToggles.get(regionId)) fragments.push(...content);
|
|
33
|
-
state.regionToggles.clear();
|
|
34
|
-
state.regionContentBuffers.clear();
|
|
35
|
-
return fragments.join("").trimStart();
|
|
36
|
-
}
|
|
37
|
-
|
|
38
|
-
//#endregion
|
|
39
|
-
//#region src/const.ts
|
|
40
1
|
const TAG_HTML = 0;
|
|
41
2
|
const TAG_HEAD = 1;
|
|
42
3
|
const TAG_DETAILS = 2;
|
|
@@ -280,6 +241,4 @@ const DEFAULT_BLOCK_SPACING = [2, 2];
|
|
|
280
241
|
const BLOCKQUOTE_SPACING = [1, 1];
|
|
281
242
|
const LIST_ITEM_SPACING = [1, 0];
|
|
282
243
|
const TABLE_ROW_SPACING = [0, 1];
|
|
283
|
-
|
|
284
|
-
//#endregion
|
|
285
|
-
export { TAG_H2 as $, TAG_TBODY as $t, TAG_BUTTON as A, TAG_P as At, TAG_DFN as B, TAG_SCRIPT as Bt, TAG_AUDIO as C, TAG_METER as Ct, TAG_BLOCKQUOTE as D, TAG_OBJECT as Dt, TAG_BDO as E, TAG_NOSCRIPT as Et, TAG_CODE as F, TAG_Q as Ft, TAG_EM as G, TAG_SPAN as Gt, TAG_DIV as H, TAG_SELECT as Ht, TAG_COL as I, TAG_RP as It, TAG_FIGCAPTION as J, TAG_SUB as Jt, TAG_EMBED as K, TAG_STRONG as Kt, TAG_DD as L, TAG_RT as Lt, TAG_CAPTION as M, TAG_PLAINTEXT as Mt, TAG_CENTER as N, TAG_PRE as Nt, TAG_BODY as O, TAG_OL as Ot, TAG_CITE as P, TAG_PROGRESS as Pt, TAG_H1 as Q, TAG_TABLE as Qt, TAG_DEL as R, TAG_RUBY as Rt, TAG_ASIDE as S, TAG_META as St, TAG_BASE as T, TAG_NOFRAMES as Tt, TAG_DL as U, TAG_SMALL as Ut, TAG_DIALOG as V, TAG_SECTION as Vt, TAG_DT as W, TAG_SOURCE as Wt, TAG_FOOTER as X, TAG_SUP as Xt, TAG_FIGURE as Y, TAG_SUMMARY as Yt, TAG_FORM as Z, TAG_SVG as Zt, TAG_A as _, TagIdMap as _n, TAG_LI as _t, LIST_ITEM_SPACING as a, TAG_THEAD as an, TAG_HEADER as at, TAG_AREA as b, createBufferRegion as bn, TAG_MAP as bt, MARKDOWN_HORIZONTAL_RULE as c, TAG_TR as cn, TAG_I as ct, MARKDOWN_STRONG as d, TAG_UL as dn, TAG_INPUT as dt, TAG_TD as en, TAG_H3 as et, MAX_TAG_ID as f, TAG_VAR as fn, TAG_INS as ft, TABLE_ROW_SPACING as g, TEXT_NODE as gn, TAG_LEGEND as gt, NodeEventExit as h, TAG_XMP as hn, TAG_LABEL as ht, HTML_ENTITIES as i, TAG_TH as in, TAG_HEAD as it, TAG_CANVAS as j, TAG_PARAM as jt, TAG_BR as k, TAG_OPTION as kt, MARKDOWN_INLINE_CODE as l, TAG_TRACK as ln, TAG_IFRAME as lt, NodeEventEnter as m, TAG_WBR as mn, TAG_KEYGEN as mt, DEFAULT_BLOCK_SPACING as n, TAG_TEXTAREA as nn, TAG_H5 as nt, MARKDOWN_CODE_BLOCK as o, TAG_TIME as on, TAG_HR as ot, NO_SPACING as p, TAG_VIDEO as pn, TAG_KBD as pt, TAG_FIELDSET as q, TAG_STYLE as qt, ELEMENT_NODE as r, TAG_TFOOT as rn, TAG_H6 as rt, MARKDOWN_EMPHASIS as s, TAG_TITLE as sn, TAG_HTML as st, BLOCKQUOTE_SPACING as t, TAG_TEMPLATE as tn, TAG_H4 as tt, MARKDOWN_STRIKETHROUGH as u, TAG_U as un, TAG_IMG as ut, TAG_ABBR as v, assembleBufferedContent as vn, TAG_LINK as vt, TAG_B as w, TAG_NAV as wt, TAG_ARTICLE as x, TAG_MARK as xt, TAG_ADDRESS as y, collectNodeContent as yn, TAG_MAIN as yt, TAG_DETAILS as z, TAG_SAMP as zt };
|
|
244
|
+
export { TAG_H5 as $, TAG_TH as $t, TAG_CANVAS as A, TAG_PROGRESS as At, TAG_DIV as B, TAG_SPAN as Bt, TAG_B as C, TAG_OBJECT as Ct, TAG_BODY as D, TAG_PARAM as Dt, TAG_BLOCKQUOTE as E, TAG_P as Et, TAG_DD as F, TAG_SAMP as Ft, TAG_FIELDSET as G, TAG_SUP as Gt, TAG_DT as H, TAG_STYLE as Ht, TAG_DEL as I, TAG_SCRIPT as It, TAG_FORM as J, TAG_TBODY as Jt, TAG_FIGURE as K, TAG_SVG as Kt, TAG_DETAILS as L, TAG_SELECT as Lt, TAG_CITE as M, TAG_RP as Mt, TAG_CODE as N, TAG_RT as Nt, TAG_BR as O, TAG_PLAINTEXT as Ot, TAG_COL as P, TAG_RUBY as Pt, TAG_H4 as Q, TAG_TFOOT as Qt, TAG_DFN as R, TAG_SMALL as Rt, TAG_AUDIO as S, TAG_NOSCRIPT as St, TAG_BDO as T, TAG_OPTION as Tt, TAG_EM as U, TAG_SUB as Ut, TAG_DL as V, TAG_STRONG as Vt, TAG_EMBED as W, TAG_SUMMARY as Wt, TAG_H2 as X, TAG_TEMPLATE as Xt, TAG_H1 as Y, TAG_TD as Yt, TAG_H3 as Z, TAG_TEXTAREA as Zt, TAG_A as _, TAG_MARK as _t, LIST_ITEM_SPACING as a, TAG_U as an, TAG_IFRAME as at, TAG_AREA as b, TAG_NAV as bt, MARKDOWN_HORIZONTAL_RULE as c, TAG_VIDEO as cn, TAG_INS as ct, MARKDOWN_STRONG as d, TEXT_NODE as dn, TAG_LABEL as dt, TAG_THEAD as en, TAG_H6 as et, MAX_TAG_ID as f, TagIdMap as fn, TAG_LEGEND as ft, TABLE_ROW_SPACING as g, TAG_MAP as gt, NodeEventExit as h, TAG_MAIN as ht, HTML_ENTITIES as i, TAG_TRACK as in, TAG_I as it, TAG_CENTER as j, TAG_Q as jt, TAG_BUTTON as k, TAG_PRE as kt, MARKDOWN_INLINE_CODE as l, TAG_WBR as ln, TAG_KBD as lt, NodeEventEnter as m, TAG_LINK as mt, DEFAULT_BLOCK_SPACING as n, TAG_TITLE as nn, TAG_HEADER as nt, MARKDOWN_CODE_BLOCK as o, TAG_UL as on, TAG_IMG as ot, NO_SPACING as p, TAG_LI as pt, TAG_FOOTER as q, TAG_TABLE as qt, ELEMENT_NODE as r, TAG_TR as rn, TAG_HR as rt, MARKDOWN_EMPHASIS as s, TAG_VAR as sn, TAG_INPUT as st, BLOCKQUOTE_SPACING as t, TAG_TIME as tn, TAG_HEAD as tt, MARKDOWN_STRIKETHROUGH as u, TAG_XMP as un, TAG_KEYGEN as ut, TAG_ABBR as v, TAG_META as vt, TAG_BASE as w, TAG_OL as wt, TAG_ASIDE as x, TAG_NOFRAMES as xt, TAG_ADDRESS as y, TAG_METER as yt, TAG_DIALOG as z, TAG_SOURCE as zt };
|
|
@@ -1,18 +1,10 @@
|
|
|
1
|
-
import { t as createPlugin } from "./plugin
|
|
2
|
-
|
|
3
|
-
//#region src/libs/query-selector.ts
|
|
4
|
-
/**
|
|
5
|
-
* Creates a tag selector matcher (e.g., 'div', 'p', 'h1')
|
|
6
|
-
*/
|
|
1
|
+
import { t as createPlugin } from "./plugin.mjs";
|
|
7
2
|
function createTagSelector(tagName) {
|
|
8
3
|
return {
|
|
9
4
|
matches: (element) => element.name === tagName,
|
|
10
5
|
toString: () => tagName
|
|
11
6
|
};
|
|
12
7
|
}
|
|
13
|
-
/**
|
|
14
|
-
* Creates an ID selector matcher (e.g., '#main', '#content')
|
|
15
|
-
*/
|
|
16
8
|
function createIdSelector(selector) {
|
|
17
9
|
const id = selector.slice(1);
|
|
18
10
|
return {
|
|
@@ -20,9 +12,6 @@ function createIdSelector(selector) {
|
|
|
20
12
|
toString: () => `#${id}`
|
|
21
13
|
};
|
|
22
14
|
}
|
|
23
|
-
/**
|
|
24
|
-
* Creates a class selector matcher (e.g., '.container', '.header')
|
|
25
|
-
*/
|
|
26
15
|
function createClassSelector(selector) {
|
|
27
16
|
const className = selector.slice(1);
|
|
28
17
|
return {
|
|
@@ -33,9 +22,6 @@ function createClassSelector(selector) {
|
|
|
33
22
|
toString: () => `.${className}`
|
|
34
23
|
};
|
|
35
24
|
}
|
|
36
|
-
/**
|
|
37
|
-
* Creates an attribute selector matcher (e.g., '[data-id]', '[href="https://example.com"]')
|
|
38
|
-
*/
|
|
39
25
|
function createAttributeSelector(selector) {
|
|
40
26
|
const match = selector.match(/\[([^\]=~|^$*]+)(?:([=~|^$*]+)["']?([^"'\]]+)["']?)?\]/);
|
|
41
27
|
const attrName = match ? match[1] : selector.slice(1, -1);
|
|
@@ -62,18 +48,12 @@ function createAttributeSelector(selector) {
|
|
|
62
48
|
}
|
|
63
49
|
};
|
|
64
50
|
}
|
|
65
|
-
/**
|
|
66
|
-
* Creates a compound selector that combines multiple selectors (e.g., 'div.container', 'h1#title')
|
|
67
|
-
*/
|
|
68
51
|
function createCompoundSelector(selectors) {
|
|
69
52
|
return {
|
|
70
53
|
matches: (element) => selectors.every((selector) => selector.matches(element)),
|
|
71
54
|
toString: () => selectors.map((s) => s.toString()).join("")
|
|
72
55
|
};
|
|
73
56
|
}
|
|
74
|
-
/**
|
|
75
|
-
* Parses a CSS selector into a matcher
|
|
76
|
-
*/
|
|
77
57
|
function parseSelector(selector) {
|
|
78
58
|
selector = selector.trim();
|
|
79
59
|
if (!selector) throw new Error("Empty selector");
|
|
@@ -100,9 +80,6 @@ function parseSelector(selector) {
|
|
|
100
80
|
if (selectorParts.length === 1) return selectorParts[0];
|
|
101
81
|
return createCompoundSelector(selectorParts);
|
|
102
82
|
}
|
|
103
|
-
|
|
104
|
-
//#endregion
|
|
105
|
-
//#region src/plugins/extraction.ts
|
|
106
83
|
function extractionPlugin(selectors) {
|
|
107
84
|
const matcherCallbacks = Object.entries(selectors).map(([selector, callback]) => ({
|
|
108
85
|
matcher: parseSelector(selector),
|
|
@@ -139,6 +116,4 @@ function extractionPlugin(selectors) {
|
|
|
139
116
|
}
|
|
140
117
|
});
|
|
141
118
|
}
|
|
142
|
-
|
|
143
|
-
//#endregion
|
|
144
|
-
export { parseSelector as n, extractionPlugin as t };
|
|
119
|
+
export { parseSelector as n, extractionPlugin as t };
|
|
@@ -1,6 +1,4 @@
|
|
|
1
|
-
import { $ as
|
|
2
|
-
|
|
3
|
-
//#region src/tags.ts
|
|
1
|
+
import { $ as TAG_H5, $t as TAG_TH, A as TAG_CANVAS, At as TAG_PROGRESS, B as TAG_DIV, Bt as TAG_SPAN, C as TAG_B, D as TAG_BODY, Dt as TAG_PARAM, E as TAG_BLOCKQUOTE, Et as TAG_P, F as TAG_DD, Ft as TAG_SAMP, G as TAG_FIELDSET, Gt as TAG_SUP, H as TAG_DT, Ht as TAG_STYLE, I as TAG_DEL, It as TAG_SCRIPT, J as TAG_FORM, Jt as TAG_TBODY, Kt as TAG_SVG, L as TAG_DETAILS, Lt as TAG_SELECT, M as TAG_CITE, Mt as TAG_RP, N as TAG_CODE, Nt as TAG_RT, O as TAG_BR, Ot as TAG_PLAINTEXT, P as TAG_COL, Pt as TAG_RUBY, Q as TAG_H4, Qt as TAG_TFOOT, R as TAG_DFN, Rt as TAG_SMALL, S as TAG_AUDIO, St as TAG_NOSCRIPT, T as TAG_BDO, Tt as TAG_OPTION, U as TAG_EM, Ut as TAG_SUB, V as TAG_DL, Vt as TAG_STRONG, W as TAG_EMBED, Wt as TAG_SUMMARY, X as TAG_H2, Xt as TAG_TEMPLATE, Y as TAG_H1, Yt as TAG_TD, Z as TAG_H3, Zt as TAG_TEXTAREA, _ as TAG_A, _t as TAG_MARK, a as LIST_ITEM_SPACING, an as TAG_U, at as TAG_IFRAME, b as TAG_AREA, bt as TAG_NAV, c as MARKDOWN_HORIZONTAL_RULE, cn as TAG_VIDEO, ct as TAG_INS, d as MARKDOWN_STRONG, dn as TEXT_NODE, dt as TAG_LABEL, en as TAG_THEAD, et as TAG_H6, f as MAX_TAG_ID, fn as TagIdMap, ft as TAG_LEGEND, g as TABLE_ROW_SPACING, gt as TAG_MAP, h as NodeEventExit, i as HTML_ENTITIES, in as TAG_TRACK, it as TAG_I, j as TAG_CENTER, jt as TAG_Q, k as TAG_BUTTON, kt as TAG_PRE, l as MARKDOWN_INLINE_CODE, ln as TAG_WBR, lt as TAG_KBD, m as NodeEventEnter, mt as TAG_LINK, n as DEFAULT_BLOCK_SPACING, nn as TAG_TITLE, o as MARKDOWN_CODE_BLOCK, on as TAG_UL, ot as TAG_IMG, p as NO_SPACING, pt as TAG_LI, q as TAG_FOOTER, qt as TAG_TABLE, r as ELEMENT_NODE, rn as TAG_TR, rt as TAG_HR, s as MARKDOWN_EMPHASIS, sn as TAG_VAR, st as TAG_INPUT, t as BLOCKQUOTE_SPACING, tn as TAG_TIME, tt as TAG_HEAD, u as MARKDOWN_STRIKETHROUGH, un as TAG_XMP, ut as TAG_KEYGEN, v as TAG_ABBR, vt as TAG_META, w as TAG_BASE, wt as TAG_OL, x as TAG_ASIDE, xt as TAG_NOFRAMES, y as TAG_ADDRESS, yt as TAG_METER, z as TAG_DIALOG, zt as TAG_SOURCE } from "./const.mjs";
|
|
4
2
|
function resolveUrl(url, origin) {
|
|
5
3
|
if (!url) return url;
|
|
6
4
|
if (url.startsWith("//")) return `https:${url}`;
|
|
@@ -529,12 +527,6 @@ const tagHandlers = {
|
|
|
529
527
|
spacing: [0, 1]
|
|
530
528
|
}
|
|
531
529
|
};
|
|
532
|
-
|
|
533
|
-
//#endregion
|
|
534
|
-
//#region src/utils.ts
|
|
535
|
-
/**
|
|
536
|
-
* Decode HTML entities - optimized version with single pass
|
|
537
|
-
*/
|
|
538
530
|
function decodeHTMLEntities(text) {
|
|
539
531
|
let result = "";
|
|
540
532
|
let i = 0;
|
|
@@ -585,9 +577,6 @@ function traverseUpToFirstBlockNode(node) {
|
|
|
585
577
|
}
|
|
586
578
|
return parentsToIncrement;
|
|
587
579
|
}
|
|
588
|
-
|
|
589
|
-
//#endregion
|
|
590
|
-
//#region src/parse.ts
|
|
591
580
|
const LT_CHAR = 60;
|
|
592
581
|
const GT_CHAR = 62;
|
|
593
582
|
const SLASH_CHAR = 47;
|
|
@@ -610,16 +599,9 @@ const EMPTY_ATTRIBUTES = Object.freeze({});
|
|
|
610
599
|
function copyDepthMap(depthMap) {
|
|
611
600
|
return new Uint8Array(depthMap);
|
|
612
601
|
}
|
|
613
|
-
/**
|
|
614
|
-
* Fast whitespace check using direct character code comparison
|
|
615
|
-
*/
|
|
616
602
|
function isWhitespace(charCode) {
|
|
617
603
|
return charCode === SPACE_CHAR || charCode === TAB_CHAR || charCode === NEWLINE_CHAR || charCode === CARRIAGE_RETURN_CHAR;
|
|
618
604
|
}
|
|
619
|
-
/**
|
|
620
|
-
* Pure HTML parser that emits DOM events
|
|
621
|
-
* Completely decoupled from markdown generation
|
|
622
|
-
*/
|
|
623
605
|
function parseHtml(html, options = {}) {
|
|
624
606
|
const events = [];
|
|
625
607
|
return {
|
|
@@ -633,15 +615,9 @@ function parseHtml(html, options = {}) {
|
|
|
633
615
|
})
|
|
634
616
|
};
|
|
635
617
|
}
|
|
636
|
-
/**
|
|
637
|
-
* Streaming HTML parser - calls onEvent for each DOM event
|
|
638
|
-
*/
|
|
639
618
|
function parseHtmlStream(html, state, onEvent) {
|
|
640
619
|
return parseHtmlInternal(html, state, onEvent);
|
|
641
620
|
}
|
|
642
|
-
/**
|
|
643
|
-
* Internal parsing function - extracted from original parseHTML
|
|
644
|
-
*/
|
|
645
621
|
function parseHtmlInternal(htmlChunk, state, handleEvent) {
|
|
646
622
|
let textBuffer = "";
|
|
647
623
|
state.depthMap ??= new Uint8Array(MAX_TAG_ID);
|
|
@@ -772,9 +748,6 @@ function parseHtmlInternal(htmlChunk, state, handleEvent) {
|
|
|
772
748
|
}
|
|
773
749
|
return textBuffer;
|
|
774
750
|
}
|
|
775
|
-
/**
|
|
776
|
-
* Process accumulated text buffer and create text node event
|
|
777
|
-
*/
|
|
778
751
|
function processTextBuffer(textBuffer, state, handleEvent) {
|
|
779
752
|
const containsNonWhitespace = state.textBufferContainsNonWhitespace;
|
|
780
753
|
const containsWhitespace = state.textBufferContainsWhitespace;
|
|
@@ -801,7 +774,6 @@ function processTextBuffer(textBuffer, state, handleEvent) {
|
|
|
801
774
|
type: TEXT_NODE,
|
|
802
775
|
value: text,
|
|
803
776
|
parent: state.currentNode,
|
|
804
|
-
regionId: state.currentNode?.regionId,
|
|
805
777
|
index: state.currentNode.currentWalkIndex++,
|
|
806
778
|
depth: state.depth,
|
|
807
779
|
containsWhitespace,
|
|
@@ -814,9 +786,6 @@ function processTextBuffer(textBuffer, state, handleEvent) {
|
|
|
814
786
|
});
|
|
815
787
|
state.lastTextNode = textNode;
|
|
816
788
|
}
|
|
817
|
-
/**
|
|
818
|
-
* Process HTML closing tag
|
|
819
|
-
*/
|
|
820
789
|
function processClosingTag(htmlChunk, position, state, handleEvent) {
|
|
821
790
|
let i = position + 2;
|
|
822
791
|
const tagNameStart = i;
|
|
@@ -857,9 +826,6 @@ function processClosingTag(htmlChunk, position, state, handleEvent) {
|
|
|
857
826
|
remainingText: ""
|
|
858
827
|
};
|
|
859
828
|
}
|
|
860
|
-
/**
|
|
861
|
-
* Close a node and emit exit event
|
|
862
|
-
*/
|
|
863
829
|
function closeNode(node, state, handleEvent) {
|
|
864
830
|
if (!node) return;
|
|
865
831
|
if (node.tagId === TAG_A && !node.childTextNodeIndex) {
|
|
@@ -895,9 +861,6 @@ function closeNode(node, state, handleEvent) {
|
|
|
895
861
|
state.hasEncodedHtmlEntity = false;
|
|
896
862
|
state.justClosedTag = true;
|
|
897
863
|
}
|
|
898
|
-
/**
|
|
899
|
-
* Process HTML comment or doctype
|
|
900
|
-
*/
|
|
901
864
|
function processCommentOrDoctype(htmlChunk, position) {
|
|
902
865
|
let i = position;
|
|
903
866
|
const chunkLength = htmlChunk.length;
|
|
@@ -939,9 +902,6 @@ function processCommentOrDoctype(htmlChunk, position) {
|
|
|
939
902
|
};
|
|
940
903
|
}
|
|
941
904
|
}
|
|
942
|
-
/**
|
|
943
|
-
* Process HTML opening tag
|
|
944
|
-
*/
|
|
945
905
|
function processOpeningTag(tagName, tagId, htmlChunk, i, state, handleEvent) {
|
|
946
906
|
if (state.currentNode?.tagHandler?.isNonNesting) closeNode(state.currentNode, state, handleEvent);
|
|
947
907
|
const tagHandler = tagHandlers[tagId];
|
|
@@ -966,7 +926,6 @@ function processOpeningTag(tagName, tagId, htmlChunk, i, state, handleEvent) {
|
|
|
966
926
|
depthMap: copyDepthMap(state.depthMap),
|
|
967
927
|
depth: state.depth,
|
|
968
928
|
index: currentWalkIndex,
|
|
969
|
-
regionId: state.currentNode?.regionId,
|
|
970
929
|
tagId,
|
|
971
930
|
tagHandler
|
|
972
931
|
};
|
|
@@ -996,9 +955,6 @@ function processOpeningTag(tagName, tagId, htmlChunk, i, state, handleEvent) {
|
|
|
996
955
|
selfClosing: result.selfClosing
|
|
997
956
|
};
|
|
998
957
|
}
|
|
999
|
-
/**
|
|
1000
|
-
* Extract and process HTML tag attributes
|
|
1001
|
-
*/
|
|
1002
958
|
function processTagAttributes(htmlChunk, position, tagHandler) {
|
|
1003
959
|
let i = position;
|
|
1004
960
|
const chunkLength = htmlChunk.length;
|
|
@@ -1046,9 +1002,6 @@ function processTagAttributes(htmlChunk, position, tagHandler) {
|
|
|
1046
1002
|
attrBuffer: htmlChunk.substring(attrStartPos, i)
|
|
1047
1003
|
};
|
|
1048
1004
|
}
|
|
1049
|
-
/**
|
|
1050
|
-
* Parse HTML attributes string into key-value object
|
|
1051
|
-
*/
|
|
1052
1005
|
function parseAttributes(attrStr) {
|
|
1053
1006
|
if (!attrStr) return EMPTY_ATTRIBUTES;
|
|
1054
1007
|
const result = {};
|
|
@@ -1128,19 +1081,6 @@ function parseAttributes(attrStr) {
|
|
|
1128
1081
|
}
|
|
1129
1082
|
return result;
|
|
1130
1083
|
}
|
|
1131
|
-
|
|
1132
|
-
//#endregion
|
|
1133
|
-
//#region src/plugin-processor.ts
|
|
1134
|
-
/**
|
|
1135
|
-
* Processes plugins for a given node event
|
|
1136
|
-
* Shared logic between markdown-processor.ts and stream.ts
|
|
1137
|
-
*
|
|
1138
|
-
* @param event - The node event to process
|
|
1139
|
-
* @param plugins - Array of plugins to apply
|
|
1140
|
-
* @param state - The current runtime state
|
|
1141
|
-
* @param processEvent - Callback to process the event after plugin processing
|
|
1142
|
-
* @returns true if the event should be skipped, false to continue processing
|
|
1143
|
-
*/
|
|
1144
1084
|
function processPluginsForEvent(event, plugins, state, processEvent) {
|
|
1145
1085
|
if (plugins?.length) {
|
|
1146
1086
|
for (const plugin of plugins) {
|
|
@@ -1173,12 +1113,6 @@ function processPluginsForEvent(event, plugins, state, processEvent) {
|
|
|
1173
1113
|
processEvent(event);
|
|
1174
1114
|
return false;
|
|
1175
1115
|
}
|
|
1176
|
-
|
|
1177
|
-
//#endregion
|
|
1178
|
-
//#region src/markdown-processor.ts
|
|
1179
|
-
/**
|
|
1180
|
-
* Determines if spacing is needed between two characters
|
|
1181
|
-
*/
|
|
1182
1116
|
function needsSpacing(lastChar, firstChar, state) {
|
|
1183
1117
|
if (lastChar === " " || lastChar === "\n" || lastChar === " ") return false;
|
|
1184
1118
|
if (firstChar === " " || firstChar === "\n" || firstChar === " ") return false;
|
|
@@ -1208,15 +1142,9 @@ function needsSpacing(lastChar, firstChar, state) {
|
|
|
1208
1142
|
if (noSpaceAfter.has(lastChar) || noSpaceBefore.has(firstChar)) return false;
|
|
1209
1143
|
return true;
|
|
1210
1144
|
}
|
|
1211
|
-
/**
|
|
1212
|
-
* Determines if spacing should be added before text content
|
|
1213
|
-
*/
|
|
1214
1145
|
function shouldAddSpacingBeforeText(lastChar, lastNode, textNode) {
|
|
1215
1146
|
return !!lastChar && lastChar !== "\n" && lastChar !== " " && lastChar !== "[" && lastChar !== ">" && !lastNode?.tagHandler?.isInline && textNode.value[0] !== " ";
|
|
1216
1147
|
}
|
|
1217
|
-
/**
|
|
1218
|
-
* Calculate newline configuration based on tag handler spacing config
|
|
1219
|
-
*/
|
|
1220
1148
|
function calculateNewLineConfig(node) {
|
|
1221
1149
|
const tagId = node.tagId;
|
|
1222
1150
|
const depthMap = node.depthMap;
|
|
@@ -1236,28 +1164,19 @@ function calculateNewLineConfig(node) {
|
|
|
1236
1164
|
if (node.tagHandler?.spacing) return node.tagHandler?.spacing;
|
|
1237
1165
|
return DEFAULT_BLOCK_SPACING;
|
|
1238
1166
|
}
|
|
1239
|
-
/**
|
|
1240
|
-
* Creates a markdown processor that consumes DOM events and generates markdown
|
|
1241
|
-
*/
|
|
1242
1167
|
function createMarkdownProcessor(options = {}) {
|
|
1243
1168
|
const state = {
|
|
1244
1169
|
options,
|
|
1245
|
-
|
|
1246
|
-
regionContentBuffers: /* @__PURE__ */ new Map(),
|
|
1170
|
+
buffer: [],
|
|
1247
1171
|
depthMap: new Uint8Array(MAX_TAG_ID)
|
|
1248
1172
|
};
|
|
1249
|
-
state.regionToggles.set(0, true);
|
|
1250
|
-
state.regionContentBuffers.set(0, []);
|
|
1251
1173
|
let lastYieldedLength = 0;
|
|
1252
|
-
/**
|
|
1253
|
-
* Process a DOM event and generate markdown
|
|
1254
|
-
*/
|
|
1255
1174
|
function processEvent(event) {
|
|
1256
1175
|
const { type: eventType, node } = event;
|
|
1257
1176
|
const lastNode = state.lastNode;
|
|
1258
1177
|
state.lastNode = event.node;
|
|
1259
1178
|
state.depth = node.depth;
|
|
1260
|
-
const buff = state.
|
|
1179
|
+
const buff = state.buffer;
|
|
1261
1180
|
const lastBuffEntry = buff[buff.length - 1];
|
|
1262
1181
|
const lastChar = lastBuffEntry?.charAt(lastBuffEntry.length - 1) || "";
|
|
1263
1182
|
let secondLastChar;
|
|
@@ -1269,7 +1188,8 @@ function createMarkdownProcessor(options = {}) {
|
|
|
1269
1188
|
if (textNode.excludedFromMarkdown) return;
|
|
1270
1189
|
if (textNode.value === " " && lastChar === "\n") return;
|
|
1271
1190
|
if (shouldAddSpacingBeforeText(lastChar, lastNode, textNode)) textNode.value = ` ${textNode.value}`;
|
|
1272
|
-
|
|
1191
|
+
state.buffer.push(textNode.value);
|
|
1192
|
+
state.lastContentCache = textNode.value;
|
|
1273
1193
|
}
|
|
1274
1194
|
state.lastTextNode = textNode;
|
|
1275
1195
|
return;
|
|
@@ -1299,7 +1219,10 @@ function createMarkdownProcessor(options = {}) {
|
|
|
1299
1219
|
const newLines = Math.max(0, configuredNewLines - lastNewLines);
|
|
1300
1220
|
if (newLines > 0) {
|
|
1301
1221
|
if (!buff.length) {
|
|
1302
|
-
for (const fragment of output)
|
|
1222
|
+
for (const fragment of output) if (fragment) {
|
|
1223
|
+
state.buffer.push(fragment);
|
|
1224
|
+
state.lastContentCache = fragment;
|
|
1225
|
+
}
|
|
1303
1226
|
return;
|
|
1304
1227
|
}
|
|
1305
1228
|
const newlinesStr = "\n".repeat(newLines);
|
|
@@ -1321,12 +1244,15 @@ function createMarkdownProcessor(options = {}) {
|
|
|
1321
1244
|
state.lastTextNode = void 0;
|
|
1322
1245
|
}
|
|
1323
1246
|
}
|
|
1324
|
-
if (output[0]?.[0] && eventType === NodeEventEnter && lastChar && needsSpacing(lastChar, output[0][0], state))
|
|
1325
|
-
|
|
1247
|
+
if (output[0]?.[0] && eventType === NodeEventEnter && lastChar && needsSpacing(lastChar, output[0][0], state)) {
|
|
1248
|
+
state.buffer.push(" ");
|
|
1249
|
+
state.lastContentCache = " ";
|
|
1250
|
+
}
|
|
1251
|
+
for (const fragment of output) if (fragment) {
|
|
1252
|
+
state.buffer.push(fragment);
|
|
1253
|
+
state.lastContentCache = fragment;
|
|
1254
|
+
}
|
|
1326
1255
|
}
|
|
1327
|
-
/**
|
|
1328
|
-
* Process HTML string and generate events
|
|
1329
|
-
*/
|
|
1330
1256
|
function processHtml(html) {
|
|
1331
1257
|
parseHtmlStream(html, {
|
|
1332
1258
|
depthMap: state.depthMap,
|
|
@@ -1336,19 +1262,13 @@ function createMarkdownProcessor(options = {}) {
|
|
|
1336
1262
|
processPluginsForEvent(event, state.options?.plugins, state, processEvent);
|
|
1337
1263
|
});
|
|
1338
1264
|
}
|
|
1339
|
-
/**
|
|
1340
|
-
* Get the final markdown output
|
|
1341
|
-
*/
|
|
1342
1265
|
function getMarkdown() {
|
|
1343
|
-
|
|
1266
|
+
const result = state.buffer.join("").trimStart();
|
|
1267
|
+
state.buffer.length = 0;
|
|
1268
|
+
return result.trimEnd();
|
|
1344
1269
|
}
|
|
1345
|
-
/**
|
|
1346
|
-
* Get new markdown content since the last call (for streaming)
|
|
1347
|
-
*/
|
|
1348
1270
|
function getMarkdownChunk() {
|
|
1349
|
-
const
|
|
1350
|
-
for (const [regionId, content] of Array.from(state.regionContentBuffers.entries())) if (state.regionToggles.get(regionId)) fragments.push(...content);
|
|
1351
|
-
const currentContent = fragments.join("").trimStart();
|
|
1271
|
+
const currentContent = state.buffer.join("").trimStart();
|
|
1352
1272
|
const newContent = currentContent.slice(lastYieldedLength);
|
|
1353
1273
|
lastYieldedLength = currentContent.length;
|
|
1354
1274
|
return newContent;
|
|
@@ -1362,6 +1282,4 @@ function createMarkdownProcessor(options = {}) {
|
|
|
1362
1282
|
};
|
|
1363
1283
|
}
|
|
1364
1284
|
const MarkdownProcessor = createMarkdownProcessor;
|
|
1365
|
-
|
|
1366
|
-
//#endregion
|
|
1367
|
-
export { parseHtmlStream as a, parseHtml as i, createMarkdownProcessor as n, processPluginsForEvent as r, MarkdownProcessor as t };
|
|
1285
|
+
export { parseHtmlStream as a, parseHtml as i, createMarkdownProcessor as n, processPluginsForEvent as r, MarkdownProcessor as t };
|
|
@@ -1,12 +1,4 @@
|
|
|
1
|
-
import { a as parseHtmlStream, n as createMarkdownProcessor, r as processPluginsForEvent } from "./markdown-processor
|
|
2
|
-
|
|
3
|
-
//#region src/stream.ts
|
|
4
|
-
/**
|
|
5
|
-
* Creates a markdown stream from an HTML stream
|
|
6
|
-
* @param htmlStream - ReadableStream of HTML content (as Uint8Array or string)
|
|
7
|
-
* @param options - Configuration options for conversion
|
|
8
|
-
* @returns An async generator yielding markdown chunks
|
|
9
|
-
*/
|
|
1
|
+
import { a as parseHtmlStream, n as createMarkdownProcessor, r as processPluginsForEvent } from "./markdown-processor.mjs";
|
|
10
2
|
async function* streamHtmlToMarkdown(htmlStream, options = {}) {
|
|
11
3
|
if (!htmlStream) throw new Error("Invalid HTML stream provided");
|
|
12
4
|
const decoder = new TextDecoder();
|
|
@@ -38,14 +30,4 @@ async function* streamHtmlToMarkdown(htmlStream, options = {}) {
|
|
|
38
30
|
reader.releaseLock();
|
|
39
31
|
}
|
|
40
32
|
}
|
|
41
|
-
|
|
42
|
-
//#endregion
|
|
43
|
-
//#region src/index.ts
|
|
44
|
-
function htmlToMarkdown(html, options = {}) {
|
|
45
|
-
const processor = createMarkdownProcessor(options);
|
|
46
|
-
processor.processHtml(html);
|
|
47
|
-
return processor.getMarkdown();
|
|
48
|
-
}
|
|
49
|
-
|
|
50
|
-
//#endregion
|
|
51
|
-
export { streamHtmlToMarkdown as n, htmlToMarkdown as t };
|
|
33
|
+
export { streamHtmlToMarkdown as t };
|