mdream 0.15.3 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -201,7 +201,6 @@ Mdream includes several built-in plugins that can be used individually or combin
201
201
  - **[`frontmatterPlugin`](./src/plugins/frontmatter.ts)**: Generate YAML frontmatter from HTML head elements (title, meta tags)
202
202
  - **[`isolateMainPlugin`](./src/plugins/isolate-main.ts)**: Isolate main content using `<main>` elements or header-to-footer boundaries
203
203
  - **[`tailwindPlugin`](./src/plugins/tailwind.ts)**: Convert Tailwind CSS classes to Markdown formatting (bold, italic, etc.)
204
- - **[`readabilityPlugin`](./src/plugins/readability.ts)**: Content scoring and extraction (experimental)
205
204
 
206
205
  ```ts
207
206
  import { filterPlugin, frontmatterPlugin, isolateMainPlugin } from 'mdream/plugins'
@@ -215,6 +214,26 @@ const markdown = htmlToMarkdown(html, {
215
214
  })
216
215
  ```
217
216
 
217
+ ### Content Extraction with Readability
218
+
219
+ For advanced content extraction (article detection, boilerplate removal), use [@mozilla/readability](https://github.com/mozilla/readability) before mdream:
220
+
221
+ ```ts
222
+ import { Readability } from '@mozilla/readability'
223
+ import { JSDOM } from 'jsdom'
224
+ import { htmlToMarkdown } from 'mdream'
225
+
226
+ const dom = new JSDOM(html, { url: 'https://example.com' })
227
+ const article = new Readability(dom.window.document).parse()
228
+
229
+ if (article) {
230
+ const markdown = htmlToMarkdown(article.content)
231
+ // article.title, article.excerpt, article.byline also available
232
+ }
233
+ ```
234
+
235
+ This pipeline gives you battle-tested content extraction + fast markdown conversion.
236
+
218
237
  ### Plugin Hooks
219
238
 
220
239
  - `beforeNodeProcess`: Called before any node processing, can skip nodes
@@ -526,12 +545,3 @@ Custom notes
526
545
 
527
546
  Licensed under the [MIT license](https://github.com/harlan-zw/mdream/blob/main/LICENSE.md).
528
547
 
529
- <!-- Badges -->
530
- [npm-version-src]: https://img.shields.io/npm/v/mdream/latest.svg?style=flat&colorA=18181B&colorB=4C9BE0
531
- [npm-version-href]: https://npmjs.com/package/mdream
532
-
533
- [npm-downloads-src]: https://img.shields.io/npm/dm/mdream.svg?style=flat&colorA=18181B&colorB=4C9BE0
534
- [npm-downloads-href]: https://npmjs.com/package/mdream
535
-
536
- [license-src]: https://img.shields.io/github/license/harlan-zw/mdream.svg?style=flat&colorA=18181B&colorB=4C9BE0
537
- [license-href]: https://github.com/harlan-zw/mdream/blob/main/LICENSE.md
@@ -1,151 +1,3 @@
1
- //#region src/buffer-region.ts
2
- /**
3
- * Creates a new buffer region
4
- * Returns null if node already has a region assigned
5
- */
6
- function createBufferRegion(node, state, include) {
7
- if (node.regionId) return null;
8
- const id = state.regionToggles.size + 1;
9
- node.regionId = id;
10
- state.regionToggles.set(id, include);
11
- state.regionContentBuffers.set(id, []);
12
- return id;
13
- }
14
- /**
15
- * Collects content for a node into appropriate buffer (optimized)
16
- */
17
- function collectNodeContent(node, content, state) {
18
- if (!content) return;
19
- const regionId = node.regionId || 0;
20
- const targetBuffer = state.regionContentBuffers.get(regionId);
21
- if (targetBuffer) {
22
- targetBuffer.push(content);
23
- state.lastContentCache = content;
24
- }
25
- }
26
- /**
27
- * Assembles final content from buffer regions and clears them after use
28
- * Ensures frontmatter (regionId -1) appears first, followed by other included regions
29
- */
30
- function assembleBufferedContent(state) {
31
- const fragments = [];
32
- for (const [regionId, content] of Array.from(state.regionContentBuffers.entries())) if (state.regionToggles.get(regionId)) fragments.push(...content);
33
- state.regionToggles.clear();
34
- state.regionContentBuffers.clear();
35
- return fragments.join("").trimStart();
36
- }
37
-
38
- //#endregion
39
- //#region src/const.ts
40
- const TAG_HTML = 0;
41
- const TAG_HEAD = 1;
42
- const TAG_DETAILS = 2;
43
- const TAG_SUMMARY = 3;
44
- const TAG_TITLE = 4;
45
- const TAG_META = 5;
46
- const TAG_BR = 6;
47
- const TAG_H1 = 7;
48
- const TAG_H2 = 8;
49
- const TAG_H3 = 9;
50
- const TAG_H4 = 10;
51
- const TAG_H5 = 11;
52
- const TAG_H6 = 12;
53
- const TAG_HR = 13;
54
- const TAG_STRONG = 14;
55
- const TAG_B = 15;
56
- const TAG_EM = 16;
57
- const TAG_I = 17;
58
- const TAG_DEL = 18;
59
- const TAG_SUB = 19;
60
- const TAG_SUP = 20;
61
- const TAG_INS = 21;
62
- const TAG_BLOCKQUOTE = 22;
63
- const TAG_CODE = 23;
64
- const TAG_UL = 24;
65
- const TAG_LI = 25;
66
- const TAG_A = 26;
67
- const TAG_IMG = 27;
68
- const TAG_TABLE = 28;
69
- const TAG_THEAD = 29;
70
- const TAG_TR = 30;
71
- const TAG_TH = 31;
72
- const TAG_TD = 32;
73
- const TAG_OL = 33;
74
- const TAG_PRE = 34;
75
- const TAG_P = 35;
76
- const TAG_DIV = 36;
77
- const TAG_SPAN = 37;
78
- const TAG_TBODY = 38;
79
- const TAG_TFOOT = 39;
80
- const TAG_FORM = 40;
81
- const TAG_NAV = 41;
82
- const TAG_LABEL = 42;
83
- const TAG_BUTTON = 43;
84
- const TAG_BODY = 44;
85
- const TAG_CENTER = 45;
86
- const TAG_KBD = 46;
87
- const TAG_FOOTER = 47;
88
- const TAG_PATH = 48;
89
- const TAG_SVG = 49;
90
- const TAG_ARTICLE = 50;
91
- const TAG_SECTION = 51;
92
- const TAG_SCRIPT = 52;
93
- const TAG_STYLE = 53;
94
- const TAG_LINK = 54;
95
- const TAG_AREA = 55;
96
- const TAG_BASE = 56;
97
- const TAG_COL = 57;
98
- const TAG_EMBED = 58;
99
- const TAG_INPUT = 59;
100
- const TAG_KEYGEN = 60;
101
- const TAG_PARAM = 61;
102
- const TAG_SOURCE = 62;
103
- const TAG_TRACK = 63;
104
- const TAG_WBR = 64;
105
- const TAG_SELECT = 65;
106
- const TAG_TEXTAREA = 66;
107
- const TAG_OPTION = 67;
108
- const TAG_FIELDSET = 68;
109
- const TAG_LEGEND = 69;
110
- const TAG_AUDIO = 70;
111
- const TAG_VIDEO = 71;
112
- const TAG_CANVAS = 72;
113
- const TAG_IFRAME = 73;
114
- const TAG_MAP = 74;
115
- const TAG_DIALOG = 75;
116
- const TAG_METER = 76;
117
- const TAG_PROGRESS = 77;
118
- const TAG_TEMPLATE = 78;
119
- const TAG_ABBR = 79;
120
- const TAG_MARK = 80;
121
- const TAG_Q = 81;
122
- const TAG_SAMP = 82;
123
- const TAG_SMALL = 83;
124
- const TAG_NOSCRIPT = 84;
125
- const TAG_NOFRAMES = 85;
126
- const TAG_XMP = 86;
127
- const TAG_PLAINTEXT = 87;
128
- const TAG_ASIDE = 88;
129
- const TAG_U = 89;
130
- const TAG_CITE = 90;
131
- const TAG_DFN = 91;
132
- const TAG_VAR = 92;
133
- const TAG_TIME = 93;
134
- const TAG_BDO = 94;
135
- const TAG_RUBY = 95;
136
- const TAG_RT = 96;
137
- const TAG_RP = 97;
138
- const TAG_DD = 98;
139
- const TAG_DT = 99;
140
- const TAG_ADDRESS = 100;
141
- const TAG_DL = 101;
142
- const TAG_FIGURE = 102;
143
- const TAG_OBJECT = 103;
144
- const TAG_MAIN = 104;
145
- const TAG_HEADER = 105;
146
- const TAG_FIGCAPTION = 106;
147
- const TAG_CAPTION = 107;
148
- const MAX_TAG_ID = 108;
149
1
  const HTML_ENTITIES = {
150
2
  "&amp;": "&",
151
3
  "&lt;": "<",
@@ -155,131 +7,120 @@ const HTML_ENTITIES = {
155
7
  "&apos;": "'",
156
8
  "&nbsp;": " "
157
9
  };
158
- const ELEMENT_NODE = 1;
159
- const TEXT_NODE = 2;
160
- const NodeEventEnter = 0;
161
- const NodeEventExit = 1;
162
10
  const TagIdMap = {
163
- html: TAG_HTML,
164
- head: TAG_HEAD,
165
- details: TAG_DETAILS,
166
- summary: TAG_SUMMARY,
167
- title: TAG_TITLE,
168
- meta: TAG_META,
169
- br: TAG_BR,
170
- h1: TAG_H1,
171
- h2: TAG_H2,
172
- h3: TAG_H3,
173
- h4: TAG_H4,
174
- h5: TAG_H5,
175
- h6: TAG_H6,
176
- hr: TAG_HR,
177
- strong: TAG_STRONG,
178
- b: TAG_B,
179
- em: TAG_EM,
180
- i: TAG_I,
181
- del: TAG_DEL,
182
- sub: TAG_SUB,
183
- sup: TAG_SUP,
184
- ins: TAG_INS,
185
- blockquote: TAG_BLOCKQUOTE,
186
- code: TAG_CODE,
187
- ul: TAG_UL,
188
- li: TAG_LI,
189
- a: TAG_A,
190
- img: TAG_IMG,
191
- table: TAG_TABLE,
192
- thead: TAG_THEAD,
193
- tr: TAG_TR,
194
- th: TAG_TH,
195
- td: TAG_TD,
196
- ol: TAG_OL,
197
- pre: TAG_PRE,
198
- p: TAG_P,
199
- div: TAG_DIV,
200
- span: TAG_SPAN,
201
- tbody: TAG_TBODY,
202
- tfoot: TAG_TFOOT,
203
- form: TAG_FORM,
204
- nav: TAG_NAV,
205
- label: TAG_LABEL,
206
- button: TAG_BUTTON,
207
- body: TAG_BODY,
208
- center: TAG_CENTER,
209
- kbd: TAG_KBD,
210
- footer: TAG_FOOTER,
211
- path: TAG_PATH,
212
- svg: TAG_SVG,
213
- article: TAG_ARTICLE,
214
- section: TAG_SECTION,
215
- script: TAG_SCRIPT,
216
- style: TAG_STYLE,
217
- link: TAG_LINK,
218
- area: TAG_AREA,
219
- base: TAG_BASE,
220
- col: TAG_COL,
221
- embed: TAG_EMBED,
222
- input: TAG_INPUT,
223
- keygen: TAG_KEYGEN,
224
- param: TAG_PARAM,
225
- source: TAG_SOURCE,
226
- track: TAG_TRACK,
227
- wbr: TAG_WBR,
228
- select: TAG_SELECT,
229
- textarea: TAG_TEXTAREA,
230
- option: TAG_OPTION,
231
- fieldset: TAG_FIELDSET,
232
- legend: TAG_LEGEND,
233
- audio: TAG_AUDIO,
234
- video: TAG_VIDEO,
235
- canvas: TAG_CANVAS,
236
- iframe: TAG_IFRAME,
237
- map: TAG_MAP,
238
- dialog: TAG_DIALOG,
239
- meter: TAG_METER,
240
- progress: TAG_PROGRESS,
241
- template: TAG_TEMPLATE,
242
- abbr: TAG_ABBR,
243
- mark: TAG_MARK,
244
- q: TAG_Q,
245
- samp: TAG_SAMP,
246
- small: TAG_SMALL,
247
- noscript: TAG_NOSCRIPT,
248
- noframes: TAG_NOFRAMES,
249
- xmp: TAG_XMP,
250
- plaintext: TAG_PLAINTEXT,
251
- aside: TAG_ASIDE,
252
- u: TAG_U,
253
- cite: TAG_CITE,
254
- dfn: TAG_DFN,
255
- var: TAG_VAR,
256
- time: TAG_TIME,
257
- bdo: TAG_BDO,
258
- ruby: TAG_RUBY,
259
- rt: TAG_RT,
260
- rp: TAG_RP,
261
- dd: TAG_DD,
262
- dt: TAG_DT,
263
- dl: TAG_DL,
264
- address: TAG_ADDRESS,
265
- figure: TAG_FIGURE,
266
- object: TAG_OBJECT,
267
- main: TAG_MAIN,
268
- header: TAG_HEADER,
269
- figcaption: TAG_FIGCAPTION,
270
- caption: TAG_CAPTION
11
+ html: 0,
12
+ head: 1,
13
+ details: 2,
14
+ summary: 3,
15
+ title: 4,
16
+ meta: 5,
17
+ br: 6,
18
+ h1: 7,
19
+ h2: 8,
20
+ h3: 9,
21
+ h4: 10,
22
+ h5: 11,
23
+ h6: 12,
24
+ hr: 13,
25
+ strong: 14,
26
+ b: 15,
27
+ em: 16,
28
+ i: 17,
29
+ del: 18,
30
+ sub: 19,
31
+ sup: 20,
32
+ ins: 21,
33
+ blockquote: 22,
34
+ code: 23,
35
+ ul: 24,
36
+ li: 25,
37
+ a: 26,
38
+ img: 27,
39
+ table: 28,
40
+ thead: 29,
41
+ tr: 30,
42
+ th: 31,
43
+ td: 32,
44
+ ol: 33,
45
+ pre: 34,
46
+ p: 35,
47
+ div: 36,
48
+ span: 37,
49
+ tbody: 38,
50
+ tfoot: 39,
51
+ form: 40,
52
+ nav: 41,
53
+ label: 42,
54
+ button: 43,
55
+ body: 44,
56
+ center: 45,
57
+ kbd: 46,
58
+ footer: 47,
59
+ path: 48,
60
+ svg: 49,
61
+ article: 50,
62
+ section: 51,
63
+ script: 52,
64
+ style: 53,
65
+ link: 54,
66
+ area: 55,
67
+ base: 56,
68
+ col: 57,
69
+ embed: 58,
70
+ input: 59,
71
+ keygen: 60,
72
+ param: 61,
73
+ source: 62,
74
+ track: 63,
75
+ wbr: 64,
76
+ select: 65,
77
+ textarea: 66,
78
+ option: 67,
79
+ fieldset: 68,
80
+ legend: 69,
81
+ audio: 70,
82
+ video: 71,
83
+ canvas: 72,
84
+ iframe: 73,
85
+ map: 74,
86
+ dialog: 75,
87
+ meter: 76,
88
+ progress: 77,
89
+ template: 78,
90
+ abbr: 79,
91
+ mark: 80,
92
+ q: 81,
93
+ samp: 82,
94
+ small: 83,
95
+ noscript: 84,
96
+ noframes: 85,
97
+ xmp: 86,
98
+ plaintext: 87,
99
+ aside: 88,
100
+ u: 89,
101
+ cite: 90,
102
+ dfn: 91,
103
+ var: 92,
104
+ time: 93,
105
+ bdo: 94,
106
+ ruby: 95,
107
+ rt: 96,
108
+ rp: 97,
109
+ dd: 98,
110
+ dt: 99,
111
+ dl: 101,
112
+ address: 100,
113
+ figure: 102,
114
+ object: 103,
115
+ main: 104,
116
+ header: 105,
117
+ figcaption: 106,
118
+ caption: 107
271
119
  };
272
- const MARKDOWN_STRONG = "**";
273
- const MARKDOWN_EMPHASIS = "_";
274
- const MARKDOWN_STRIKETHROUGH = "~~";
275
- const MARKDOWN_CODE_BLOCK = "```";
276
- const MARKDOWN_INLINE_CODE = "`";
277
- const MARKDOWN_HORIZONTAL_RULE = "---";
278
120
  const NO_SPACING = [0, 0];
279
121
  const DEFAULT_BLOCK_SPACING = [2, 2];
280
122
  const BLOCKQUOTE_SPACING = [1, 1];
281
123
  const LIST_ITEM_SPACING = [1, 0];
282
124
  const TABLE_ROW_SPACING = [0, 1];
283
-
284
125
  //#endregion
285
- export { TAG_H2 as $, TAG_TBODY as $t, TAG_BUTTON as A, TAG_P as At, TAG_DFN as B, TAG_SCRIPT as Bt, TAG_AUDIO as C, TAG_METER as Ct, TAG_BLOCKQUOTE as D, TAG_OBJECT as Dt, TAG_BDO as E, TAG_NOSCRIPT as Et, TAG_CODE as F, TAG_Q as Ft, TAG_EM as G, TAG_SPAN as Gt, TAG_DIV as H, TAG_SELECT as Ht, TAG_COL as I, TAG_RP as It, TAG_FIGCAPTION as J, TAG_SUB as Jt, TAG_EMBED as K, TAG_STRONG as Kt, TAG_DD as L, TAG_RT as Lt, TAG_CAPTION as M, TAG_PLAINTEXT as Mt, TAG_CENTER as N, TAG_PRE as Nt, TAG_BODY as O, TAG_OL as Ot, TAG_CITE as P, TAG_PROGRESS as Pt, TAG_H1 as Q, TAG_TABLE as Qt, TAG_DEL as R, TAG_RUBY as Rt, TAG_ASIDE as S, TAG_META as St, TAG_BASE as T, TAG_NOFRAMES as Tt, TAG_DL as U, TAG_SMALL as Ut, TAG_DIALOG as V, TAG_SECTION as Vt, TAG_DT as W, TAG_SOURCE as Wt, TAG_FOOTER as X, TAG_SUP as Xt, TAG_FIGURE as Y, TAG_SUMMARY as Yt, TAG_FORM as Z, TAG_SVG as Zt, TAG_A as _, TagIdMap as _n, TAG_LI as _t, LIST_ITEM_SPACING as a, TAG_THEAD as an, TAG_HEADER as at, TAG_AREA as b, createBufferRegion as bn, TAG_MAP as bt, MARKDOWN_HORIZONTAL_RULE as c, TAG_TR as cn, TAG_I as ct, MARKDOWN_STRONG as d, TAG_UL as dn, TAG_INPUT as dt, TAG_TD as en, TAG_H3 as et, MAX_TAG_ID as f, TAG_VAR as fn, TAG_INS as ft, TABLE_ROW_SPACING as g, TEXT_NODE as gn, TAG_LEGEND as gt, NodeEventExit as h, TAG_XMP as hn, TAG_LABEL as ht, HTML_ENTITIES as i, TAG_TH as in, TAG_HEAD as it, TAG_CANVAS as j, TAG_PARAM as jt, TAG_BR as k, TAG_OPTION as kt, MARKDOWN_INLINE_CODE as l, TAG_TRACK as ln, TAG_IFRAME as lt, NodeEventEnter as m, TAG_WBR as mn, TAG_KEYGEN as mt, DEFAULT_BLOCK_SPACING as n, TAG_TEXTAREA as nn, TAG_H5 as nt, MARKDOWN_CODE_BLOCK as o, TAG_TIME as on, TAG_HR as ot, NO_SPACING as p, TAG_VIDEO as pn, TAG_KBD as pt, TAG_FIELDSET as q, TAG_STYLE as qt, ELEMENT_NODE as r, TAG_TFOOT as rn, TAG_H6 as rt, MARKDOWN_EMPHASIS as s, TAG_TITLE as sn, TAG_HTML as st, BLOCKQUOTE_SPACING as t, TAG_TEMPLATE as tn, TAG_H4 as tt, MARKDOWN_STRIKETHROUGH as u, TAG_U as un, TAG_IMG as ut, TAG_ABBR as v, assembleBufferedContent as vn, TAG_LINK as vt, TAG_B as w, TAG_NAV as wt, TAG_ARTICLE as x, TAG_MARK as xt, TAG_ADDRESS as y, collectNodeContent as yn, TAG_MAIN as yt, TAG_DETAILS as z, TAG_SAMP as zt };
126
+ export { NO_SPACING as a, LIST_ITEM_SPACING as i, DEFAULT_BLOCK_SPACING as n, TABLE_ROW_SPACING as o, HTML_ENTITIES as r, TagIdMap as s, BLOCKQUOTE_SPACING as t };
@@ -1,5 +1,4 @@
1
1
  import { t as createPlugin } from "./plugin.mjs";
2
-
3
2
  //#region src/libs/query-selector.ts
4
3
  /**
5
4
  * Creates a tag selector matcher (e.g., 'div', 'p', 'h1')
@@ -33,11 +32,12 @@ function createClassSelector(selector) {
33
32
  toString: () => `.${className}`
34
33
  };
35
34
  }
35
+ const ATTR_SELECTOR_RE = /\[([^\]=~|^$*]+)(?:([=~|^$*]+)["']?([^"'\]]+)["']?)?\]/;
36
36
  /**
37
37
  * Creates an attribute selector matcher (e.g., '[data-id]', '[href="https://example.com"]')
38
38
  */
39
39
  function createAttributeSelector(selector) {
40
- const match = selector.match(/\[([^\]=~|^$*]+)(?:([=~|^$*]+)["']?([^"'\]]+)["']?)?\]/);
40
+ const match = selector.match(ATTR_SELECTOR_RE);
41
41
  const attrName = match ? match[1] : selector.slice(1, -1);
42
42
  const operator = match?.[2];
43
43
  const attrValue = match?.[3];
@@ -100,7 +100,6 @@ function parseSelector(selector) {
100
100
  if (selectorParts.length === 1) return selectorParts[0];
101
101
  return createCompoundSelector(selectorParts);
102
102
  }
103
-
104
103
  //#endregion
105
104
  //#region src/plugins/extraction.ts
106
105
  function extractionPlugin(selectors) {
@@ -139,6 +138,5 @@ function extractionPlugin(selectors) {
139
138
  }
140
139
  });
141
140
  }
142
-
143
141
  //#endregion
144
- export { parseSelector as n, extractionPlugin as t };
142
+ export { parseSelector as n, extractionPlugin as t };