mdream 0.5.2 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -284,6 +284,109 @@ const BLOCKQUOTE_SPACING = [1, 1];
284
284
  const LIST_ITEM_SPACING = [1, 0];
285
285
  const TABLE_ROW_SPACING = [0, 1];
286
286
 
287
+ //#endregion
288
+ //#region src/libs/query-selector.ts
289
/**
 * Build a matcher for a plain tag-name selector (for example 'div', 'p' or 'h1').
 *
 * The returned object exposes `matches(element)`, which compares the element's
 * `name` with the tag name using strict equality, and `toString()`, which
 * returns the tag name unchanged.
 */
function createTagSelector(tagName) {
  const matches = (element) => tagName === element.name;
  const toString = () => tagName;
  return { matches, toString };
}
298
/**
 * Build a matcher for an ID selector (for example '#main' or '#content').
 *
 * The leading '#' is dropped; an element matches when its `attributes.id`
 * is strictly equal to the remaining text. Elements without an `attributes`
 * object never match.
 */
function createIdSelector(selector) {
  const id = selector.substring(1);
  return {
    matches(element) {
      const { attributes } = element;
      return attributes?.id === id;
    },
    toString() {
      return `#${id}`;
    }
  };
}
308
/**
 * Creates a class selector matcher (e.g., '.container', '.header').
 *
 * @param {string} selector - Class selector including its leading '.'.
 * @returns {{ matches: (element: object) => boolean, toString: () => string }}
 *   `matches` is true when the element's `class` attribute contains the class
 *   name as one of its whitespace-separated tokens; `toString` returns the
 *   selector text ('.name').
 */
function createClassSelector(selector) {
  const className = selector.slice(1);
  return {
    matches: (element) => {
      const classAttribute = element.attributes?.class;
      if (!classAttribute) return false;
      // HTML class lists are separated by any run of ASCII whitespace
      // (space, tab, LF, FF, CR), not just single spaces.
      const classes = classAttribute.trim().split(/[ \t\n\f\r]+/).filter(Boolean);
      return classes.includes(className);
    },
    toString: () => `.${className}`
  };
}
322
/**
 * Creates an attribute selector matcher (e.g., '[data-id]', '[name="description"]').
 *
 * Supported forms are presence ('[attr]') and the operators '=', '^=', '$=',
 * '*=', '~=' and '|='. Surrounding quotes around the value are optional and
 * are not kept. When the selector text does not match the expected pattern,
 * everything between the first and last character is used as the attribute
 * name and only presence is tested.
 *
 * @param {string} selector - Attribute selector including its brackets.
 * @returns {{ matches: (element: object) => boolean, toString: () => string }}
 *   `matches` tests an element's `attributes` object; `toString` returns the
 *   selector without quotes around the value.
 */
function createAttributeSelector(selector) {
  const match = selector.match(/\[([^\]=~|^$*]+)(?:([=~|^$*]+)["']?([^"'\]]+)["']?)?\]/);
  const attrName = match ? match[1] : selector.slice(1, -1);
  const operator = match?.[2];
  const attrValue = match?.[3];
  return {
    matches: (element) => {
      const attributes = element.attributes;
      // Own-property check: `in` would also see inherited keys such as
      // `toString` or `constructor`, making '[toString]' match every element.
      if (!attributes || !Object.prototype.hasOwnProperty.call(attributes, attrName)) return false;
      if (!operator || !attrValue) return true;
      const value = attributes[attrName];
      switch (operator) {
        case "=": return value === attrValue;
        case "^=": return value.startsWith(attrValue);
        case "$=": return value.endsWith(attrValue);
        case "*=": return value.includes(attrValue);
        // Word lists are separated by any run of ASCII whitespace, not only single spaces.
        case "~=": return value.trim().split(/[ \t\n\f\r]+/).filter(Boolean).includes(attrValue);
        case "|=": return value === attrValue || value.startsWith(`${attrValue}-`);
        default: return false;
      }
    },
    toString: () => {
      if (!operator || !attrValue) return `[${attrName}]`;
      return `[${attrName}${operator}${attrValue}]`;
    }
  };
}
351
/**
 * Combine several matchers into one compound matcher (e.g. 'div.container', 'h1#title').
 *
 * An element matches only when none of the given matchers rejects it, so an
 * empty list matches everything. `toString()` concatenates the string form of
 * every part in order.
 */
function createCompoundSelector(selectors) {
  return {
    matches(element) {
      const rejected = selectors.some((part) => !part.matches(element));
      return !rejected;
    },
    toString() {
      return selectors.reduce((text, part) => text + part.toString(), "");
    }
  };
}
360
/**
 * Parses a CSS selector into a matcher.
 *
 * Handles a single compound selector made of an optional tag name followed by
 * any number of '#id', '.class' and '[attribute]' parts, for example
 * 'div.container#main' or 'a[href="https://example.com"]'.
 *
 * @param {string} selector - Selector text; surrounding whitespace is ignored.
 * @returns {{ matches: (element: object) => boolean, toString: () => string }}
 *   The matcher for the single part, or a compound matcher when the selector
 *   has several parts.
 * @throws {Error} "Empty selector" when the selector is empty after trimming.
 */
function parseSelector(selector) {
  const source = selector.trim();
  if (!source) throw new Error("Empty selector");
  const selectorParts = [];
  let current = "";
  let inAttribute = false;
  // Convert the text accumulated so far into the matcher implied by its first character.
  const flush = () => {
    if (!current) return;
    if (current[0] === ".") selectorParts.push(createClassSelector(current));
    else if (current[0] === "#") selectorParts.push(createIdSelector(current));
    else if (current[0] === "[") selectorParts.push(createAttributeSelector(current));
    else selectorParts.push(createTagSelector(current));
    current = "";
  };
  for (const char of source) {
    // '.', '#' and '[' start a new part only outside attribute brackets; inside
    // them they belong to the attribute value (e.g. the dots in "example.com").
    if (!inAttribute && (char === "." || char === "#" || char === "[")) flush();
    current += char;
    if (char === "[") inAttribute = true;
    else if (char === "]") inAttribute = false;
  }
  flush();
  if (selectorParts.length === 1) return selectorParts[0];
  return createCompoundSelector(selectorParts);
}
389
+
287
390
  //#endregion
288
391
  //#region src/pluggable/plugin.ts
289
392
  /**
@@ -296,4 +399,44 @@ function createPlugin(plugin) {
296
399
  }
297
400
 
298
401
  //#endregion
299
- export { BLOCKQUOTE_SPACING, DEFAULT_BLOCK_SPACING, ELEMENT_NODE, HTML_ENTITIES, LIST_ITEM_SPACING, MARKDOWN_CODE_BLOCK, MARKDOWN_EMPHASIS, MARKDOWN_HORIZONTAL_RULE, MARKDOWN_INLINE_CODE, MARKDOWN_STRIKETHROUGH, MARKDOWN_STRONG, MAX_TAG_ID, NO_SPACING, NodeEventEnter, NodeEventExit, TABLE_ROW_SPACING, TAG_A, TAG_ABBR, TAG_ADDRESS, TAG_AREA, TAG_ARTICLE, TAG_ASIDE, TAG_AUDIO, TAG_B, TAG_BASE, TAG_BDO, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, TAG_CANVAS, TAG_CAPTION, TAG_CENTER, TAG_CITE, TAG_CODE, TAG_COL, TAG_DD, TAG_DEL, TAG_DETAILS, TAG_DFN, TAG_DIALOG, TAG_DIV, TAG_DL, TAG_DT, TAG_EM, TAG_EMBED, TAG_FIELDSET, TAG_FIGCAPTION, TAG_FIGURE, TAG_FOOTER, TAG_FORM, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD, TAG_HEADER, TAG_HR, TAG_HTML, TAG_I, TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_INS, TAG_KBD, TAG_KEYGEN, TAG_LABEL, TAG_LEGEND, TAG_LI, TAG_LINK, TAG_MAIN, TAG_MAP, TAG_MARK, TAG_META, TAG_METER, TAG_NAV, TAG_NOFRAMES, TAG_NOSCRIPT, TAG_OBJECT, TAG_OL, TAG_OPTION, TAG_P, TAG_PARAM, TAG_PLAINTEXT, TAG_PRE, TAG_PROGRESS, TAG_Q, TAG_RP, TAG_RT, TAG_RUBY, TAG_SAMP, TAG_SCRIPT, TAG_SECTION, TAG_SELECT, TAG_SMALL, TAG_SOURCE, TAG_SPAN, TAG_STRONG, TAG_STYLE, TAG_SUB, TAG_SUMMARY, TAG_SUP, TAG_SVG, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEMPLATE, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TIME, TAG_TITLE, TAG_TR, TAG_TRACK, TAG_U, TAG_UL, TAG_VAR, TAG_VIDEO, TAG_WBR, TAG_XMP, TEXT_NODE, TagIdMap, assembleBufferedContent, collectNodeContent, createBufferRegion, createPlugin };
402
+ //#region src/plugins/extraction.ts
403
/**
 * Creates a plugin that reports elements matching CSS selectors.
 *
 * @param {Record<string, (element: object, state: unknown) => void>} selectors
 *   Map from selector text (parsed with `parseSelector`) to the callback to run
 *   when a matching element is exited.
 * @returns The plugin produced by `createPlugin`. Each callback receives a
 *   shallow copy of the element with an added `textContent` property holding
 *   the trimmed text collected from the element's descendant text nodes, plus
 *   the state passed to `onNodeExit`.
 */
function extractionPlugin(selectors) {
  const matcherCallbacks = Object.entries(selectors).map(([selector, callback]) => ({
    matcher: parseSelector(selector),
    callback
  }));
  const trackedElements = new Map();
  return createPlugin({
    onNodeEnter(element) {
      // Collect every matching callback: storing one entry per match would let
      // a later selector overwrite an earlier one for the same element.
      const callbacks = [];
      for (const { matcher, callback } of matcherCallbacks) {
        if (matcher.matches(element)) callbacks.push(callback);
      }
      if (callbacks.length > 0) {
        trackedElements.set(element, {
          textContent: "",
          callbacks
        });
      }
    },
    processTextNode(textNode) {
      // Text belongs to every tracked ancestor, not just the direct parent.
      let currentParent = textNode.parent;
      while (currentParent) {
        const tracked = trackedElements.get(currentParent);
        if (tracked) tracked.textContent += textNode.value;
        currentParent = currentParent.parent;
      }
      return void 0;
    },
    onNodeExit(element, state) {
      const tracked = trackedElements.get(element);
      if (!tracked) return;
      // Drop the entry first so a throwing callback cannot leave it behind.
      trackedElements.delete(element);
      const textContent = tracked.textContent.trim();
      for (const callback of tracked.callbacks) {
        callback({ ...element, textContent }, state);
      }
    }
  });
}
440
+
441
+ //#endregion
442
+ export { BLOCKQUOTE_SPACING, DEFAULT_BLOCK_SPACING, ELEMENT_NODE, HTML_ENTITIES, LIST_ITEM_SPACING, MARKDOWN_CODE_BLOCK, MARKDOWN_EMPHASIS, MARKDOWN_HORIZONTAL_RULE, MARKDOWN_INLINE_CODE, MARKDOWN_STRIKETHROUGH, MARKDOWN_STRONG, MAX_TAG_ID, NO_SPACING, NodeEventEnter, NodeEventExit, TABLE_ROW_SPACING, TAG_A, TAG_ABBR, TAG_ADDRESS, TAG_AREA, TAG_ARTICLE, TAG_ASIDE, TAG_AUDIO, TAG_B, TAG_BASE, TAG_BDO, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, TAG_CANVAS, TAG_CAPTION, TAG_CENTER, TAG_CITE, TAG_CODE, TAG_COL, TAG_DD, TAG_DEL, TAG_DETAILS, TAG_DFN, TAG_DIALOG, TAG_DIV, TAG_DL, TAG_DT, TAG_EM, TAG_EMBED, TAG_FIELDSET, TAG_FIGCAPTION, TAG_FIGURE, TAG_FOOTER, TAG_FORM, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD, TAG_HEADER, TAG_HR, TAG_HTML, TAG_I, TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_INS, TAG_KBD, TAG_KEYGEN, TAG_LABEL, TAG_LEGEND, TAG_LI, TAG_LINK, TAG_MAIN, TAG_MAP, TAG_MARK, TAG_META, TAG_METER, TAG_NAV, TAG_NOFRAMES, TAG_NOSCRIPT, TAG_OBJECT, TAG_OL, TAG_OPTION, TAG_P, TAG_PARAM, TAG_PLAINTEXT, TAG_PRE, TAG_PROGRESS, TAG_Q, TAG_RP, TAG_RT, TAG_RUBY, TAG_SAMP, TAG_SCRIPT, TAG_SECTION, TAG_SELECT, TAG_SMALL, TAG_SOURCE, TAG_SPAN, TAG_STRONG, TAG_STYLE, TAG_SUB, TAG_SUMMARY, TAG_SUP, TAG_SVG, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEMPLATE, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TIME, TAG_TITLE, TAG_TR, TAG_TRACK, TAG_U, TAG_UL, TAG_VAR, TAG_VIDEO, TAG_WBR, TAG_XMP, TEXT_NODE, TagIdMap, assembleBufferedContent, collectNodeContent, createBufferRegion, createPlugin, extractionPlugin, parseSelector };
@@ -1,5 +1,5 @@
1
- import { TAG_ASIDE, TAG_BUTTON, TAG_EMBED, TAG_FIELDSET, TAG_FIGURE, TAG_FOOTER, TAG_FORM, TAG_IFRAME, TAG_INPUT, TAG_NAV, TAG_OBJECT, TAG_SELECT, TAG_TEXTAREA } from "./plugin-Bqz9GKOA.mjs";
2
- import { filterPlugin, frontmatterPlugin, isolateMainPlugin, tailwindPlugin } from "./plugins-D305pIpW.mjs";
1
+ import { TAG_ASIDE, TAG_BUTTON, TAG_EMBED, TAG_FIELDSET, TAG_FIGURE, TAG_FOOTER, TAG_FORM, TAG_IFRAME, TAG_INPUT, TAG_NAV, TAG_OBJECT, TAG_SELECT, TAG_TEXTAREA } from "./extraction-D28Kr1J3.mjs";
2
+ import { filterPlugin, frontmatterPlugin, isolateMainPlugin, tailwindPlugin } from "./plugins-DXY-fo9h.mjs";
3
3
 
4
4
  //#region src/preset/minimal.ts
5
5
  /**
@@ -1,148 +1,5 @@
1
- import { ELEMENT_NODE, TAG_A, TAG_ADDRESS, TAG_ARTICLE, TAG_ASIDE, TAG_AUDIO, TAG_B, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, TAG_CAPTION, TAG_CODE, TAG_DD, TAG_DETAILS, TAG_DIV, TAG_DL, TAG_DT, TAG_EM, TAG_EMBED, TAG_FIELDSET, TAG_FIGCAPTION, TAG_FIGURE, TAG_FOOTER, TAG_FORM, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD, TAG_HEADER, TAG_HR, TAG_HTML, TAG_I, TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_LI, TAG_MAIN, TAG_META, TAG_NAV, TAG_OBJECT, TAG_OL, TAG_P, TAG_PRE, TAG_SCRIPT, TAG_SECTION, TAG_SELECT, TAG_SPAN, TAG_STRONG, TAG_STYLE, TAG_SUMMARY, TAG_SVG, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TITLE, TAG_TR, TAG_UL, TAG_VIDEO, TEXT_NODE, collectNodeContent, createBufferRegion, createPlugin } from "./plugin-Bqz9GKOA.mjs";
1
+ import { ELEMENT_NODE, TAG_A, TAG_ADDRESS, TAG_ARTICLE, TAG_ASIDE, TAG_AUDIO, TAG_B, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, TAG_CAPTION, TAG_CODE, TAG_DD, TAG_DETAILS, TAG_DIV, TAG_DL, TAG_DT, TAG_EM, TAG_EMBED, TAG_FIELDSET, TAG_FIGCAPTION, TAG_FIGURE, TAG_FOOTER, TAG_FORM, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD, TAG_HEADER, TAG_HR, TAG_HTML, TAG_I, TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_LI, TAG_MAIN, TAG_META, TAG_NAV, TAG_OBJECT, TAG_OL, TAG_P, TAG_PRE, TAG_SCRIPT, TAG_SECTION, TAG_SELECT, TAG_SPAN, TAG_STRONG, TAG_STYLE, TAG_SUMMARY, TAG_SVG, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TITLE, TAG_TR, TAG_UL, TAG_VIDEO, TEXT_NODE, collectNodeContent, createBufferRegion, createPlugin, parseSelector } from "./extraction-D28Kr1J3.mjs";
2
2
 
3
- //#region src/libs/query-selector.ts
4
- /**
5
- * Creates a tag selector matcher (e.g., 'div', 'p', 'h1')
6
- */
7
- function createTagSelector(tagName) {
8
- return {
9
- matches: (element) => element.name === tagName,
10
- toString: () => tagName
11
- };
12
- }
13
- /**
14
- * Creates an ID selector matcher (e.g., '#main', '#content')
15
- */
16
- function createIdSelector(selector) {
17
- const id = selector.slice(1);
18
- return {
19
- matches: (element) => element.attributes?.id === id,
20
- toString: () => `#${id}`
21
- };
22
- }
23
- /**
24
- * Creates a class selector matcher (e.g., '.container', '.header')
25
- */
26
- function createClassSelector(selector) {
27
- const className = selector.slice(1);
28
- return {
29
- matches: (element) => {
30
- if (!element.attributes?.class) return false;
31
- const classes = element.attributes.class.trim().split(" ").filter(Boolean);
32
- return classes.includes(className);
33
- },
34
- toString: () => `.${className}`
35
- };
36
- }
37
- /**
38
- * Creates an attribute selector matcher (e.g., '[data-id]', '[href="https://example.com"]')
39
- */
40
- function createAttributeSelector(selector) {
41
- const match = selector.match(/\[([^\]=~|^$*]+)(?:([=~|^$*]+)["']?([^"'\]]+)["']?)?\]/);
42
- const attrName = match ? match[1] : selector.slice(1, -1);
43
- const operator = match?.[2];
44
- const attrValue = match?.[3];
45
- return {
46
- matches: (element) => {
47
- if (!(attrName in (element.attributes || {}))) return false;
48
- if (!operator || !attrValue) return true;
49
- const value = element.attributes[attrName];
50
- switch (operator) {
51
- case "=": return value === attrValue;
52
- case "^=": return value.startsWith(attrValue);
53
- case "$=": return value.endsWith(attrValue);
54
- case "*=": return value.includes(attrValue);
55
- case "~=": return value.trim().split(" ").filter(Boolean).includes(attrValue);
56
- case "|=": return value === attrValue || value.startsWith(`${attrValue}-`);
57
- default: return false;
58
- }
59
- },
60
- toString: () => {
61
- if (!operator || !attrValue) return `[${attrName}]`;
62
- return `[${attrName}${operator}${attrValue}]`;
63
- }
64
- };
65
- }
66
- /**
67
- * Creates a compound selector that combines multiple selectors (e.g., 'div.container', 'h1#title')
68
- */
69
- function createCompoundSelector(selectors) {
70
- return {
71
- matches: (element) => selectors.every((selector) => selector.matches(element)),
72
- toString: () => selectors.map((s) => s.toString()).join("")
73
- };
74
- }
75
- /**
76
- * Parses a CSS selector into a matcher
77
- */
78
- function parseSelector(selector) {
79
- selector = selector.trim();
80
- if (!selector) throw new Error("Empty selector");
81
- const selectorParts = [];
82
- let current = "";
83
- let inAttribute = false;
84
- for (let i = 0; i < selector.length; i++) {
85
- const char = selector[i];
86
- if ((char === "." || char === "#" || char === "[") && current) {
87
- if (current[0] === ".") selectorParts.push(createClassSelector(current));
88
- else if (current[0] === "#") selectorParts.push(createIdSelector(current));
89
- else if (current[0] === "[") selectorParts.push(createAttributeSelector(current));
90
- else selectorParts.push(createTagSelector(current));
91
- current = char;
92
- } else current += char;
93
- if (char === "[") inAttribute = true;
94
- if (char === "]") inAttribute = false;
95
- if (inAttribute && char !== "[") {}
96
- }
97
- if (current) if (current[0] === ".") selectorParts.push(createClassSelector(current));
98
- else if (current[0] === "#") selectorParts.push(createIdSelector(current));
99
- else if (current[0] === "[") selectorParts.push(createAttributeSelector(current));
100
- else selectorParts.push(createTagSelector(current));
101
- if (selectorParts.length === 1) return selectorParts[0];
102
- return createCompoundSelector(selectorParts);
103
- }
104
-
105
- //#endregion
106
- //#region src/plugins/extraction.ts
107
- function extractionPlugin(selectors) {
108
- const matcherCallbacks = Object.entries(selectors).map(([selector, callback]) => ({
109
- matcher: parseSelector(selector),
110
- callback
111
- }));
112
- const trackedElements = new Map();
113
- return createPlugin({
114
- onNodeEnter(element) {
115
- matcherCallbacks.forEach(({ matcher, callback }) => {
116
- if (matcher.matches(element)) trackedElements.set(element, {
117
- textContent: "",
118
- callback
119
- });
120
- });
121
- },
122
- processTextNode(textNode) {
123
- let currentParent = textNode.parent;
124
- while (currentParent) {
125
- const tracked = trackedElements.get(currentParent);
126
- if (tracked) tracked.textContent += textNode.value;
127
- currentParent = currentParent.parent;
128
- }
129
- return void 0;
130
- },
131
- onNodeExit(element, state) {
132
- const tracked = trackedElements.get(element);
133
- if (tracked) {
134
- const extractedElement = {
135
- ...element,
136
- textContent: tracked.textContent.trim()
137
- };
138
- tracked.callback(extractedElement, state);
139
- trackedElements.delete(element);
140
- }
141
- }
142
- });
143
- }
144
-
145
- //#endregion
146
3
  //#region src/plugins/filter.ts
147
4
  /**
148
5
  * Plugin that filters nodes based on CSS selectors.
@@ -841,4 +698,4 @@ function tailwindPlugin() {
841
698
  }
842
699
 
843
700
  //#endregion
844
- export { extractionPlugin, filterPlugin, frontmatterPlugin, isolateMainPlugin, readabilityPlugin, tailwindPlugin };
701
+ export { filterPlugin, frontmatterPlugin, isolateMainPlugin, readabilityPlugin, tailwindPlugin };
@@ -1,4 +1,7 @@
1
- import { BLOCKQUOTE_SPACING, DEFAULT_BLOCK_SPACING, ELEMENT_NODE, HTML_ENTITIES, LIST_ITEM_SPACING, MARKDOWN_CODE_BLOCK, MARKDOWN_EMPHASIS, MARKDOWN_HORIZONTAL_RULE, MARKDOWN_INLINE_CODE, MARKDOWN_STRIKETHROUGH, MARKDOWN_STRONG, MAX_TAG_ID, NO_SPACING, NodeEventEnter, NodeEventExit, TABLE_ROW_SPACING, TAG_A, TAG_ABBR, TAG_ADDRESS, TAG_AREA, TAG_ASIDE, TAG_AUDIO, TAG_B, TAG_BASE, TAG_BDO, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, TAG_CANVAS, TAG_CENTER, TAG_CITE, TAG_CODE, TAG_COL, TAG_DD, TAG_DEL, TAG_DETAILS, TAG_DFN, TAG_DIALOG, TAG_DIV, TAG_DL, TAG_DT, TAG_EM, TAG_EMBED, TAG_FIELDSET, TAG_FOOTER, TAG_FORM, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD, TAG_HR, TAG_I, TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_INS, TAG_KBD, TAG_KEYGEN, TAG_LABEL, TAG_LEGEND, TAG_LI, TAG_LINK, TAG_MAP, TAG_MARK, TAG_META, TAG_METER, TAG_NAV, TAG_NOFRAMES, TAG_NOSCRIPT, TAG_OL, TAG_OPTION, TAG_P, TAG_PARAM, TAG_PLAINTEXT, TAG_PRE, TAG_PROGRESS, TAG_Q, TAG_RP, TAG_RT, TAG_RUBY, TAG_SAMP, TAG_SCRIPT, TAG_SELECT, TAG_SMALL, TAG_SOURCE, TAG_SPAN, TAG_STRONG, TAG_STYLE, TAG_SUB, TAG_SUMMARY, TAG_SUP, TAG_SVG, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEMPLATE, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TIME, TAG_TITLE, TAG_TR, TAG_TRACK, TAG_U, TAG_UL, TAG_VAR, TAG_VIDEO, TAG_WBR, TAG_XMP, TEXT_NODE, TagIdMap, assembleBufferedContent, collectNodeContent } from "./plugin-Bqz9GKOA.mjs";
1
+ import { BLOCKQUOTE_SPACING, DEFAULT_BLOCK_SPACING, ELEMENT_NODE, HTML_ENTITIES, LIST_ITEM_SPACING, MARKDOWN_CODE_BLOCK, MARKDOWN_EMPHASIS, MARKDOWN_HORIZONTAL_RULE, MARKDOWN_INLINE_CODE, MARKDOWN_STRIKETHROUGH, MARKDOWN_STRONG, MAX_TAG_ID, NO_SPACING, NodeEventEnter, NodeEventExit, TABLE_ROW_SPACING, TAG_A, TAG_ABBR, TAG_ADDRESS, TAG_AREA, TAG_ASIDE, TAG_AUDIO, TAG_B, TAG_BASE, TAG_BDO, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, TAG_CANVAS, TAG_CENTER, TAG_CITE, TAG_CODE, TAG_COL, TAG_DD, TAG_DEL, TAG_DETAILS, TAG_DFN, TAG_DIALOG, TAG_DIV, TAG_DL, TAG_DT, TAG_EM, TAG_EMBED, TAG_FIELDSET, TAG_FOOTER, TAG_FORM, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD, TAG_HR, TAG_I, TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_INS, TAG_KBD, TAG_KEYGEN, TAG_LABEL, TAG_LEGEND, TAG_LI, TAG_LINK, TAG_MAP, TAG_MARK, TAG_META, TAG_METER, TAG_NAV, TAG_NOFRAMES, TAG_NOSCRIPT, TAG_OL, TAG_OPTION, TAG_P, TAG_PARAM, TAG_PLAINTEXT, TAG_PRE, TAG_PROGRESS, TAG_Q, TAG_RP, TAG_RT, TAG_RUBY, TAG_SAMP, TAG_SCRIPT, TAG_SELECT, TAG_SMALL, TAG_SOURCE, TAG_SPAN, TAG_STRONG, TAG_STYLE, TAG_SUB, TAG_SUMMARY, TAG_SUP, TAG_SVG, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEMPLATE, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TIME, TAG_TITLE, TAG_TR, TAG_TRACK, TAG_U, TAG_UL, TAG_VAR, TAG_VIDEO, TAG_WBR, TAG_XMP, TEXT_NODE, TagIdMap, assembleBufferedContent, collectNodeContent, extractionPlugin } from "./extraction-D28Kr1J3.mjs";
2
+ import { readFile } from "node:fs/promises";
3
+ import { basename, dirname, relative } from "node:path";
4
+ import { glob } from "tinyglobby";
2
5
 
3
6
  //#region src/tags.ts
4
7
  function resolveUrl(url, origin) {
@@ -1382,6 +1385,164 @@ function createMarkdownProcessor(options = {}) {
1382
1385
  }
1383
1386
  const MarkdownProcessor = createMarkdownProcessor;
1384
1387
 
1388
+ //#endregion
1389
+ //#region src/llms-txt.ts
1390
/**
 * Collect page metadata by running the HTML through the converter with an
 * extraction plugin attached.
 *
 * For every field the first non-empty value encountered wins: the title comes
 * from <title> text or the og:title meta content, the description from the
 * description or og:description meta content, keywords and author from their
 * meta content. Fields that were never found are returned as `undefined`.
 */
function extractMetadata(html, url) {
  const found = {
    title: "",
    description: "",
    keywords: "",
    author: ""
  };
  // Handler factory: take the element's collected text for `field`.
  const fromText = (field) => (element) => {
    if (!found[field] && element.textContent) found[field] = element.textContent.trim();
  };
  // Handler factory: take the element's `content` attribute for `field`.
  const fromContent = (field) => (element) => {
    if (!found[field] && element.attributes?.content) found[field] = element.attributes.content.trim();
  };
  const extractionPluginInstance = extractionPlugin({
    "title": fromText("title"),
    "meta[name=\"description\"]": fromContent("description"),
    "meta[property=\"og:description\"]": fromContent("description"),
    "meta[name=\"keywords\"]": fromContent("keywords"),
    "meta[name=\"author\"]": fromContent("author"),
    "meta[property=\"og:title\"]": fromContent("title")
  });
  // The conversion result is discarded; the run only exists to drive the handlers above.
  htmlToMarkdown(html, {
    plugins: [extractionPluginInstance],
    origin: url
  });
  const { title, description, keywords, author } = found;
  return {
    title: title || void 0,
    description: description || void 0,
    keywords: keywords || void 0,
    author: author || void 0
  };
}
1429
/**
 * Convert a file path to a URL path.
 *
 * The path is made relative to `baseDir`, a trailing '.html' is removed, a
 * trailing '/index' is removed, and the result is prefixed with '/'. A
 * top-level 'index.html' becomes '/'.
 *
 * @param {string} filePath - Path of the HTML file.
 * @param {string} baseDir - Directory that corresponds to the site root.
 * @returns {string} URL path beginning with '/'.
 */
function pathToUrl(filePath, baseDir) {
  let url = relative(baseDir, filePath);
  // path.relative() returns backslash-separated paths on Windows; URL paths
  // always use '/', and the '/index' check below relies on that.
  if (process.platform === "win32") url = url.replace(/\\/g, "/");
  if (url.endsWith(".html")) url = url.slice(0, -5);
  if (url.endsWith("/index")) url = url.slice(0, -6);
  if (url === "index") return "/";
  if (!url.startsWith("/")) url = `/${url}`;
  return url;
}
1440
/**
 * Find the deepest directory shared by all given '/'-separated file paths.
 * Returns '.' for an empty list or when relative paths share no directory.
 */
function findCommonBaseDir(filePaths) {
  if (filePaths.length === 0) return ".";
  let shared = dirname(filePaths[0]).split("/");
  for (const filePath of filePaths) {
    const segments = dirname(filePath).split("/");
    let count = 0;
    while (count < shared.length && count < segments.length && shared[count] === segments[count]) count++;
    shared = shared.slice(0, count);
  }
  if (shared.length === 0) return ".";
  // A lone empty segment means the only shared part is the filesystem root.
  return shared.join("/") || "/";
}
/**
 * Process HTML files from glob patterns.
 *
 * Expands every pattern, de-duplicates the matched paths, then reads and
 * converts each file. A file that fails to read or convert is logged with
 * console.error and skipped, so one bad file does not abort the run.
 *
 * @param {string | string[]} patterns - Glob pattern(s) to expand.
 * @param {string} [origin] - Passed to the converter as its `origin` option;
 *   when omitted, the file path is passed to metadata extraction instead.
 * @returns {Promise<object[]>} One entry per processed file with `filePath`,
 *   `title` (metadata title, else the file name without '.html'), `content`
 *   (Markdown), `url` and `metadata`.
 */
async function processHtmlFiles(patterns, origin) {
  const allPatterns = Array.isArray(patterns) ? patterns : [patterns];
  const allFiles = [];
  for (const pattern of allPatterns) {
    const files = await glob(pattern);
    allFiles.push(...files);
  }
  const uniqueFiles = [...new Set(allFiles)];
  const results = [];
  // Use the directory common to ALL files. Taking it from the first file alone
  // breaks when that file sits in a subdirectory: shallower files then get
  // '../'-prefixed URLs.
  const baseDir = findCommonBaseDir(uniqueFiles);
  for (const filePath of uniqueFiles) {
    try {
      const html = await readFile(filePath, "utf-8");
      const metadata = extractMetadata(html, origin || filePath);
      const content = htmlToMarkdown(html, { origin });
      const url = pathToUrl(filePath, baseDir);
      results.push({
        filePath,
        title: metadata?.title || basename(filePath, ".html"),
        content,
        url,
        metadata
      });
    } catch (error) {
      console.error(`Error processing ${filePath}:`, error);
    }
  }
  return results;
}
1470
/**
 * Build the text of llms.txt: a '# <siteName>' heading (default 'Site'), an
 * optional '> <description>' block quote, and — when there are files — a
 * '## Pages' list with one Markdown link per file. A file's metadata
 * description, when present, is appended after the link, cut to 100
 * characters with '...' marking the cut.
 */
function generateLlmsTxtContent(files, options) {
  const { siteName = "Site", description, origin = "" } = options;
  const chunks = [`# ${siteName}\n\n`];
  if (description) chunks.push(`> ${description}\n\n`);
  if (files.length > 0) {
    chunks.push(`## Pages\n\n`);
    for (const file of files) {
      let line = `- [${file.title}](${origin + file.url})`;
      const desc = file.metadata?.description;
      if (desc) {
        const ellipsis = desc.length > 100 ? "..." : "";
        line += `: ${desc.substring(0, 100)}${ellipsis}`;
      }
      chunks.push(`${line}\n`);
    }
  }
  return chunks.join("");
}
1488
/**
 * Build the text of llms-full.txt: the same heading and optional block quote
 * as llms.txt, then a table of contents followed by every page's full
 * Markdown content, each page closed with a '---' rule.
 *
 * Table-of-contents anchors are the lower-cased title with every character
 * outside a-z and 0-9 replaced by '-'.
 */
function generateLlmsFullTxtContent(files, options) {
  const { siteName = "Site", description, origin = "" } = options;
  const header = description ? `# ${siteName}\n\n> ${description}\n\n` : `# ${siteName}\n\n`;
  if (!(files.length > 0)) return header;
  const tocEntries = files.map((file) => {
    const anchor = file.title.toLowerCase().replace(/[^a-z0-9]/g, "-");
    return `- [${file.title}](#${anchor})\n`;
  });
  const pageSections = files.map((file) => {
    const url = origin ? origin + file.url : file.url;
    return `## ${file.title}\n\n**URL:** ${url}\n\n${file.content}\n\n---\n\n`;
  });
  return `${header}## Table of Contents\n\n${tocEntries.join("")}\n---\n\n${pageSections.join("")}`;
}
1511
/**
 * Describe the per-page Markdown files to write: one `{ path, content }` entry
 * per input file. The root URL '/' maps to 'md/index.md'; every other URL maps
 * to 'md<url>.md'.
 */
function generateMarkdownFilesContent(files) {
  return Array.from(files, ({ url, content }) => ({
    path: url === "/" ? "md/index.md" : `md${url}.md`,
    content
  }));
}
1525
/**
 * Entry point for llms.txt generation.
 *
 * Uses `options.files` when given; otherwise expands `options.patterns` into
 * processed files; rejects when neither is provided. Always produces the
 * llms.txt text, and produces the full text and the Markdown file list only
 * when `generateFull` / `generateMarkdown` are set.
 */
async function generateLlmsTxtArtifacts(options) {
  const { files: providedFiles, patterns } = options;
  if (!providedFiles && !patterns) throw new Error("Either patterns or files must be provided");
  const files = providedFiles ? providedFiles : await processHtmlFiles(patterns, options.origin);
  return {
    llmsTxt: generateLlmsTxtContent(files, options),
    llmsFullTxt: options.generateFull ? generateLlmsFullTxtContent(files, options) : void 0,
    markdownFiles: options.generateMarkdown ? generateMarkdownFilesContent(files) : void 0,
    processedFiles: files
  };
}
1545
+
1385
1546
  //#endregion
1386
1547
  //#region src/stream.ts
1387
1548
  /**
@@ -1424,4 +1585,12 @@ async function* streamHtmlToMarkdown(htmlStream, options = {}) {
1424
1585
  }
1425
1586
 
1426
1587
  //#endregion
1427
- export { MarkdownProcessor, createMarkdownProcessor, parseHtml, streamHtmlToMarkdown };
1588
+ //#region src/index.ts
1589
/**
 * Convert an HTML string to Markdown in one call: create a processor with the
 * given options, feed it the HTML, and return the Markdown it produced.
 */
function htmlToMarkdown(html, options = {}) {
  const markdownProcessor = createMarkdownProcessor(options);
  markdownProcessor.processHtml(html);
  const markdown = markdownProcessor.getMarkdown();
  return markdown;
}
1594
+
1595
+ //#endregion
1596
+ export { MarkdownProcessor, generateLlmsTxtArtifacts, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
package/dist/cli.mjs CHANGED
@@ -1,9 +1,10 @@
1
- import "./_chunks/plugin-Bqz9GKOA.mjs";
2
- import { streamHtmlToMarkdown } from "./_chunks/stream-IeCVDuTy.mjs";
3
- import "./_chunks/plugins-D305pIpW.mjs";
4
- import { withMinimalPreset } from "./_chunks/minimal-Ru8PBNVI.mjs";
5
- import { readFileSync } from "node:fs";
1
+ import "./_chunks/extraction-D28Kr1J3.mjs";
2
+ import { generateLlmsTxtArtifacts, streamHtmlToMarkdown } from "./_chunks/src-Eo8j0-9L.mjs";
3
+ import "./_chunks/plugins-DXY-fo9h.mjs";
4
+ import { withMinimalPreset } from "./_chunks/minimal-CCnrG7a1.mjs";
5
+ import { mkdir, writeFile } from "node:fs/promises";
6
6
  import { dirname, join } from "node:path";
7
+ import { readFileSync } from "node:fs";
7
8
  import { Readable } from "node:stream";
8
9
  import { fileURLToPath } from "node:url";
9
10
  import { cac } from "cac";
@@ -16,6 +17,38 @@ async function streamingConvert(options = {}) {
16
17
  const markdownGenerator = streamHtmlToMarkdown(Readable.toWeb(process.stdin), conversionOptions);
17
18
  for await (const markdownChunk of markdownGenerator) if (markdownChunk && markdownChunk.length > 0) outputStream.write(markdownChunk);
18
19
  }
20
/**
 * CLI handler for the `llms` command: generates the requested llms.txt
 * artifacts from HTML files and writes them below `options.output`.
 *
 * @param {string[]} patterns - Glob patterns selecting the HTML files.
 * @param {object} options - Parsed CLI options: `output` (target directory),
 *   `artifacts` (comma-separated subset of 'llms.txt', 'llms-full.txt',
 *   'markdown'; all three when absent), `siteName`, `description`, `origin`.
 * @returns {Promise<void>} Resolves once all files are written. On any error
 *   the error is logged and the process exits with code 1.
 */
async function generateLlms(patterns, options) {
  try {
    const artifacts = options.artifacts
      ? options.artifacts.split(",").map((a) => a.trim())
      : ["llms.txt", "llms-full.txt", "markdown"];
    const wants = (artifact) => artifacts.includes(artifact);
    const result = await generateLlmsTxtArtifacts({
      patterns,
      siteName: options.siteName,
      description: options.description,
      origin: options.origin,
      generateFull: wants("llms-full.txt"),
      generateMarkdown: wants("markdown")
    });
    await mkdir(options.output, { recursive: true });
    // Honour the --artifacts list for llms.txt too; it used to be written
    // even when the caller asked for other artifacts only.
    if (wants("llms.txt")) {
      const llmsPath = join(options.output, "llms.txt");
      await writeFile(llmsPath, result.llmsTxt, "utf-8");
    }
    if (wants("llms-full.txt") && result.llmsFullTxt) {
      const fullPath = join(options.output, "llms-full.txt");
      await writeFile(fullPath, result.llmsFullTxt, "utf-8");
    }
    if (wants("markdown") && result.markdownFiles) {
      for (const mdFile of result.markdownFiles) {
        const fullPath = join(options.output, mdFile.path);
        // Markdown paths may be nested (e.g. md/docs/guide.md); create parents first.
        await mkdir(dirname(fullPath), { recursive: true });
        await writeFile(fullPath, mdFile.content, "utf-8");
      }
    }
  } catch (error) {
    console.error("❌ Error generating llms.txt:", error);
    process.exit(1);
  }
}
19
52
  const __dirname = dirname(fileURLToPath(import.meta.url));
20
53
  const packageJsonPath = join(__dirname, "..", "package.json");
21
54
  const packageJson = JSON.parse(readFileSync(packageJsonPath, "utf-8"));
@@ -24,6 +57,12 @@ const cli = cac();
24
57
  cli.command("[options]", "Convert HTML from stdin to Markdown on stdout").option("--origin <url>", "Origin URL for resolving relative image paths").option("--preset <preset>", "Conversion presets: minimal").action(async (_, opts) => {
25
58
  await streamingConvert(opts);
26
59
  });
60
// Register the `llms` sub-command: it takes one or more glob patterns and
// forwards them, together with the parsed options, to generateLlms.
cli
  .command("llms <patterns...>", "Generate llms.txt artifacts from HTML files")
  .option("--site-name <name>", "Name of the site for llms.txt header")
  .option("--description <desc>", "Description of the site for llms.txt")
  .option("--origin <url>", "Origin URL for resolving relative paths and generating absolute URLs")
  .option("-o, --output <dir>", "Output directory for generated files", { default: "." })
  .option("--artifacts <list>", "Comma-separated list of artifacts to generate: llms.txt,llms-full.txt,markdown", { default: "llms.txt,llms-full.txt,markdown" })
  .action(async (patterns, opts) => {
    const llmsOptions = { patterns, ...opts };
    await generateLlms(patterns, llmsOptions);
  });
27
66
  cli.help().version(version).parse();
28
67
 
29
68
  //#endregion
package/dist/index.d.mts CHANGED
@@ -6,6 +6,42 @@ import { ReadableStream } from "node:stream/web";
6
6
 
7
7
  declare const TagIdMap: Record<string, number>;
8
8
  //#endregion
9
+ //#region src/llms-txt.d.ts
10
/**
 * Options accepted by generateLlmsTxtArtifacts.
 * Either `files` or `patterns` must be provided; the function rejects otherwise.
 */
interface LlmsTxtArtifactsOptions {
  /** Glob pattern(s) selecting the HTML files to process. Used only when `files` is not given. */
  patterns?: string | string[];
  /** Already-processed files. When given they are used as-is and `patterns` is ignored. */
  files?: ProcessedFile[];
  /** Site name used for the top-level heading; 'Site' when omitted. */
  siteName?: string;
  /** Site description, emitted as a block quote under the heading when set. */
  description?: string;
  /** Prefix prepended to each page URL in the output; also passed to the HTML conversion as its `origin` option. */
  origin?: string;
  /** When truthy, the llms-full.txt text is generated as well. */
  generateFull?: boolean;
  /** When truthy, the list of per-page Markdown files is generated as well. */
  generateMarkdown?: boolean;
}
19
/** One page, as produced from an HTML file or supplied directly by the caller. */
interface ProcessedFile {
  /** Path of the HTML file the page was read from, when it came from a glob pattern. */
  filePath?: string;
  /** Page title used in link text and headings. */
  title: string;
  /** Markdown content of the page. */
  content: string;
  /** URL path of the page (e.g. '/' or '/docs/guide'). */
  url: string;
  /** Metadata extracted from the page's <title> and <meta> elements; fields that were not found are undefined. */
  metadata?: {
    title?: string;
    description?: string;
    keywords?: string;
    author?: string;
  };
}
31
/** Everything generateLlmsTxtArtifacts produces; nothing is written to disk by the function itself. */
interface LlmsTxtArtifactsResult {
  /** Text of llms.txt; always present. */
  llmsTxt: string;
  /** Text of llms-full.txt; present only when `generateFull` was set. */
  llmsFullTxt?: string;
  /** Per-page Markdown files (relative `path` plus `content`); present only when `generateMarkdown` was set. */
  markdownFiles?: {
    path: string;
    content: string;
  }[];
  /** The files the artifacts were generated from. */
  processedFiles: ProcessedFile[];
}
40
/**
 * Main function to process files and generate llms.txt artifacts.
 *
 * Uses `options.files` when provided, otherwise expands `options.patterns`.
 * The returned promise rejects with "Either patterns or files must be provided"
 * when neither is given.
 */
declare function generateLlmsTxtArtifacts(options: LlmsTxtArtifactsOptions): Promise<LlmsTxtArtifactsResult>;
44
+ //#endregion
9
45
  //#region src/markdown-processor.d.ts
10
46
  interface MarkdownState {
11
47
  /** Configuration options for conversion */
@@ -72,4 +108,4 @@ declare function streamHtmlToMarkdown(htmlStream: ReadableStream | null, options
72
108
  //#region src/index.d.ts
73
109
  declare function htmlToMarkdown(html: string, options?: HTMLToMarkdownOptions): string;
74
110
  //#endregion
75
- export { BufferRegion, ELEMENT_NODE, ElementNode, ExtractedElement, HTMLToMarkdownOptions, HandlerContext, MarkdownProcessor, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginContext, PluginCreationOptions, ReadabilityContext, TEXT_NODE, TagHandler, TagIdMap, TailwindContext, TextNode, createPlugin, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
111
+ export { BufferRegion, ELEMENT_NODE, ElementNode, ExtractedElement, HTMLToMarkdownOptions, HandlerContext, LlmsTxtArtifactsOptions, LlmsTxtArtifactsResult, MarkdownProcessor, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginContext, PluginCreationOptions, ProcessedFile, ReadabilityContext, TEXT_NODE, TagHandler, TagIdMap, TailwindContext, TextNode, createPlugin, generateLlmsTxtArtifacts, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
package/dist/index.mjs CHANGED
@@ -1,12 +1,4 @@
1
- import { TagIdMap, createPlugin } from "./_chunks/plugin-Bqz9GKOA.mjs";
2
- import { MarkdownProcessor, createMarkdownProcessor, parseHtml, streamHtmlToMarkdown } from "./_chunks/stream-IeCVDuTy.mjs";
1
+ import { TagIdMap, createPlugin } from "./_chunks/extraction-D28Kr1J3.mjs";
2
+ import { MarkdownProcessor, generateLlmsTxtArtifacts, htmlToMarkdown, parseHtml, streamHtmlToMarkdown } from "./_chunks/src-Eo8j0-9L.mjs";
3
3
 
4
- //#region src/index.ts
5
- function htmlToMarkdown(html, options = {}) {
6
- const processor = createMarkdownProcessor(options);
7
- processor.processHtml(html);
8
- return processor.getMarkdown();
9
- }
10
-
11
- //#endregion
12
- export { MarkdownProcessor, TagIdMap, createPlugin, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
4
+ export { MarkdownProcessor, TagIdMap, createPlugin, generateLlmsTxtArtifacts, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
package/dist/plugins.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { createPlugin } from "./_chunks/plugin-Bqz9GKOA.mjs";
2
- import { extractionPlugin, filterPlugin, frontmatterPlugin, isolateMainPlugin, readabilityPlugin, tailwindPlugin } from "./_chunks/plugins-D305pIpW.mjs";
1
+ import { createPlugin, extractionPlugin } from "./_chunks/extraction-D28Kr1J3.mjs";
2
+ import { filterPlugin, frontmatterPlugin, isolateMainPlugin, readabilityPlugin, tailwindPlugin } from "./_chunks/plugins-DXY-fo9h.mjs";
3
3
 
4
4
  export { createPlugin, extractionPlugin, filterPlugin, frontmatterPlugin, isolateMainPlugin, readabilityPlugin, tailwindPlugin };
@@ -1,5 +1,5 @@
1
- import "../_chunks/plugin-Bqz9GKOA.mjs";
2
- import "../_chunks/plugins-D305pIpW.mjs";
3
- import { withMinimalPreset } from "../_chunks/minimal-Ru8PBNVI.mjs";
1
+ import "../_chunks/extraction-D28Kr1J3.mjs";
2
+ import "../_chunks/plugins-DXY-fo9h.mjs";
3
+ import { withMinimalPreset } from "../_chunks/minimal-CCnrG7a1.mjs";
4
4
 
5
5
  export { withMinimalPreset };
package/package.json CHANGED
@@ -1,8 +1,8 @@
1
1
  {
2
2
  "name": "mdream",
3
3
  "type": "module",
4
- "version": "0.5.2",
5
- "description": "Ultra-performant JavaScript HTML to Markdown converter optimized for LLMs.",
4
+ "version": "0.6.0",
5
+ "description": "Ultra-performant HTML to Markdown Convertor Optimized for LLMs and llm.txt artifacts.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
8
8
  "email": "harlan@harlanzw.com",
@@ -53,7 +53,8 @@
53
53
  "dist"
54
54
  ],
55
55
  "dependencies": {
56
- "cac": "^6.7.14"
56
+ "cac": "^6.7.14",
57
+ "tinyglobby": "^0.2.10"
57
58
  },
58
59
  "scripts": {
59
60
  "flame": "pnpm build && unbuild bench/bundle && clinic flame -- node bench/bundle/dist/string.mjs 10",