mdream 0.5.2 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_chunks/{plugin-Bqz9GKOA.mjs → extraction-D28Kr1J3.mjs} +144 -1
- package/dist/_chunks/{minimal-Ru8PBNVI.mjs → minimal-CCnrG7a1.mjs} +2 -2
- package/dist/_chunks/{plugins-D305pIpW.mjs → plugins-DXY-fo9h.mjs} +2 -145
- package/dist/_chunks/{stream-IeCVDuTy.mjs → src-Eo8j0-9L.mjs} +171 -2
- package/dist/cli.mjs +44 -5
- package/dist/index.d.mts +37 -1
- package/dist/index.mjs +3 -11
- package/dist/plugins.mjs +2 -2
- package/dist/preset/minimal.mjs +3 -3
- package/package.json +4 -3
|
@@ -284,6 +284,109 @@ const BLOCKQUOTE_SPACING = [1, 1];
|
|
|
284
284
|
const LIST_ITEM_SPACING = [1, 0];
|
|
285
285
|
const TABLE_ROW_SPACING = [0, 1];
|
|
286
286
|
|
|
287
|
+
//#endregion
|
|
288
|
+
//#region src/libs/query-selector.ts
|
|
289
|
+
/**
 * Builds a matcher for a bare tag selector such as 'div', 'p' or 'h1'.
 *
 * @param {string} tagName - Tag name compared against `element.name` with strict equality (case-sensitive).
 * @returns {{ matches: (element: object) => boolean, toString: () => string }}
 *   Matcher object; `toString()` returns the tag name it was built from.
 */
function createTagSelector(tagName) {
	const tagMatcher = {
		matches(element) {
			return element.name === tagName;
		},
		toString() {
			return tagName;
		}
	};
	return tagMatcher;
}
|
|
298
|
+
/**
 * Builds a matcher for an ID selector such as '#main' or '#content'.
 *
 * @param {string} selector - Selector text including the leading '#'.
 * @returns {{ matches: (element: object) => boolean, toString: () => string }}
 *   Matcher that compares `element.attributes.id` with the text after '#'.
 */
function createIdSelector(selector) {
	// Everything after the leading '#' is the id to look for.
	const wantedId = selector.substring(1);
	return {
		matches(element) {
			const elementId = element.attributes?.id;
			return elementId === wantedId;
		},
		toString() {
			return `#${wantedId}`;
		}
	};
}
|
|
308
|
+
/**
 * Creates a class selector matcher (e.g., '.container', '.header').
 *
 * The element's `class` attribute is treated as a list of tokens separated by
 * any run of ASCII whitespace (space, tab, line feed, form feed, carriage
 * return), so class lists wrapped over several lines or indented with tabs
 * match as well as single-space separated ones.
 *
 * @param {string} selector - Selector text including the leading '.'.
 * @returns {{ matches: (element: object) => boolean, toString: () => string }}
 *   Matcher that is true when the class token list contains the class name.
 */
function createClassSelector(selector) {
	const className = selector.slice(1);
	return {
		matches: (element) => {
			const classAttr = element.attributes?.class;
			if (!classAttr) return false;
			// filter(Boolean) drops the empty tokens produced by leading/trailing whitespace.
			const classes = classAttr.split(/[\t\n\f\r ]+/).filter(Boolean);
			return classes.includes(className);
		},
		toString: () => `.${className}`
	};
}
|
|
322
|
+
/**
 * Creates an attribute selector matcher (e.g., '[data-id]', '[name="description"]').
 *
 * Supported forms are presence (`[attr]`) and the operators `=`, `^=`, `$=`,
 * `*=`, `~=` and `|=`. The value may be wrapped in single or double quotes.
 * When the selector text does not fit the expected pattern, everything between
 * the brackets is used as the attribute name and only presence is tested.
 *
 * @param {string} selector - Selector text including the surrounding brackets.
 * @returns {{ matches: (element: object) => boolean, toString: () => string }}
 *   Matcher for the attribute test; `toString()` renders the selector without quotes.
 */
function createAttributeSelector(selector) {
	const match = selector.match(/\[([^\]=~|^$*]+)(?:([=~|^$*]+)["']?([^"'\]]+)["']?)?\]/);
	const attrName = match ? match[1] : selector.slice(1, -1);
	const operator = match?.[2];
	const attrValue = match?.[3];
	return {
		matches: (element) => {
			const attributes = element.attributes;
			// Own-property check: `in` would also report inherited Object.prototype
			// members, making '[constructor]' or '[toString]' match every element.
			if (!attributes || !Object.prototype.hasOwnProperty.call(attributes, attrName)) return false;
			if (!operator || !attrValue) return true;
			const value = attributes[attrName];
			switch (operator) {
				case "=": return value === attrValue;
				case "^=": return value.startsWith(attrValue);
				case "$=": return value.endsWith(attrValue);
				case "*=": return value.includes(attrValue);
				// Word match: tokens are separated by any run of ASCII whitespace.
				case "~=": return value.split(/[\t\n\f\r ]+/).filter(Boolean).includes(attrValue);
				case "|=": return value === attrValue || value.startsWith(`${attrValue}-`);
				default: return false;
			}
		},
		toString: () => {
			if (!operator || !attrValue) return `[${attrName}]`;
			return `[${attrName}${operator}${attrValue}]`;
		}
	};
}
|
|
351
|
+
/**
 * Combines several matchers into one compound matcher (e.g. 'div.container', 'h1#title').
 *
 * @param {Array<{ matches: (element: object) => boolean, toString: () => string }>} selectors
 *   Matchers that must all accept an element.
 * @returns {{ matches: (element: object) => boolean, toString: () => string }}
 *   Matcher that is true only when every part matches; `toString()` is the
 *   concatenation of the parts' string forms.
 */
function createCompoundSelector(selectors) {
	return {
		matches(element) {
			// Stop at the first part that rejects the element.
			for (const part of selectors) {
				if (!part.matches(element)) return false;
			}
			return true;
		},
		toString() {
			const pieces = Array.from(selectors, (part) => part.toString());
			return pieces.join("");
		}
	};
}
|
|
360
|
+
/**
 * Parses a simple or compound CSS selector into a matcher.
 *
 * The text is cut into segments at each '.', '#' or '[' that appears outside
 * of an attribute selector's brackets; each segment becomes a class, id,
 * attribute or tag matcher depending on its first character. Characters
 * between '[' and ']' are never treated as segment boundaries, so values such
 * as `[href="https://example.com"]` or `[property="og:title"]` stay intact.
 * Whitespace and combinators are not interpreted.
 *
 * @param {string} selector - Selector text; surrounding whitespace is ignored.
 * @returns {{ matches: (element: object) => boolean, toString: () => string }}
 *   The single matcher when there is one segment, otherwise a compound matcher.
 * @throws {Error} "Empty selector" when the text is empty after trimming.
 */
function parseSelector(selector) {
	const source = selector.trim();
	if (!source) throw new Error("Empty selector");
	const selectorParts = [];
	// Turns one raw segment ('.box', '#id', '[x=y]' or 'div') into its matcher.
	const pushPart = (segment) => {
		if (segment[0] === ".") selectorParts.push(createClassSelector(segment));
		else if (segment[0] === "#") selectorParts.push(createIdSelector(segment));
		else if (segment[0] === "[") selectorParts.push(createAttributeSelector(segment));
		else selectorParts.push(createTagSelector(segment));
	};
	let current = "";
	let inAttribute = false;
	for (const char of source) {
		// A delimiter only starts a new segment when we are not inside [...].
		const startsSegment = !inAttribute && (char === "." || char === "#" || char === "[");
		if (startsSegment && current) {
			pushPart(current);
			current = char;
		} else current += char;
		if (char === "[") inAttribute = true;
		else if (char === "]") inAttribute = false;
	}
	if (current) pushPart(current);
	if (selectorParts.length === 1) return selectorParts[0];
	return createCompoundSelector(selectorParts);
}
|
|
389
|
+
|
|
287
390
|
//#endregion
|
|
288
391
|
//#region src/pluggable/plugin.ts
|
|
289
392
|
/**
|
|
@@ -296,4 +399,44 @@ function createPlugin(plugin) {
|
|
|
296
399
|
}
|
|
297
400
|
|
|
298
401
|
//#endregion
|
|
299
|
-
|
|
402
|
+
//#region src/plugins/extraction.ts
|
|
403
|
+
/**
 * Creates a plugin that reports elements matching the given selectors.
 *
 * Every selector key is parsed once with `parseSelector`. While an element that
 * matched is open, the value of each text node underneath it (at any depth) is
 * appended to its text buffer. When the element exits, every callback whose
 * selector matched is called, in the order the selectors were declared, with a
 * shallow copy of the element extended by the trimmed `textContent`.
 *
 * @param {Record<string, (element: object, state: unknown) => void>} selectors
 *   Map from selector text to the callback to run for matching elements.
 * @returns {object} Whatever `createPlugin` returns for the
 *   `onNodeEnter` / `processTextNode` / `onNodeExit` hooks defined here.
 */
function extractionPlugin(selectors) {
	const matcherCallbacks = Object.entries(selectors).map(([selector, callback]) => ({
		matcher: parseSelector(selector),
		callback
	}));
	// element -> { textContent, callbacks } for elements that are currently open.
	const trackedElements = new Map();
	return createPlugin({
		onNodeEnter(element) {
			for (const { matcher, callback } of matcherCallbacks) {
				if (!matcher.matches(element)) continue;
				const tracked = trackedElements.get(element);
				// Several selectors may match the same element; keep all of their
				// callbacks instead of letting the last match replace the others.
				if (tracked) tracked.callbacks.push(callback);
				else trackedElements.set(element, {
					textContent: "",
					callbacks: [callback]
				});
			}
		},
		processTextNode(textNode) {
			// Credit the text to every tracked ancestor, not just the direct parent.
			for (let ancestor = textNode.parent; ancestor; ancestor = ancestor.parent) {
				const tracked = trackedElements.get(ancestor);
				if (tracked) tracked.textContent += textNode.value;
			}
			return void 0;
		},
		onNodeExit(element, state) {
			const tracked = trackedElements.get(element);
			if (!tracked) return;
			// Remove the entry first so a throwing callback cannot leave it behind.
			trackedElements.delete(element);
			const textContent = tracked.textContent.trim();
			for (const callback of tracked.callbacks) callback({
				...element,
				textContent
			}, state);
		}
	});
}
|
|
440
|
+
|
|
441
|
+
//#endregion
|
|
442
|
+
export { BLOCKQUOTE_SPACING, DEFAULT_BLOCK_SPACING, ELEMENT_NODE, HTML_ENTITIES, LIST_ITEM_SPACING, MARKDOWN_CODE_BLOCK, MARKDOWN_EMPHASIS, MARKDOWN_HORIZONTAL_RULE, MARKDOWN_INLINE_CODE, MARKDOWN_STRIKETHROUGH, MARKDOWN_STRONG, MAX_TAG_ID, NO_SPACING, NodeEventEnter, NodeEventExit, TABLE_ROW_SPACING, TAG_A, TAG_ABBR, TAG_ADDRESS, TAG_AREA, TAG_ARTICLE, TAG_ASIDE, TAG_AUDIO, TAG_B, TAG_BASE, TAG_BDO, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, TAG_CANVAS, TAG_CAPTION, TAG_CENTER, TAG_CITE, TAG_CODE, TAG_COL, TAG_DD, TAG_DEL, TAG_DETAILS, TAG_DFN, TAG_DIALOG, TAG_DIV, TAG_DL, TAG_DT, TAG_EM, TAG_EMBED, TAG_FIELDSET, TAG_FIGCAPTION, TAG_FIGURE, TAG_FOOTER, TAG_FORM, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD, TAG_HEADER, TAG_HR, TAG_HTML, TAG_I, TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_INS, TAG_KBD, TAG_KEYGEN, TAG_LABEL, TAG_LEGEND, TAG_LI, TAG_LINK, TAG_MAIN, TAG_MAP, TAG_MARK, TAG_META, TAG_METER, TAG_NAV, TAG_NOFRAMES, TAG_NOSCRIPT, TAG_OBJECT, TAG_OL, TAG_OPTION, TAG_P, TAG_PARAM, TAG_PLAINTEXT, TAG_PRE, TAG_PROGRESS, TAG_Q, TAG_RP, TAG_RT, TAG_RUBY, TAG_SAMP, TAG_SCRIPT, TAG_SECTION, TAG_SELECT, TAG_SMALL, TAG_SOURCE, TAG_SPAN, TAG_STRONG, TAG_STYLE, TAG_SUB, TAG_SUMMARY, TAG_SUP, TAG_SVG, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEMPLATE, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TIME, TAG_TITLE, TAG_TR, TAG_TRACK, TAG_U, TAG_UL, TAG_VAR, TAG_VIDEO, TAG_WBR, TAG_XMP, TEXT_NODE, TagIdMap, assembleBufferedContent, collectNodeContent, createBufferRegion, createPlugin, extractionPlugin, parseSelector };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { TAG_ASIDE, TAG_BUTTON, TAG_EMBED, TAG_FIELDSET, TAG_FIGURE, TAG_FOOTER, TAG_FORM, TAG_IFRAME, TAG_INPUT, TAG_NAV, TAG_OBJECT, TAG_SELECT, TAG_TEXTAREA } from "./
|
|
2
|
-
import { filterPlugin, frontmatterPlugin, isolateMainPlugin, tailwindPlugin } from "./plugins-
|
|
1
|
+
import { TAG_ASIDE, TAG_BUTTON, TAG_EMBED, TAG_FIELDSET, TAG_FIGURE, TAG_FOOTER, TAG_FORM, TAG_IFRAME, TAG_INPUT, TAG_NAV, TAG_OBJECT, TAG_SELECT, TAG_TEXTAREA } from "./extraction-D28Kr1J3.mjs";
|
|
2
|
+
import { filterPlugin, frontmatterPlugin, isolateMainPlugin, tailwindPlugin } from "./plugins-DXY-fo9h.mjs";
|
|
3
3
|
|
|
4
4
|
//#region src/preset/minimal.ts
|
|
5
5
|
/**
|
|
@@ -1,148 +1,5 @@
|
|
|
1
|
-
import { ELEMENT_NODE, TAG_A, TAG_ADDRESS, TAG_ARTICLE, TAG_ASIDE, TAG_AUDIO, TAG_B, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, TAG_CAPTION, TAG_CODE, TAG_DD, TAG_DETAILS, TAG_DIV, TAG_DL, TAG_DT, TAG_EM, TAG_EMBED, TAG_FIELDSET, TAG_FIGCAPTION, TAG_FIGURE, TAG_FOOTER, TAG_FORM, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD, TAG_HEADER, TAG_HR, TAG_HTML, TAG_I, TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_LI, TAG_MAIN, TAG_META, TAG_NAV, TAG_OBJECT, TAG_OL, TAG_P, TAG_PRE, TAG_SCRIPT, TAG_SECTION, TAG_SELECT, TAG_SPAN, TAG_STRONG, TAG_STYLE, TAG_SUMMARY, TAG_SVG, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TITLE, TAG_TR, TAG_UL, TAG_VIDEO, TEXT_NODE, collectNodeContent, createBufferRegion, createPlugin } from "./
|
|
1
|
+
import { ELEMENT_NODE, TAG_A, TAG_ADDRESS, TAG_ARTICLE, TAG_ASIDE, TAG_AUDIO, TAG_B, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, TAG_CAPTION, TAG_CODE, TAG_DD, TAG_DETAILS, TAG_DIV, TAG_DL, TAG_DT, TAG_EM, TAG_EMBED, TAG_FIELDSET, TAG_FIGCAPTION, TAG_FIGURE, TAG_FOOTER, TAG_FORM, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD, TAG_HEADER, TAG_HR, TAG_HTML, TAG_I, TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_LI, TAG_MAIN, TAG_META, TAG_NAV, TAG_OBJECT, TAG_OL, TAG_P, TAG_PRE, TAG_SCRIPT, TAG_SECTION, TAG_SELECT, TAG_SPAN, TAG_STRONG, TAG_STYLE, TAG_SUMMARY, TAG_SVG, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TITLE, TAG_TR, TAG_UL, TAG_VIDEO, TEXT_NODE, collectNodeContent, createBufferRegion, createPlugin, parseSelector } from "./extraction-D28Kr1J3.mjs";
|
|
2
2
|
|
|
3
|
-
//#region src/libs/query-selector.ts
|
|
4
|
-
/**
|
|
5
|
-
* Creates a tag selector matcher (e.g., 'div', 'p', 'h1')
|
|
6
|
-
*/
|
|
7
|
-
function createTagSelector(tagName) {
|
|
8
|
-
return {
|
|
9
|
-
matches: (element) => element.name === tagName,
|
|
10
|
-
toString: () => tagName
|
|
11
|
-
};
|
|
12
|
-
}
|
|
13
|
-
/**
|
|
14
|
-
* Creates an ID selector matcher (e.g., '#main', '#content')
|
|
15
|
-
*/
|
|
16
|
-
function createIdSelector(selector) {
|
|
17
|
-
const id = selector.slice(1);
|
|
18
|
-
return {
|
|
19
|
-
matches: (element) => element.attributes?.id === id,
|
|
20
|
-
toString: () => `#${id}`
|
|
21
|
-
};
|
|
22
|
-
}
|
|
23
|
-
/**
|
|
24
|
-
* Creates a class selector matcher (e.g., '.container', '.header')
|
|
25
|
-
*/
|
|
26
|
-
function createClassSelector(selector) {
|
|
27
|
-
const className = selector.slice(1);
|
|
28
|
-
return {
|
|
29
|
-
matches: (element) => {
|
|
30
|
-
if (!element.attributes?.class) return false;
|
|
31
|
-
const classes = element.attributes.class.trim().split(" ").filter(Boolean);
|
|
32
|
-
return classes.includes(className);
|
|
33
|
-
},
|
|
34
|
-
toString: () => `.${className}`
|
|
35
|
-
};
|
|
36
|
-
}
|
|
37
|
-
/**
|
|
38
|
-
* Creates an attribute selector matcher (e.g., '[data-id]', '[href="https://example.com"]')
|
|
39
|
-
*/
|
|
40
|
-
function createAttributeSelector(selector) {
|
|
41
|
-
const match = selector.match(/\[([^\]=~|^$*]+)(?:([=~|^$*]+)["']?([^"'\]]+)["']?)?\]/);
|
|
42
|
-
const attrName = match ? match[1] : selector.slice(1, -1);
|
|
43
|
-
const operator = match?.[2];
|
|
44
|
-
const attrValue = match?.[3];
|
|
45
|
-
return {
|
|
46
|
-
matches: (element) => {
|
|
47
|
-
if (!(attrName in (element.attributes || {}))) return false;
|
|
48
|
-
if (!operator || !attrValue) return true;
|
|
49
|
-
const value = element.attributes[attrName];
|
|
50
|
-
switch (operator) {
|
|
51
|
-
case "=": return value === attrValue;
|
|
52
|
-
case "^=": return value.startsWith(attrValue);
|
|
53
|
-
case "$=": return value.endsWith(attrValue);
|
|
54
|
-
case "*=": return value.includes(attrValue);
|
|
55
|
-
case "~=": return value.trim().split(" ").filter(Boolean).includes(attrValue);
|
|
56
|
-
case "|=": return value === attrValue || value.startsWith(`${attrValue}-`);
|
|
57
|
-
default: return false;
|
|
58
|
-
}
|
|
59
|
-
},
|
|
60
|
-
toString: () => {
|
|
61
|
-
if (!operator || !attrValue) return `[${attrName}]`;
|
|
62
|
-
return `[${attrName}${operator}${attrValue}]`;
|
|
63
|
-
}
|
|
64
|
-
};
|
|
65
|
-
}
|
|
66
|
-
/**
|
|
67
|
-
* Creates a compound selector that combines multiple selectors (e.g., 'div.container', 'h1#title')
|
|
68
|
-
*/
|
|
69
|
-
function createCompoundSelector(selectors) {
|
|
70
|
-
return {
|
|
71
|
-
matches: (element) => selectors.every((selector) => selector.matches(element)),
|
|
72
|
-
toString: () => selectors.map((s) => s.toString()).join("")
|
|
73
|
-
};
|
|
74
|
-
}
|
|
75
|
-
/**
|
|
76
|
-
* Parses a CSS selector into a matcher
|
|
77
|
-
*/
|
|
78
|
-
function parseSelector(selector) {
|
|
79
|
-
selector = selector.trim();
|
|
80
|
-
if (!selector) throw new Error("Empty selector");
|
|
81
|
-
const selectorParts = [];
|
|
82
|
-
let current = "";
|
|
83
|
-
let inAttribute = false;
|
|
84
|
-
for (let i = 0; i < selector.length; i++) {
|
|
85
|
-
const char = selector[i];
|
|
86
|
-
if ((char === "." || char === "#" || char === "[") && current) {
|
|
87
|
-
if (current[0] === ".") selectorParts.push(createClassSelector(current));
|
|
88
|
-
else if (current[0] === "#") selectorParts.push(createIdSelector(current));
|
|
89
|
-
else if (current[0] === "[") selectorParts.push(createAttributeSelector(current));
|
|
90
|
-
else selectorParts.push(createTagSelector(current));
|
|
91
|
-
current = char;
|
|
92
|
-
} else current += char;
|
|
93
|
-
if (char === "[") inAttribute = true;
|
|
94
|
-
if (char === "]") inAttribute = false;
|
|
95
|
-
if (inAttribute && char !== "[") {}
|
|
96
|
-
}
|
|
97
|
-
if (current) if (current[0] === ".") selectorParts.push(createClassSelector(current));
|
|
98
|
-
else if (current[0] === "#") selectorParts.push(createIdSelector(current));
|
|
99
|
-
else if (current[0] === "[") selectorParts.push(createAttributeSelector(current));
|
|
100
|
-
else selectorParts.push(createTagSelector(current));
|
|
101
|
-
if (selectorParts.length === 1) return selectorParts[0];
|
|
102
|
-
return createCompoundSelector(selectorParts);
|
|
103
|
-
}
|
|
104
|
-
|
|
105
|
-
//#endregion
|
|
106
|
-
//#region src/plugins/extraction.ts
|
|
107
|
-
function extractionPlugin(selectors) {
|
|
108
|
-
const matcherCallbacks = Object.entries(selectors).map(([selector, callback]) => ({
|
|
109
|
-
matcher: parseSelector(selector),
|
|
110
|
-
callback
|
|
111
|
-
}));
|
|
112
|
-
const trackedElements = new Map();
|
|
113
|
-
return createPlugin({
|
|
114
|
-
onNodeEnter(element) {
|
|
115
|
-
matcherCallbacks.forEach(({ matcher, callback }) => {
|
|
116
|
-
if (matcher.matches(element)) trackedElements.set(element, {
|
|
117
|
-
textContent: "",
|
|
118
|
-
callback
|
|
119
|
-
});
|
|
120
|
-
});
|
|
121
|
-
},
|
|
122
|
-
processTextNode(textNode) {
|
|
123
|
-
let currentParent = textNode.parent;
|
|
124
|
-
while (currentParent) {
|
|
125
|
-
const tracked = trackedElements.get(currentParent);
|
|
126
|
-
if (tracked) tracked.textContent += textNode.value;
|
|
127
|
-
currentParent = currentParent.parent;
|
|
128
|
-
}
|
|
129
|
-
return void 0;
|
|
130
|
-
},
|
|
131
|
-
onNodeExit(element, state) {
|
|
132
|
-
const tracked = trackedElements.get(element);
|
|
133
|
-
if (tracked) {
|
|
134
|
-
const extractedElement = {
|
|
135
|
-
...element,
|
|
136
|
-
textContent: tracked.textContent.trim()
|
|
137
|
-
};
|
|
138
|
-
tracked.callback(extractedElement, state);
|
|
139
|
-
trackedElements.delete(element);
|
|
140
|
-
}
|
|
141
|
-
}
|
|
142
|
-
});
|
|
143
|
-
}
|
|
144
|
-
|
|
145
|
-
//#endregion
|
|
146
3
|
//#region src/plugins/filter.ts
|
|
147
4
|
/**
|
|
148
5
|
* Plugin that filters nodes based on CSS selectors.
|
|
@@ -841,4 +698,4 @@ function tailwindPlugin() {
|
|
|
841
698
|
}
|
|
842
699
|
|
|
843
700
|
//#endregion
|
|
844
|
-
export {
|
|
701
|
+
export { filterPlugin, frontmatterPlugin, isolateMainPlugin, readabilityPlugin, tailwindPlugin };
|
|
@@ -1,4 +1,7 @@
|
|
|
1
|
-
import { BLOCKQUOTE_SPACING, DEFAULT_BLOCK_SPACING, ELEMENT_NODE, HTML_ENTITIES, LIST_ITEM_SPACING, MARKDOWN_CODE_BLOCK, MARKDOWN_EMPHASIS, MARKDOWN_HORIZONTAL_RULE, MARKDOWN_INLINE_CODE, MARKDOWN_STRIKETHROUGH, MARKDOWN_STRONG, MAX_TAG_ID, NO_SPACING, NodeEventEnter, NodeEventExit, TABLE_ROW_SPACING, TAG_A, TAG_ABBR, TAG_ADDRESS, TAG_AREA, TAG_ASIDE, TAG_AUDIO, TAG_B, TAG_BASE, TAG_BDO, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, TAG_CANVAS, TAG_CENTER, TAG_CITE, TAG_CODE, TAG_COL, TAG_DD, TAG_DEL, TAG_DETAILS, TAG_DFN, TAG_DIALOG, TAG_DIV, TAG_DL, TAG_DT, TAG_EM, TAG_EMBED, TAG_FIELDSET, TAG_FOOTER, TAG_FORM, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD, TAG_HR, TAG_I, TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_INS, TAG_KBD, TAG_KEYGEN, TAG_LABEL, TAG_LEGEND, TAG_LI, TAG_LINK, TAG_MAP, TAG_MARK, TAG_META, TAG_METER, TAG_NAV, TAG_NOFRAMES, TAG_NOSCRIPT, TAG_OL, TAG_OPTION, TAG_P, TAG_PARAM, TAG_PLAINTEXT, TAG_PRE, TAG_PROGRESS, TAG_Q, TAG_RP, TAG_RT, TAG_RUBY, TAG_SAMP, TAG_SCRIPT, TAG_SELECT, TAG_SMALL, TAG_SOURCE, TAG_SPAN, TAG_STRONG, TAG_STYLE, TAG_SUB, TAG_SUMMARY, TAG_SUP, TAG_SVG, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEMPLATE, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TIME, TAG_TITLE, TAG_TR, TAG_TRACK, TAG_U, TAG_UL, TAG_VAR, TAG_VIDEO, TAG_WBR, TAG_XMP, TEXT_NODE, TagIdMap, assembleBufferedContent, collectNodeContent } from "./
|
|
1
|
+
import { BLOCKQUOTE_SPACING, DEFAULT_BLOCK_SPACING, ELEMENT_NODE, HTML_ENTITIES, LIST_ITEM_SPACING, MARKDOWN_CODE_BLOCK, MARKDOWN_EMPHASIS, MARKDOWN_HORIZONTAL_RULE, MARKDOWN_INLINE_CODE, MARKDOWN_STRIKETHROUGH, MARKDOWN_STRONG, MAX_TAG_ID, NO_SPACING, NodeEventEnter, NodeEventExit, TABLE_ROW_SPACING, TAG_A, TAG_ABBR, TAG_ADDRESS, TAG_AREA, TAG_ASIDE, TAG_AUDIO, TAG_B, TAG_BASE, TAG_BDO, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, TAG_CANVAS, TAG_CENTER, TAG_CITE, TAG_CODE, TAG_COL, TAG_DD, TAG_DEL, TAG_DETAILS, TAG_DFN, TAG_DIALOG, TAG_DIV, TAG_DL, TAG_DT, TAG_EM, TAG_EMBED, TAG_FIELDSET, TAG_FOOTER, TAG_FORM, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD, TAG_HR, TAG_I, TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_INS, TAG_KBD, TAG_KEYGEN, TAG_LABEL, TAG_LEGEND, TAG_LI, TAG_LINK, TAG_MAP, TAG_MARK, TAG_META, TAG_METER, TAG_NAV, TAG_NOFRAMES, TAG_NOSCRIPT, TAG_OL, TAG_OPTION, TAG_P, TAG_PARAM, TAG_PLAINTEXT, TAG_PRE, TAG_PROGRESS, TAG_Q, TAG_RP, TAG_RT, TAG_RUBY, TAG_SAMP, TAG_SCRIPT, TAG_SELECT, TAG_SMALL, TAG_SOURCE, TAG_SPAN, TAG_STRONG, TAG_STYLE, TAG_SUB, TAG_SUMMARY, TAG_SUP, TAG_SVG, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEMPLATE, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TIME, TAG_TITLE, TAG_TR, TAG_TRACK, TAG_U, TAG_UL, TAG_VAR, TAG_VIDEO, TAG_WBR, TAG_XMP, TEXT_NODE, TagIdMap, assembleBufferedContent, collectNodeContent, extractionPlugin } from "./extraction-D28Kr1J3.mjs";
|
|
2
|
+
import { readFile } from "node:fs/promises";
|
|
3
|
+
import { basename, dirname, relative } from "node:path";
|
|
4
|
+
import { glob } from "tinyglobby";
|
|
2
5
|
|
|
3
6
|
//#region src/tags.ts
|
|
4
7
|
function resolveUrl(url, origin) {
|
|
@@ -1382,6 +1385,164 @@ function createMarkdownProcessor(options = {}) {
|
|
|
1382
1385
|
}
|
|
1383
1386
|
const MarkdownProcessor = createMarkdownProcessor;
|
|
1384
1387
|
|
|
1388
|
+
//#endregion
|
|
1389
|
+
//#region src/llms-txt.ts
|
|
1390
|
+
/**
 * Collects page metadata by running the HTML through the converter with an
 * extraction plugin attached.
 *
 * For each field the first non-empty value encountered is kept: the title can
 * come from a `title` element's text or from `og:title`, the description from
 * the `description` or `og:description` meta tags, keywords and author from
 * their respective meta tags. The Markdown produced by the conversion is
 * discarded; the call exists only to drive the plugin callbacks.
 *
 * @param {string} html - HTML document to inspect.
 * @param {string} url - Passed to the converter as its `origin` option.
 * @returns {{ title?: string, description?: string, keywords?: string, author?: string }}
 *   Trimmed values; fields that were not found are `undefined`.
 */
function extractMetadata(html, url) {
	const found = {
		title: "",
		description: "",
		keywords: "",
		author: ""
	};
	// Callback factory: fill `field` from the element's text, first value wins.
	const takeText = (field) => (element) => {
		if (found[field]) return;
		if (element.textContent) found[field] = element.textContent.trim();
	};
	// Callback factory: fill `field` from the `content` attribute, first value wins.
	const takeContent = (field) => (element) => {
		if (found[field]) return;
		if (element.attributes?.content) found[field] = element.attributes.content.trim();
	};
	const metadataPlugin = extractionPlugin({
		"title": takeText("title"),
		"meta[name=\"description\"]": takeContent("description"),
		"meta[property=\"og:description\"]": takeContent("description"),
		"meta[name=\"keywords\"]": takeContent("keywords"),
		"meta[name=\"author\"]": takeContent("author"),
		"meta[property=\"og:title\"]": takeContent("title")
	});
	htmlToMarkdown(html, {
		plugins: [metadataPlugin],
		origin: url
	});
	const orUndefined = (text) => text || void 0;
	return {
		title: orUndefined(found.title),
		description: orUndefined(found.description),
		keywords: orUndefined(found.keywords),
		author: orUndefined(found.author)
	};
}
|
|
1429
|
+
/**
 * Converts a file path into a site-relative URL path.
 *
 * The path is made relative to `baseDir`, a trailing `.html` is removed, a
 * trailing `/index` segment is removed (a bare `index` becomes `/`), and the
 * result is given a leading slash.
 *
 * @param {string} filePath - Path of the HTML file.
 * @param {string} baseDir - Directory that corresponds to the site root.
 * @returns {string} URL path such as '/', '/about' or '/docs/guide'.
 */
function pathToUrl(filePath, baseDir) {
	// path.relative() yields the platform separator (backslashes on Windows);
	// URL paths always use forward slashes, and the '/index' check below relies
	// on that, so normalize before doing anything else.
	let url = relative(baseDir, filePath).replace(/\\/g, "/");
	if (url.endsWith(".html")) url = url.slice(0, -5);
	if (url.endsWith("/index")) url = url.slice(0, -6);
	if (url === "index") return "/";
	if (!url.startsWith("/")) url = `/${url}`;
	return url;
}
|
|
1440
|
+
/**
 * Reads the HTML files matched by one or more glob patterns and converts each
 * of them to Markdown.
 *
 * URLs are computed relative to the deepest directory that contains every
 * matched file, so the result does not depend on the order in which the glob
 * returns its matches. A file that cannot be read or converted is reported
 * with `console.error` and left out of the result; the remaining files are
 * still processed.
 *
 * @param {string | string[]} patterns - Glob pattern(s) to expand.
 * @param {string} [origin] - Passed to the converter as `origin`; the file path
 *   is used in its place for metadata extraction when it is not set.
 * @returns {Promise<Array<{ filePath: string, title: string, content: string, url: string, metadata: object }>>}
 *   One entry per successfully processed file, in first-seen glob order.
 */
async function processHtmlFiles(patterns, origin) {
	const allPatterns = Array.isArray(patterns) ? patterns : [patterns];
	// A Set both de-duplicates across patterns and avoids spreading a possibly
	// huge match list into a single push() call.
	const seenFiles = new Set();
	for (const pattern of allPatterns) {
		const files = await glob(pattern);
		for (const file of files) seenFiles.add(file);
	}
	const uniqueFiles = [...seenFiles];
	// Start from the first file's directory and widen it until no file lies
	// outside of it (i.e. no relative path begins with a '..' segment).
	let baseDir = uniqueFiles.length > 0 ? dirname(uniqueFiles[0]) : ".";
	for (const filePath of uniqueFiles) {
		while (/^\.\.(?:[\\/]|$)/.test(relative(baseDir, filePath))) {
			const parentDir = dirname(baseDir);
			// dirname() is a fixed point at the root (and at '.'): stop widening.
			if (parentDir === baseDir) break;
			baseDir = parentDir;
		}
	}
	const results = [];
	for (const filePath of uniqueFiles) try {
		const html = await readFile(filePath, "utf-8");
		const metadata = extractMetadata(html, origin || filePath);
		const content = htmlToMarkdown(html, { origin });
		const url = pathToUrl(filePath, baseDir);
		results.push({
			filePath,
			// Fall back to the file name when the page declares no title.
			title: metadata?.title || basename(filePath, ".html"),
			content,
			url,
			metadata
		});
	} catch (error) {
		console.error(`Error processing ${filePath}:`, error);
	}
	return results;
}
|
|
1470
|
+
/**
 * Generates the llms.txt index: a title, an optional blockquote description
 * and a "Pages" list with one link per file.
 *
 * Each list entry is a single line. A page description, when present, has its
 * whitespace collapsed to single spaces and is cut to 100 characters followed
 * by "..." when longer.
 *
 * @param {Array<{ title: string, url: string, metadata?: { description?: string } }>} files
 *   Pages to list, in output order.
 * @param {{ siteName?: string, description?: string, origin?: string }} [options]
 *   `siteName` defaults to "Site"; `origin` is prefixed to every page URL.
 * @returns {string} The llms.txt text.
 */
function generateLlmsTxtContent(files, options = {}) {
	const { siteName = "Site", description, origin = "" } = options;
	const base = origin || "";
	let content = `# ${siteName}\n\n`;
	if (description) content += `> ${description}\n\n`;
	if (files.length > 0) {
		content += `## Pages\n\n`;
		for (const file of files) {
			// An origin ending in '/' joined with a path starting with '/' would
			// otherwise produce 'https://host//path'.
			const url = base.endsWith("/") && file.url.startsWith("/") ? base.slice(0, -1) + file.url : base + file.url;
			// Line breaks inside a description would split the list entry.
			const desc = file.metadata?.description?.replace(/\s+/g, " ").trim();
			const descText = desc ? `: ${desc.substring(0, 100)}${desc.length > 100 ? "..." : ""}` : "";
			content += `- [${file.title}](${url})${descText}\n`;
		}
	}
	return content;
}
|
|
1488
|
+
/**
 * Generates llms-full.txt: a title, an optional blockquote description, a
 * table of contents and then every page's complete Markdown content.
 *
 * Each page section consists of a level-2 heading with the page title, a
 * "**URL:**" line and the page content, followed by a horizontal rule.
 *
 * @param {Array<{ title: string, url: string, content: string }>} files
 *   Pages to include, in output order.
 * @param {{ siteName?: string, description?: string, origin?: string }} [options]
 *   `siteName` defaults to "Site"; `origin` is prefixed to every page URL.
 * @returns {string} The llms-full.txt text.
 */
function generateLlmsFullTxtContent(files, options = {}) {
	const { siteName = "Site", description, origin = "" } = options;
	const base = origin || "";
	let content = `# ${siteName}\n\n`;
	if (description) content += `> ${description}\n\n`;
	if (files.length > 0) {
		content += `## Table of Contents\n\n`;
		for (const file of files) {
			// NOTE(review): every character outside a-z/0-9 becomes '-'; whether this
			// equals the heading id a given Markdown renderer generates is not
			// established here — confirm against the target renderer.
			const anchor = file.title.toLowerCase().replace(/[^a-z0-9]/g, "-");
			content += `- [${file.title}](#${anchor})\n`;
		}
		content += `\n---\n\n`;
		for (const file of files) {
			// An origin ending in '/' joined with a path starting with '/' would
			// otherwise produce 'https://host//path'.
			const url = base.endsWith("/") && file.url.startsWith("/") ? base.slice(0, -1) + file.url : base + file.url;
			content += `## ${file.title}\n\n`;
			content += `**URL:** ${url}\n\n`;
			content += `${file.content}\n\n---\n\n`;
		}
	}
	return content;
}
|
|
1511
|
+
/**
 * Maps every processed page to the Markdown file that should be written for it.
 *
 * The output path is derived from the page URL under an `md` directory:
 * '/about' becomes 'md/about.md', while a URL ending in '/' (the root '/' or a
 * directory-style URL such as '/docs/') maps to an 'index.md' inside that
 * directory instead of a file literally named '.md'.
 *
 * @param {Array<{ url: string, content: string }>} files - Processed pages.
 * @returns {Array<{ path: string, content: string }>} One entry per page, in input order.
 */
function generateMarkdownFilesContent(files) {
	return files.map((file) => ({
		path: file.url.endsWith("/") ? `md${file.url}index.md` : `md${file.url}.md`,
		content: file.content
	}));
}
|
|
1525
|
+
/**
 * Entry point for llms.txt generation: resolves the pages to work on and
 * builds the requested artifacts from them.
 *
 * Pages come from `options.files` when given, otherwise from expanding
 * `options.patterns`. The llms.txt text is always produced; the full text and
 * the per-page Markdown list are produced only when `generateFull` /
 * `generateMarkdown` are truthy and are `undefined` otherwise.
 *
 * @param {object} options - See `LlmsTxtArtifactsOptions`.
 * @returns {Promise<{ llmsTxt: string, llmsFullTxt: (string | undefined), markdownFiles: (Array | undefined), processedFiles: Array }>}
 * @throws {Error} When neither `files` nor `patterns` is provided.
 */
async function generateLlmsTxtArtifacts(options) {
	// Explicitly supplied files take precedence over glob patterns.
	const resolveFiles = async () => {
		if (options.files) return options.files;
		if (options.patterns) return processHtmlFiles(options.patterns, options.origin);
		throw new Error("Either patterns or files must be provided");
	};
	const processedFiles = await resolveFiles();
	const llmsTxt = generateLlmsTxtContent(processedFiles, options);
	const llmsFullTxt = options.generateFull ? generateLlmsFullTxtContent(processedFiles, options) : void 0;
	const markdownFiles = options.generateMarkdown ? generateMarkdownFilesContent(processedFiles) : void 0;
	return {
		llmsTxt,
		llmsFullTxt,
		markdownFiles,
		processedFiles
	};
}
|
|
1545
|
+
|
|
1385
1546
|
//#endregion
|
|
1386
1547
|
//#region src/stream.ts
|
|
1387
1548
|
/**
|
|
@@ -1424,4 +1585,12 @@ async function* streamHtmlToMarkdown(htmlStream, options = {}) {
|
|
|
1424
1585
|
}
|
|
1425
1586
|
|
|
1426
1587
|
//#endregion
|
|
1427
|
-
|
|
1588
|
+
//#region src/index.ts
|
|
1589
|
+
/**
 * Converts a complete HTML string to Markdown in a single call.
 *
 * A fresh processor is created for every call, fed the whole document and then
 * asked for the Markdown it produced.
 *
 * @param {string} html - HTML to convert.
 * @param {object} [options={}] - Forwarded unchanged to `createMarkdownProcessor`.
 * @returns {string} The value returned by the processor's `getMarkdown()`.
 */
function htmlToMarkdown(html, options = {}) {
	const markdownProcessor = createMarkdownProcessor(options);
	markdownProcessor.processHtml(html);
	const markdown = markdownProcessor.getMarkdown();
	return markdown;
}
|
|
1594
|
+
|
|
1595
|
+
//#endregion
|
|
1596
|
+
export { MarkdownProcessor, generateLlmsTxtArtifacts, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
|
package/dist/cli.mjs
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
|
-
import "./_chunks/
|
|
2
|
-
import { streamHtmlToMarkdown } from "./_chunks/
|
|
3
|
-
import "./_chunks/plugins-
|
|
4
|
-
import { withMinimalPreset } from "./_chunks/minimal-
|
|
5
|
-
import {
|
|
1
|
+
import "./_chunks/extraction-D28Kr1J3.mjs";
|
|
2
|
+
import { generateLlmsTxtArtifacts, streamHtmlToMarkdown } from "./_chunks/src-Eo8j0-9L.mjs";
|
|
3
|
+
import "./_chunks/plugins-DXY-fo9h.mjs";
|
|
4
|
+
import { withMinimalPreset } from "./_chunks/minimal-CCnrG7a1.mjs";
|
|
5
|
+
import { mkdir, writeFile } from "node:fs/promises";
|
|
6
6
|
import { dirname, join } from "node:path";
|
|
7
|
+
import { readFileSync } from "node:fs";
|
|
7
8
|
import { Readable } from "node:stream";
|
|
8
9
|
import { fileURLToPath } from "node:url";
|
|
9
10
|
import { cac } from "cac";
|
|
@@ -16,6 +17,38 @@ async function streamingConvert(options = {}) {
|
|
|
16
17
|
const markdownGenerator = streamHtmlToMarkdown(Readable.toWeb(process.stdin), conversionOptions);
|
|
17
18
|
for await (const markdownChunk of markdownGenerator) if (markdownChunk && markdownChunk.length > 0) outputStream.write(markdownChunk);
|
|
18
19
|
}
|
|
20
|
+
/**
 * CLI handler for the `llms` command: generates the selected artifacts and
 * writes them below the output directory.
 *
 * Only the artifacts named in `options.artifacts` are written: "llms.txt",
 * "llms-full.txt" and "markdown" (one file per page at the path supplied by
 * the generator). Without the option all three are written. Any failure is
 * printed and terminates the process with exit code 1.
 *
 * @param {string[]} patterns - Glob patterns of the HTML files to process.
 * @param {{ output: string, artifacts?: string, siteName?: string, description?: string, origin?: string }} options
 *   Parsed CLI options; `artifacts` is a comma-separated list.
 * @returns {Promise<void>}
 */
async function generateLlms(patterns, options) {
	try {
		// Blank entries (e.g. from a trailing comma) are dropped.
		const artifacts = options.artifacts ? options.artifacts.split(",").map((a) => a.trim()).filter(Boolean) : [
			"llms.txt",
			"llms-full.txt",
			"markdown"
		];
		const result = await generateLlmsTxtArtifacts({
			patterns,
			siteName: options.siteName,
			description: options.description,
			origin: options.origin,
			generateFull: artifacts.includes("llms-full.txt"),
			generateMarkdown: artifacts.includes("markdown")
		});
		await mkdir(options.output, { recursive: true });
		// llms.txt is one of the selectable artifacts, so honour the selection
		// instead of writing it unconditionally.
		if (artifacts.includes("llms.txt")) {
			const llmsPath = join(options.output, "llms.txt");
			await writeFile(llmsPath, result.llmsTxt, "utf-8");
		}
		if (artifacts.includes("llms-full.txt") && result.llmsFullTxt) {
			const fullPath = join(options.output, "llms-full.txt");
			await writeFile(fullPath, result.llmsFullTxt, "utf-8");
		}
		if (artifacts.includes("markdown") && result.markdownFiles) for (const mdFile of result.markdownFiles) {
			const fullPath = join(options.output, mdFile.path);
			// Pages may live in nested directories that do not exist yet.
			await mkdir(dirname(fullPath), { recursive: true });
			await writeFile(fullPath, mdFile.content, "utf-8");
		}
	} catch (error) {
		console.error("❌ Error generating llms.txt:", error);
		process.exit(1);
	}
}
|
|
19
52
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
20
53
|
const packageJsonPath = join(__dirname, "..", "package.json");
|
|
21
54
|
const packageJson = JSON.parse(readFileSync(packageJsonPath, "utf-8"));
|
|
@@ -24,6 +57,12 @@ const cli = cac();
|
|
|
24
57
|
cli.command("[options]", "Convert HTML from stdin to Markdown on stdout").option("--origin <url>", "Origin URL for resolving relative image paths").option("--preset <preset>", "Conversion presets: minimal").action(async (_, opts) => {
|
|
25
58
|
await streamingConvert(opts);
|
|
26
59
|
});
|
|
60
|
+
cli.command("llms <patterns...>", "Generate llms.txt artifacts from HTML files").option("--site-name <name>", "Name of the site for llms.txt header").option("--description <desc>", "Description of the site for llms.txt").option("--origin <url>", "Origin URL for resolving relative paths and generating absolute URLs").option("-o, --output <dir>", "Output directory for generated files", { default: "." }).option("--artifacts <list>", "Comma-separated list of artifacts to generate: llms.txt,llms-full.txt,markdown", { default: "llms.txt,llms-full.txt,markdown" }).action(async (patterns, opts) => {
|
|
61
|
+
await generateLlms(patterns, {
|
|
62
|
+
patterns,
|
|
63
|
+
...opts
|
|
64
|
+
});
|
|
65
|
+
});
|
|
27
66
|
cli.help().version(version).parse();
|
|
28
67
|
|
|
29
68
|
//#endregion
|
package/dist/index.d.mts
CHANGED
|
@@ -6,6 +6,42 @@ import { ReadableStream } from "node:stream/web";
|
|
|
6
6
|
|
|
7
7
|
declare const TagIdMap: Record<string, number>;
|
|
8
8
|
//#endregion
|
|
9
|
+
//#region src/llms-txt.d.ts
|
|
10
|
+
interface LlmsTxtArtifactsOptions {
|
|
11
|
+
patterns?: string | string[];
|
|
12
|
+
files?: ProcessedFile[];
|
|
13
|
+
siteName?: string;
|
|
14
|
+
description?: string;
|
|
15
|
+
origin?: string;
|
|
16
|
+
generateFull?: boolean;
|
|
17
|
+
generateMarkdown?: boolean;
|
|
18
|
+
}
|
|
19
|
+
interface ProcessedFile {
|
|
20
|
+
filePath?: string;
|
|
21
|
+
title: string;
|
|
22
|
+
content: string;
|
|
23
|
+
url: string;
|
|
24
|
+
metadata?: {
|
|
25
|
+
title?: string;
|
|
26
|
+
description?: string;
|
|
27
|
+
keywords?: string;
|
|
28
|
+
author?: string;
|
|
29
|
+
};
|
|
30
|
+
}
|
|
31
|
+
interface LlmsTxtArtifactsResult {
|
|
32
|
+
llmsTxt: string;
|
|
33
|
+
llmsFullTxt?: string;
|
|
34
|
+
markdownFiles?: {
|
|
35
|
+
path: string;
|
|
36
|
+
content: string;
|
|
37
|
+
}[];
|
|
38
|
+
processedFiles: ProcessedFile[];
|
|
39
|
+
}
|
|
40
|
+
/**
|
|
41
|
+
* Main function to process files and generate llms.txt artifacts
|
|
42
|
+
*/
|
|
43
|
+
declare function generateLlmsTxtArtifacts(options: LlmsTxtArtifactsOptions): Promise<LlmsTxtArtifactsResult>;
|
|
44
|
+
//#endregion
|
|
9
45
|
//#region src/markdown-processor.d.ts
|
|
10
46
|
interface MarkdownState {
|
|
11
47
|
/** Configuration options for conversion */
|
|
@@ -72,4 +108,4 @@ declare function streamHtmlToMarkdown(htmlStream: ReadableStream | null, options
|
|
|
72
108
|
//#region src/index.d.ts
|
|
73
109
|
declare function htmlToMarkdown(html: string, options?: HTMLToMarkdownOptions): string;
|
|
74
110
|
//#endregion
|
|
75
|
-
export { BufferRegion, ELEMENT_NODE, ElementNode, ExtractedElement, HTMLToMarkdownOptions, HandlerContext, MarkdownProcessor, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginContext, PluginCreationOptions, ReadabilityContext, TEXT_NODE, TagHandler, TagIdMap, TailwindContext, TextNode, createPlugin, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
|
|
111
|
+
export { BufferRegion, ELEMENT_NODE, ElementNode, ExtractedElement, HTMLToMarkdownOptions, HandlerContext, LlmsTxtArtifactsOptions, LlmsTxtArtifactsResult, MarkdownProcessor, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginContext, PluginCreationOptions, ProcessedFile, ReadabilityContext, TEXT_NODE, TagHandler, TagIdMap, TailwindContext, TextNode, createPlugin, generateLlmsTxtArtifacts, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
|
package/dist/index.mjs
CHANGED
|
@@ -1,12 +1,4 @@
|
|
|
1
|
-
import { TagIdMap, createPlugin } from "./_chunks/
|
|
2
|
-
import { MarkdownProcessor,
|
|
1
|
+
import { TagIdMap, createPlugin } from "./_chunks/extraction-D28Kr1J3.mjs";
|
|
2
|
+
import { MarkdownProcessor, generateLlmsTxtArtifacts, htmlToMarkdown, parseHtml, streamHtmlToMarkdown } from "./_chunks/src-Eo8j0-9L.mjs";
|
|
3
3
|
|
|
4
|
-
|
|
5
|
-
function htmlToMarkdown(html, options = {}) {
|
|
6
|
-
const processor = createMarkdownProcessor(options);
|
|
7
|
-
processor.processHtml(html);
|
|
8
|
-
return processor.getMarkdown();
|
|
9
|
-
}
|
|
10
|
-
|
|
11
|
-
//#endregion
|
|
12
|
-
export { MarkdownProcessor, TagIdMap, createPlugin, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
|
|
4
|
+
export { MarkdownProcessor, TagIdMap, createPlugin, generateLlmsTxtArtifacts, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
|
package/dist/plugins.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { createPlugin } from "./_chunks/
|
|
2
|
-
import {
|
|
1
|
+
import { createPlugin, extractionPlugin } from "./_chunks/extraction-D28Kr1J3.mjs";
|
|
2
|
+
import { filterPlugin, frontmatterPlugin, isolateMainPlugin, readabilityPlugin, tailwindPlugin } from "./_chunks/plugins-DXY-fo9h.mjs";
|
|
3
3
|
|
|
4
4
|
export { createPlugin, extractionPlugin, filterPlugin, frontmatterPlugin, isolateMainPlugin, readabilityPlugin, tailwindPlugin };
|
package/dist/preset/minimal.mjs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import "../_chunks/
|
|
2
|
-
import "../_chunks/plugins-
|
|
3
|
-
import { withMinimalPreset } from "../_chunks/minimal-
|
|
1
|
+
import "../_chunks/extraction-D28Kr1J3.mjs";
|
|
2
|
+
import "../_chunks/plugins-DXY-fo9h.mjs";
|
|
3
|
+
import { withMinimalPreset } from "../_chunks/minimal-CCnrG7a1.mjs";
|
|
4
4
|
|
|
5
5
|
export { withMinimalPreset };
|
package/package.json
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "mdream",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.
|
|
5
|
-
"description": "Ultra-performant
|
|
4
|
+
"version": "0.6.1",
|
|
5
|
+
"description": "Ultra-performant HTML to Markdown Convertor Optimized for LLMs and llm.txt artifacts.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
8
8
|
"email": "harlan@harlanzw.com",
|
|
@@ -53,7 +53,8 @@
|
|
|
53
53
|
"dist"
|
|
54
54
|
],
|
|
55
55
|
"dependencies": {
|
|
56
|
-
"cac": "^6.7.14"
|
|
56
|
+
"cac": "^6.7.14",
|
|
57
|
+
"tinyglobby": "^0.2.10"
|
|
57
58
|
},
|
|
58
59
|
"scripts": {
|
|
59
60
|
"flame": "pnpm build && unbuild bench/bundle && clinic flame -- node bench/bundle/dist/string.mjs 10",
|