@mdream/js 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,215 @@
1
+ import "./_chunks/const.mjs";
2
+ import { r as parseHtmlStream } from "./_chunks/parse.mjs";
3
+ import { n as createMarkdownProcessor, r as processPluginsForEvent, t as resolvePlugins } from "./_chunks/resolve-plugins.mjs";
4
+ import "./_chunks/plugins.mjs";
5
+ //#region src/splitter.ts
6
// Matches a Markdown ATX heading line: one to six `#` followed by whitespace.
const MARKDOWN_HEADER_LINE_RE = /^#{1,6}\s+/;
// Global newline matcher, used to count line breaks in a chunk.
const NEWLINE_RE = /\n/g;
// Tag ids that start a new chunk by default. The splitter labels tag id 7 as
// "h1" (`h${tagId - 7 + 1}`), so 8..12 correspond to h2..h6.
const DEFAULT_HEADERS_TO_SPLIT_ON = [8, 9, 10, 11, 12];
15
/**
 * Normalise user-supplied splitter options, filling in defaults.
 *
 * A default is used whenever the supplied value is `null` or `undefined`:
 * headers 8..12, `returnEachLine: false`, `stripHeaders: true`,
 * `chunkSize: 1000`, `chunkOverlap: 200`, string length as the length
 * function and `keepSeparator: false`.
 *
 * @param {object} options - Raw options as passed by the caller.
 * @returns {object} Fully populated options plus `resolvedPlugins`, the
 *   `plugins` list returned by `resolvePlugins(options)`.
 */
function createOptions(options) {
  const {
    headersToSplitOn,
    returnEachLine,
    stripHeaders,
    chunkSize,
    chunkOverlap,
    lengthFunction,
    keepSeparator
  } = options;
  const defaultLength = (text) => text.length;
  const { plugins } = resolvePlugins(options);
  return {
    headersToSplitOn: headersToSplitOn ?? DEFAULT_HEADERS_TO_SPLIT_ON,
    returnEachLine: returnEachLine ?? false,
    stripHeaders: stripHeaders ?? true,
    chunkSize: chunkSize ?? 1e3,
    chunkOverlap: chunkOverlap ?? 200,
    lengthFunction: lengthFunction ?? defaultLength,
    keepSeparator: keepSeparator ?? false,
    resolvedPlugins: plugins
  };
}
27
/**
 * Extract the code language from an element's `class` attribute.
 *
 * The attribute is split on any run of whitespace (HTML class tokens may be
 * separated by spaces, tabs or line breaks), and the first token containing
 * `language-` followed by at least one character wins. The text after
 * `language-` is returned.
 *
 * @param {{ attributes?: { class?: string } }} node - Element node to inspect.
 * @returns {string} The language name, or `""` when none is declared.
 */
function getCodeLanguage(node) {
  const className = node.attributes?.class;
  if (!className) return "";
  for (const token of className.split(/\s+/)) {
    // `split("language-")[1]` is undefined when the marker is absent and ""
    // when nothing follows it; both mean "no language on this token".
    const lang = token.split("language-")[1];
    if (lang) return lang;
  }
  return "";
}
33
/**
 * Tell whether a heading tag id is configured as a chunk boundary.
 *
 * @param {number} tagId - Tag id of the heading element.
 * @param {{ headersToSplitOn: number[] }} options - Resolved splitter options.
 * @returns {boolean} `true` when `tagId` is listed in `headersToSplitOn`.
 */
function shouldSplitOnHeader(tagId, options) {
  const { headersToSplitOn } = options;
  return headersToSplitOn.includes(tagId);
}
36
/**
 * Read the Markdown produced so far. The buffer itself is left untouched.
 *
 * @param {{ buffer: string[] }} state - Processor state holding the output fragments.
 * @returns {string} All fragments concatenated, with leading whitespace removed.
 */
function getCurrentMarkdown(state) {
  const joined = state.buffer.join("");
  return joined.trimStart();
}
42
/**
 * Convert HTML to Markdown and split it into chunks, yielding each chunk
 * lazily.
 *
 * The HTML is parsed up front and its parser events are buffered; the events
 * are then replayed through the Markdown processor, and chunks are yielded as
 * split points are reached (configured headings, tag id 13, or the size limit).
 *
 * **JavaScript engine only** — uses the JS engine's internal processing pipeline.
 * Not compatible with the Rust engine.
 *
 * @param {string} html - HTML source to convert.
 * @param {object} [options] - Splitter options; see `createOptions` for defaults.
 * @yields {{ content: string, metadata: object }} Chunks whose metadata holds
 *   `loc.lines.{from,to}` and, when available, `headers` and `code`.
 * @throws {Error} When `chunkOverlap` is not smaller than `chunkSize`.
 */
function* htmlToMarkdownSplitChunksStream(html, options = {}) {
  const opts = createOptions(options);
  if (opts.chunkOverlap >= opts.chunkSize) throw new Error("chunkOverlap must be less than chunkSize");
  const processor = createMarkdownProcessor(options, opts.resolvedPlugins);
  // Heading text by tag id, for the headings currently in effect.
  const headerHierarchy = /* @__PURE__ */ new Map();
  // Split-heading tag ids already encountered; the first occurrence of a
  // level does not flush, later ones do.
  const seenSplitHeaders = /* @__PURE__ */ new Set();
  // Preferred separators for size-based splits, strongest first.
  const separators = [
    "\n\n",
    "```\n",
    "\n",
    " "
  ];
  let currentChunkCodeLanguage = "";
  let collectingHeaderText = false;
  let currentHeaderTagId = null;
  let currentHeaderText = "";
  // 1-based line of the Markdown at `lastChunkEndPosition`.
  let lineNumber = 1;
  // Offset in the Markdown where the next chunk starts.
  let lastChunkEndPosition = 0;
  // End offset of the most recently yielded chunk (ignores overlap).
  let lastSplitPosition = 0;
  const countNewlines = (text) => (text.match(NEWLINE_RE) || []).length;
  /**
   * Move the start of the next chunk to `nextStart`, keeping `lineNumber`
   * in step with the newlines that are crossed.
   */
  function moveChunkStart(markdown, nextStart) {
    if (nextStart >= lastChunkEndPosition) lineNumber += countNewlines(markdown.slice(lastChunkEndPosition, nextStart));
    else lineNumber -= countNewlines(markdown.slice(nextStart, lastChunkEndPosition));
    lastChunkEndPosition = nextStart;
  }
  /**
   * Yield the Markdown between the current chunk start and `endPosition`
   * (defaults to the end of the output) as one chunk, unless it is empty.
   */
  function* flushChunk(endPosition, applyOverlap = false) {
    const currentMd = getCurrentMarkdown(processor.state);
    const chunkEnd = endPosition ?? currentMd.length;
    const originalChunkContent = currentMd.slice(lastChunkEndPosition, chunkEnd);
    if (!originalChunkContent.trim()) {
      // Nothing to emit, but the skipped span may contain newlines that
      // later chunks must account for.
      moveChunkStart(currentMd, chunkEnd);
      return;
    }
    let chunkContent = originalChunkContent;
    if (opts.stripHeaders) {
      chunkContent = chunkContent.split("\n").filter((line) => !MARKDOWN_HEADER_LINE_RE.test(line)).join("\n").trim();
      if (!chunkContent) {
        // Heading-only span: skip it, still advancing the line counter.
        moveChunkStart(currentMd, chunkEnd);
        return;
      }
    }
    const chunk = {
      content: chunkContent.trimEnd(),
      metadata: { loc: { lines: {
        from: lineNumber,
        to: lineNumber + countNewlines(originalChunkContent)
      } } }
    };
    if (headerHierarchy.size > 0) {
      chunk.metadata.headers = {};
      for (const [tagId, text] of headerHierarchy.entries()) {
        // Tag id 7 is labelled "h1", 8 "h2", and so on.
        const level = `h${tagId - 7 + 1}`;
        chunk.metadata.headers[level] = text;
      }
    }
    if (currentChunkCodeLanguage) chunk.metadata.code = currentChunkCodeLanguage;
    yield chunk;
    currentChunkCodeLanguage = "";
    lastSplitPosition = chunkEnd;
    let nextStart = chunkEnd;
    if (applyOverlap && opts.chunkOverlap > 0) {
      // Never overlap the entire chunk, so every flush makes progress.
      const maxOverlap = Math.max(0, originalChunkContent.length - 1);
      nextStart = chunkEnd - Math.min(opts.chunkOverlap, maxOverlap);
    }
    // With overlap the next chunk starts inside this one, so the line
    // counter advances only to the overlapped start, not to the chunk end.
    moveChunkStart(currentMd, nextStart);
  }
  const parseState = {
    depthMap: processor.state.depthMap,
    depth: 0,
    resolvedPlugins: opts.resolvedPlugins
  };
  // The parser reports events through a callback, from which a generator
  // cannot yield; collect them first and replay them below.
  const eventBuffer = [];
  parseHtmlStream(html, parseState, (event) => {
    eventBuffer.push(event);
  });
  for (const event of eventBuffer) {
    const { type: eventType, node } = event;
    // NOTE(review): node.type 1 / 2 look like element / text nodes and
    // eventType 0 / 1 like enter / exit — confirm against the parser constants.
    if (node.type === 1) {
      const element = node;
      const tagId = element.tagId;
      if (tagId && tagId >= 7 && tagId <= 12) {
        if (eventType === 0) {
          collectingHeaderText = true;
          currentHeaderTagId = tagId;
          currentHeaderText = "";
          if (shouldSplitOnHeader(tagId, opts)) {
            if (seenSplitHeaders.has(tagId)) {
              yield* flushChunk();
              // A repeated heading level invalidates itself and every
              // deeper level recorded so far.
              for (let i = tagId; i <= 12; i++) headerHierarchy.delete(i);
            }
            seenSplitHeaders.add(tagId);
          }
        } else if (eventType === 1 && currentHeaderTagId === tagId) {
          headerHierarchy.set(tagId, currentHeaderText.trim());
          collectingHeaderText = false;
          currentHeaderTagId = null;
        }
      }
      // NOTE(review): presumably tag 23 nested inside tag 34 is <code> within
      // <pre> — confirm against the tag id table.
      if (tagId === 23 && (element.depthMap[34] || 0) > 0) {
        if (eventType === 0) {
          const lang = getCodeLanguage(element);
          // Only the first language seen in a chunk is recorded.
          if (lang && !currentChunkCodeLanguage) currentChunkCodeLanguage = lang;
        }
      }
      // NOTE(review): tag 13 always forces a split; presumably <hr> — confirm.
      if (tagId === 13 && eventType === 0) yield* flushChunk();
    }
    if (collectingHeaderText && node.type === 2) currentHeaderText += node.value;
    processPluginsForEvent(event, opts.resolvedPlugins, processor.state, processor.processEvent);
    if (!opts.returnEachLine) {
      const currentMd = getCurrentMarkdown(processor.state);
      if (opts.lengthFunction(currentMd.slice(lastChunkEndPosition)) > opts.chunkSize) {
        const idealSplitPos = lastChunkEndPosition + opts.chunkSize;
        let splitPosition = -1;
        for (const sep of separators) {
          const idx = currentMd.lastIndexOf(sep, idealSplitPos);
          if (idx < 0) continue;
          const candidateSplitPos = idx + sep.length;
          // An odd number of ``` fences before the candidate means it sits
          // inside an open code block; do not split there.
          const beforeSplit = currentMd.slice(0, candidateSplitPos);
          let backtickCount = 0;
          let pos = beforeSplit.indexOf("```", 0);
          while (pos !== -1) {
            backtickCount++;
            pos = beforeSplit.indexOf("```", pos + 3);
          }
          if (backtickCount % 2 === 1) continue;
          if (candidateSplitPos > lastSplitPosition) {
            splitPosition = candidateSplitPos;
            break;
          }
        }
        // No usable separator: flush everything produced so far.
        if (splitPosition === -1 || splitPosition <= lastChunkEndPosition) splitPosition = currentMd.length;
        yield* flushChunk(splitPosition, true);
      }
    }
  }
  yield* flushChunk();
}
180
/**
 * Convert HTML to Markdown and split it into chunks, returned as an array.
 *
 * Collects every chunk from `htmlToMarkdownSplitChunksStream`. When
 * `options.returnEachLine` is truthy, each chunk is further split into one
 * chunk per non-blank line; the line chunks inherit the parent's metadata
 * and get a single-line `loc`.
 *
 * **JavaScript engine only** — uses the JS engine's internal processing pipeline.
 * Not compatible with the Rust engine.
 *
 * @param {string} html - HTML source to convert.
 * @param {object} [options] - Splitter options, forwarded to the stream.
 * @returns {Array<{ content: string, metadata: object }>} The chunks.
 */
function htmlToMarkdownSplitChunks(html, options = {}) {
  const chunks = [...htmlToMarkdownSplitChunksStream(html, options)];
  // Read the flag directly: the stream already normalises the options, and
  // resolving them again here would resolve the plugins a second time.
  if (!options.returnEachLine || chunks.length === 0) return chunks;
  const lineChunks = [];
  for (const chunk of chunks) {
    const chunkStartLine = chunk.metadata.loc?.lines.from || 1;
    // NOTE(review): offsets are counted within the emitted content, which
    // may have had heading lines stripped, so they can differ from the
    // line positions in the full Markdown.
    chunk.content.split("\n").forEach((line, offset) => {
      if (!line.trim()) return;
      const lineNo = chunkStartLine + offset;
      lineChunks.push({
        content: line,
        metadata: {
          ...chunk.metadata,
          loc: { lines: {
            from: lineNo,
            to: lineNo
          } }
        }
      });
    });
  }
  return lineChunks;
}
214
+ //#endregion
215
+ export { htmlToMarkdownSplitChunks, htmlToMarkdownSplitChunksStream };
package/package.json ADDED
@@ -0,0 +1,93 @@
1
+ {
2
+ "name": "@mdream/js",
3
+ "type": "module",
4
+ "version": "0.17.0",
5
+ "description": "JavaScript HTML-to-Markdown engine for mdream. Escape hatch for hooks and edge runtimes.",
6
+ "author": {
7
+ "name": "Harlan Wilton",
8
+ "email": "harlan@harlanzw.com",
9
+ "url": "https://harlanzw.com/"
10
+ },
11
+ "license": "MIT",
12
+ "repository": {
13
+ "type": "git",
14
+ "url": "https://github.com/harlan-zw/mdream",
15
+ "directory": "packages/js"
16
+ },
17
+ "exports": {
18
+ ".": {
19
+ "types": "./dist/index.d.mts",
20
+ "import": {
21
+ "types": "./dist/index.d.mts",
22
+ "default": "./dist/index.mjs"
23
+ },
24
+ "default": "./dist/index.mjs"
25
+ },
26
+ "./negotiate": {
27
+ "types": "./dist/negotiate.d.mts",
28
+ "import": {
29
+ "types": "./dist/negotiate.d.mts",
30
+ "default": "./dist/negotiate.mjs"
31
+ },
32
+ "default": "./dist/negotiate.mjs"
33
+ },
34
+ "./plugins": {
35
+ "types": "./dist/plugins.d.mts",
36
+ "import": {
37
+ "types": "./dist/plugins.d.mts",
38
+ "default": "./dist/plugins.mjs"
39
+ },
40
+ "default": "./dist/plugins.mjs"
41
+ },
42
+ "./preset/minimal": {
43
+ "types": "./dist/preset/minimal.d.mts",
44
+ "import": {
45
+ "types": "./dist/preset/minimal.d.mts",
46
+ "default": "./dist/preset/minimal.mjs"
47
+ },
48
+ "default": "./dist/preset/minimal.mjs"
49
+ },
50
+ "./splitter": {
51
+ "types": "./dist/splitter.d.mts",
52
+ "import": {
53
+ "types": "./dist/splitter.d.mts",
54
+ "default": "./dist/splitter.mjs"
55
+ },
56
+ "default": "./dist/splitter.mjs"
57
+ },
58
+ "./parse": {
59
+ "types": "./dist/parse.d.mts",
60
+ "import": {
61
+ "types": "./dist/parse.d.mts",
62
+ "default": "./dist/parse.mjs"
63
+ },
64
+ "default": "./dist/parse.mjs"
65
+ },
66
+ "./llms-txt": {
67
+ "types": "./dist/llms-txt.d.mts",
68
+ "import": {
69
+ "types": "./dist/llms-txt.d.mts",
70
+ "default": "./dist/llms-txt.mjs"
71
+ },
72
+ "default": "./dist/llms-txt.mjs"
73
+ }
74
+ },
75
+ "main": "./dist/index.mjs",
76
+ "types": "./dist/index.d.mts",
77
+ "bin": {
78
+ "mdream-js": "./bin/mdream.mjs"
79
+ },
80
+ "files": [
81
+ "bin",
82
+ "dist"
83
+ ],
84
+ "dependencies": {
85
+ "cac": "^7.0.0",
86
+ "pathe": "^2.0.3"
87
+ },
88
+ "scripts": {
89
+ "build": "obuild",
90
+ "typecheck": "tsc --noEmit",
91
+ "dev:prepare": "obuild --stub"
92
+ }
93
+ }