@dragon708/docmind-markdown 1.0.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +306 -9
- package/dist/index.js +819 -99
- package/package.json +19 -2
package/dist/index.js
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
|
-
|
|
1
|
+
import { isStructuredDocumentResult } from '@dragon708/docmind-shared';
|
|
2
|
+
|
|
3
|
+
// src/structured-markdown.ts
|
|
2
4
|
function clampHeadingLevel(level) {
|
|
3
5
|
if (level === void 0 || !Number.isFinite(level)) return 2;
|
|
4
6
|
const n = Math.floor(level);
|
|
@@ -12,6 +14,9 @@ function escapeTableCell(text) {
|
|
|
12
14
|
/**
 * Narrow an arbitrary value to a string for rendering.
 *
 * @param {unknown} s - Candidate value.
 * @returns {string} `s` itself when it is a string, otherwise the empty string.
 */
function safeString(s) {
  if (typeof s !== "string") return "";
  return s;
}
|
|
17
|
+
/**
 * Text of a block, tolerant of non-string input and trimmed on both ends.
 *
 * @param {unknown} s - Raw block text.
 * @returns {string} Trimmed text, or "" when `s` is not a string.
 */
function blockText(s) {
  const value = safeString(s);
  return value.trim();
}
|
|
15
20
|
function safeArrays(result) {
|
|
16
21
|
return {
|
|
17
22
|
blocks: Array.isArray(result.blocks) ? result.blocks : [],
|
|
@@ -41,6 +46,21 @@ function metadataHeaderLines(meta) {
|
|
|
41
46
|
}
|
|
42
47
|
return lines;
|
|
43
48
|
}
|
|
49
|
+
/**
 * Flatten one table row into Markdown-ready cells.
 *
 * A cell with `colSpan > 1` is followed by `colSpan - 1` empty filler cells,
 * and a cell with `rowSpan > 1` gets a ` *(rows: N)*` suffix. Missing or
 * non-finite spans count as 1.
 *
 * NOTE(review): the text is escaped here with `escapeTableCell`, and
 * `tableToMarkdown` escapes `cells[i].text` again when padding rows — confirm
 * that `escapeTableCell` is idempotent.
 *
 * @param {Iterable<{text: string, rowSpan?: number, colSpan?: number}>} row
 * @returns {{text: string}[]} Expanded cells, in order.
 */
function expandTableRowForMarkdown(row) {
  // Number.isFinite is false for undefined, so no separate undefined check is needed.
  const normalizeSpan = (value) => (Number.isFinite(value) ? Math.max(1, Math.floor(value)) : 1);
  const cells = [];
  for (const cell of row) {
    const escaped = escapeTableCell(cell.text);
    const rowCount = normalizeSpan(cell.rowSpan);
    const colCount = normalizeSpan(cell.colSpan);
    cells.push({ text: rowCount > 1 ? `${escaped} *(rows: ${rowCount})*` : escaped });
    for (let filler = colCount - 1; filler > 0; filler--) {
      cells.push({ text: "" });
    }
  }
  return cells;
}
|
|
61
|
+
/**
 * Number of cells in an (already expanded) table row.
 *
 * @param {ArrayLike<unknown>} row
 * @returns {number} The row's `length`.
 */
function tableRowWidth(row) {
  const { length: cellCount } = row;
  return cellCount;
}
|
|
44
64
|
function tableToMarkdown(table) {
|
|
45
65
|
const rows = table.rows;
|
|
46
66
|
if (rows.length === 0) {
|
|
@@ -49,10 +69,12 @@ function tableToMarkdown(table) {
|
|
|
49
69
|
*(empty table)*
|
|
50
70
|
` : "*(empty table)*\n";
|
|
51
71
|
}
|
|
52
|
-
const
|
|
53
|
-
const
|
|
72
|
+
const expanded = rows.map((r) => expandTableRowForMarkdown(r));
|
|
73
|
+
const width = Math.max(1, ...expanded.map(tableRowWidth));
|
|
74
|
+
const padRow = (cells) => Array.from({ length: width }, (_, i) => escapeTableCell(cells[i]?.text ?? ""));
|
|
75
|
+
const line = (cells) => `| ${padRow(cells).join(" | ")} |`;
|
|
76
|
+
const header = expanded[0] ?? [];
|
|
54
77
|
const sep = Array.from({ length: width }, () => "---");
|
|
55
|
-
const line = (cells) => `| ${Array.from({ length: width }, (_, i) => escapeTableCell(cells[i]?.text ?? "")).join(" | ")} |`;
|
|
56
78
|
const out = [];
|
|
57
79
|
if (table.caption) {
|
|
58
80
|
out.push(`**${escapeTableCell(table.caption)}**`);
|
|
@@ -60,8 +82,8 @@ function tableToMarkdown(table) {
|
|
|
60
82
|
}
|
|
61
83
|
out.push(line(header));
|
|
62
84
|
out.push(`| ${sep.join(" | ")} |`);
|
|
63
|
-
for (let r = 1; r <
|
|
64
|
-
out.push(line(
|
|
85
|
+
for (let r = 1; r < expanded.length; r++) {
|
|
86
|
+
out.push(line(expanded[r]));
|
|
65
87
|
}
|
|
66
88
|
return `${out.join("\n")}
|
|
67
89
|
`;
|
|
@@ -88,10 +110,12 @@ function referencedImageIds(blocks) {
|
|
|
88
110
|
}
|
|
89
111
|
function convertStructuredToMarkdown(result, options) {
|
|
90
112
|
const imagePlaceholder = options?.imagePlaceholder ?? "<!-- image: no src -->";
|
|
113
|
+
const imageMissingSrcMode = options?.imageMissingSrcMode ?? "placeholder";
|
|
91
114
|
const pageSep = (options?.pageSeparator ?? "---").trimEnd();
|
|
92
115
|
const pageTransitions = options?.pageTransitionMarkers !== false;
|
|
93
116
|
const appendOrphanTables = options?.appendUnreferencedTables !== false;
|
|
94
117
|
const appendOrphanImages = options?.appendUnreferencedImages === true;
|
|
118
|
+
const appendWarningsSection = options?.appendWarningsSection === true;
|
|
95
119
|
const { blocks, tables, pages, images } = safeArrays(result);
|
|
96
120
|
const hasPageModel = pages.length > 0;
|
|
97
121
|
const parts = [];
|
|
@@ -139,11 +163,12 @@ function convertStructuredToMarkdown(result, options) {
|
|
|
139
163
|
tables,
|
|
140
164
|
images,
|
|
141
165
|
imagePlaceholder,
|
|
166
|
+
imageMissingSrcMode,
|
|
142
167
|
orderedDepthCounters,
|
|
143
168
|
resetListState,
|
|
144
169
|
pageSep
|
|
145
170
|
);
|
|
146
|
-
parts.push(chunk);
|
|
171
|
+
if (chunk.length > 0) parts.push(chunk);
|
|
147
172
|
}
|
|
148
173
|
if (appendOrphanTables) {
|
|
149
174
|
const used = referencedTableIds(blocks);
|
|
@@ -176,6 +201,18 @@ function convertStructuredToMarkdown(result, options) {
|
|
|
176
201
|
}
|
|
177
202
|
}
|
|
178
203
|
}
|
|
204
|
+
if (appendWarningsSection) {
|
|
205
|
+
const warns = Array.isArray(result.warnings) ? result.warnings : [];
|
|
206
|
+
if (warns.length > 0) {
|
|
207
|
+
parts.push("");
|
|
208
|
+
parts.push("### Extraction warnings");
|
|
209
|
+
parts.push("");
|
|
210
|
+
for (const w of warns) {
|
|
211
|
+
const line = String(w).replace(/\r?\n/g, " ").trim();
|
|
212
|
+
if (line.length > 0) parts.push(`- ${line}`);
|
|
213
|
+
}
|
|
214
|
+
}
|
|
215
|
+
}
|
|
179
216
|
let body = parts.join("\n\n").replace(/\n{3,}/g, "\n\n").trimEnd();
|
|
180
217
|
if (body.length === 0) {
|
|
181
218
|
body = safeString(result.text).trim();
|
|
@@ -188,49 +225,93 @@ function escapeCommentText(s) {
|
|
|
188
225
|
/**
 * Render one list-item block as a Markdown list line.
 *
 * Ordered items are numbered per depth using `orderedDepthCounters`, which is
 * mutated in place: counters deeper than the current depth are dropped, and an
 * unordered item also drops the counter at its own depth.
 *
 * The depth is normalised to a non-negative integer first. A fractional, NaN
 * or infinite depth would otherwise be assigned to an array `length` (or
 * passed to `String.prototype.repeat`) and throw a RangeError.
 *
 * @param {{text?: unknown, depth?: number, listStyle?: string}} block
 * @param {number[]} orderedDepthCounters - Running counters, one per depth.
 * @returns {string} The list line, indented one space per depth level.
 */
function listItemLine(block, orderedDepthCounters) {
  const rawDepth = Number(block.depth ?? 0);
  const depth = Number.isFinite(rawDepth) ? Math.max(0, Math.floor(rawDepth)) : 0;
  const indent = " ".repeat(depth);
  const style = block.listStyle === "ordered" ? "ordered" : "unordered";
  if (style === "ordered") {
    while (orderedDepthCounters.length <= depth) orderedDepthCounters.push(0);
    // Entering this depth invalidates any counters of deeper levels.
    orderedDepthCounters.length = depth + 1;
    const n = (orderedDepthCounters[depth] ?? 0) + 1;
    orderedDepthCounters[depth] = n;
    return `${indent}${n}. ${blockText(block.text)}`;
  }
  orderedDepthCounters.length = depth;
  return `${indent}- ${blockText(block.text)}`;
}
|
|
239
|
+
/**
 * Markdown for an image reference that has no usable `src`.
 *
 * In `"llm-label"` mode this emits an italic label such as
 * ``*[Image: `id` — _alt_]*``. In any other mode it emits the caller's
 * placeholder followed by a hint derived from `kind`.
 *
 * The id is wrapped in a code span whose fence is one backtick longer than the
 * longest backtick run inside the id. Backslash escapes are not honoured
 * inside Markdown code spans, so escaping a backtick with a backslash would
 * end the span early.
 *
 * @param {string} imageId - Identifier of the referenced image.
 * @param {string} altDisplay - Alt text to show (may be empty).
 * @param {string} imagePlaceholder - Text used when not in "llm-label" mode.
 * @param {string} imageMissingSrcMode - "llm-label" selects the label form.
 * @param {string} [kind] - "placeholder", "embedded" or "external"; other values add no hint.
 * @returns {string} Markdown fragment.
 */
function imageRefWithoutSrcMarkdown(imageId, altDisplay, imagePlaceholder, imageMissingSrcMode, kind) {
  const codeSpan = (value) => {
    const runs = value.match(/`+/g) ?? [];
    const longestRun = runs.reduce((longest, run) => Math.max(longest, run.length), 0);
    const fence = "`".repeat(longestRun + 1);
    // A span whose content starts or ends with a backtick needs one space of padding.
    const pad = value.startsWith("`") || value.endsWith("`") ? " " : "";
    return `${fence}${pad}${value}${pad}${fence}`;
  };

  if (imageMissingSrcMode === "llm-label") {
    const altPart = altDisplay.length > 0 ? ` \u2014 _${altDisplay.replace(/_/g, "\\_")}_` : "";
    return `*[Image: ${codeSpan(imageId)}${altPart}]*`;
  }

  let hint;
  switch (kind) {
    case "placeholder":
      hint = " (placeholder)";
      break;
    case "embedded":
      hint = " (embedded)";
      break;
    case "external":
      hint = " (external, no URL)";
      break;
    default:
      hint = "";
  }
  return `${imagePlaceholder}${hint}`;
}
|
|
247
|
+
/**
 * Prefix every backtick in `s` with a backslash.
 *
 * @param {string} s
 * @returns {string} `s` with each "`" replaced by "\`".
 */
function escapeBackticks(s) {
  const pieces = s.split("`");
  return pieces.join("\\`");
}
|
|
202
|
-
function
|
|
250
|
+
/**
 * Put `prefix` in front of every line of `text`.
 *
 * Lines are split on LF or CRLF and re-joined with LF.
 *
 * @param {string} text
 * @param {string} prefix - For example "> " to build a blockquote.
 * @returns {string}
 */
function quoteMarkdownLines(text, prefix) {
  const quoted = [];
  for (const ln of text.split(/\r?\n/)) {
    quoted.push(`${prefix}${ln}`);
  }
  return quoted.join("\n");
}
|
|
253
|
+
/**
 * Markdown for a block of unknown type.
 *
 * - raw + hint: a blockquote with an "Unrecognized block" header, an empty
 *   quote line, then the quoted raw text.
 * - raw only: the raw text verbatim when single-line, otherwise quoted.
 * - hint only: a one-line "Unrecognized" blockquote.
 * - neither: an HTML comment marker.
 *
 * @param {{hint?: string, raw?: string}} block
 * @returns {string}
 */
function unknownBlockToMarkdown(block) {
  const hint = block.hint?.trim();
  const raw = block.raw?.trim();
  if (!raw) {
    return hint ? `> _Unrecognized:_ ${hint}` : "<!-- unknown block -->";
  }
  if (hint) {
    return [`> _Unrecognized block:_ ${hint}`, ">", quoteMarkdownLines(raw, "> ")].join("\n");
  }
  return raw.includes("\n") ? quoteMarkdownLines(raw, "> ") : raw;
}
|
|
269
|
+
function blockToMarkdown(block, tables, images, imagePlaceholder, imageMissingSrcMode, orderedDepthCounters, resetListState, pageSep) {
|
|
203
270
|
switch (block.type) {
|
|
204
271
|
case "heading": {
|
|
205
272
|
resetListState();
|
|
206
273
|
const level = clampHeadingLevel(block.level);
|
|
207
274
|
const hashes = "#".repeat(level);
|
|
208
|
-
|
|
275
|
+
const t = blockText(block.text);
|
|
276
|
+
if (t.length === 0) return "";
|
|
277
|
+
return `${hashes} ${t}`;
|
|
209
278
|
}
|
|
210
279
|
case "paragraph": {
|
|
211
280
|
resetListState();
|
|
212
|
-
return block.text
|
|
281
|
+
return blockText(block.text);
|
|
213
282
|
}
|
|
214
283
|
case "list-item":
|
|
215
|
-
return listItemLine(block, orderedDepthCounters);
|
|
284
|
+
return blockText(block.text).length === 0 ? "" : listItemLine(block, orderedDepthCounters);
|
|
216
285
|
case "table": {
|
|
217
286
|
resetListState();
|
|
218
|
-
const
|
|
287
|
+
const tid = safeString(block.tableId);
|
|
288
|
+
if (!tid) {
|
|
289
|
+
return `<!-- table block: missing tableId -->`;
|
|
290
|
+
}
|
|
291
|
+
const t = resolveTable(tables, tid);
|
|
219
292
|
if (!t) {
|
|
220
|
-
return `<!-- table not found: ${escapeCommentText(
|
|
293
|
+
return `<!-- table not found: ${escapeCommentText(tid)} -->`;
|
|
221
294
|
}
|
|
222
295
|
return tableToMarkdown(t).trimEnd();
|
|
223
296
|
}
|
|
224
297
|
case "image-ref": {
|
|
225
298
|
resetListState();
|
|
226
|
-
const
|
|
299
|
+
const iid = safeString(block.imageId);
|
|
300
|
+
if (!iid) {
|
|
301
|
+
return `<!-- image-ref: missing imageId -->`;
|
|
302
|
+
}
|
|
303
|
+
const img = resolveImage(images, iid);
|
|
227
304
|
const altRaw = block.alt ?? img?.alt ?? "";
|
|
228
305
|
const alt = altRaw.replace(/]/g, "\\]");
|
|
229
306
|
const src = img?.src;
|
|
230
307
|
if (src) return ``;
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
308
|
+
return imageRefWithoutSrcMarkdown(
|
|
309
|
+
iid,
|
|
310
|
+
altRaw.trim(),
|
|
311
|
+
imagePlaceholder,
|
|
312
|
+
imageMissingSrcMode,
|
|
313
|
+
img?.kind
|
|
314
|
+
);
|
|
234
315
|
}
|
|
235
316
|
case "page-break": {
|
|
236
317
|
resetListState();
|
|
@@ -238,10 +319,7 @@ function blockToMarkdown(block, tables, images, imagePlaceholder, orderedDepthCo
|
|
|
238
319
|
}
|
|
239
320
|
case "unknown": {
|
|
240
321
|
resetListState();
|
|
241
|
-
|
|
242
|
-
if (raw) return raw;
|
|
243
|
-
if (block.hint) return `<!-- ${escapeCommentText(block.hint)} -->`;
|
|
244
|
-
return "<!-- unknown block -->";
|
|
322
|
+
return unknownBlockToMarkdown(block);
|
|
245
323
|
}
|
|
246
324
|
default: {
|
|
247
325
|
const _exhaustive = block;
|
|
@@ -253,7 +331,7 @@ function structuredDocumentToMarkdown(structured, options) {
|
|
|
253
331
|
return convertStructuredToMarkdown(structured, options);
|
|
254
332
|
}
|
|
255
333
|
|
|
256
|
-
// src/
|
|
334
|
+
// src/llm-text.ts
|
|
257
335
|
function clampHeadingLevel2(level) {
|
|
258
336
|
if (level === void 0 || !Number.isFinite(level)) return 2;
|
|
259
337
|
const n = Math.floor(level);
|
|
@@ -261,6 +339,13 @@ function clampHeadingLevel2(level) {
|
|
|
261
339
|
if (n > 6) return 6;
|
|
262
340
|
return n;
|
|
263
341
|
}
|
|
342
|
+
/**
 * Clean a single line of extracted text.
 *
 * Removes zero-width characters (U+200B–U+200D, U+FEFF, U+2060), turns
 * no-break spaces into plain spaces, collapses runs of space/tab/form-feed/
 * vertical-tab into one space, and trims both ends.
 *
 * @param {string} s
 * @returns {string}
 */
function sanitizeNoiseChars(s) {
  const withoutInvisible = s.replace(/[\u200B-\u200D\uFEFF\u2060]/g, "");
  const plainSpaces = withoutInvisible.replace(/\u00A0/g, " ");
  const collapsed = plainSpaces.replace(/[ \t\f\v]+/g, " ");
  return collapsed.trim();
}
|
|
345
|
+
/**
 * Apply `sanitizeNoiseChars` to every LF-separated line of `text`.
 *
 * Each line is trimmed by `sanitizeNoiseChars`, so leading indentation is not
 * preserved.
 *
 * @param {string} text
 * @param {boolean} enabled - When falsy, `text` is returned untouched.
 * @returns {string}
 */
function sanitizeLineOriented(text, enabled) {
  if (enabled) {
    const cleaned = [];
    for (const line of text.split("\n")) {
      cleaned.push(sanitizeNoiseChars(line));
    }
    return cleaned.join("\n");
  }
  return text;
}
|
|
264
349
|
function safeArrays2(result) {
|
|
265
350
|
return {
|
|
266
351
|
blocks: Array.isArray(result.blocks) ? result.blocks : [],
|
|
@@ -296,17 +381,45 @@ function metadataDocBlock(meta, extraMax) {
|
|
|
296
381
|
return `[DOC]
|
|
297
382
|
${lines.join("\n")}`;
|
|
298
383
|
}
|
|
299
|
-
function
|
|
384
|
+
/**
 * Flatten a table cell's text to one line for the LLM text format.
 *
 * Line breaks become spaces, `|` becomes a middle dot (U+00B7), whitespace
 * runs collapse to a single space, and the result is trimmed.
 *
 * @param {string} text
 * @param {boolean} sanitize - Also run `sanitizeNoiseChars` on the result.
 * @returns {string}
 */
function cellToLlmSegment(text, sanitize) {
  const singleLine = text.replace(/\r?\n/g, " ");
  const withoutPipes = singleLine.replace(/\|/g, "\xB7");
  const compact = withoutPipes.replace(/\s+/g, " ").trim();
  if (!sanitize) return compact;
  return sanitizeNoiseChars(compact);
}
|
|
388
|
+
/**
 * Flatten one table row into plain strings for the LLM text format.
 *
 * A cell with `colSpan > 1` is followed by `colSpan - 1` empty strings, and a
 * cell with `rowSpan > 1` gets a ` (rows×N)` suffix. Missing or non-finite
 * spans count as 1.
 *
 * @param {Iterable<{text: string, rowSpan?: number, colSpan?: number}>} row
 * @returns {string[]} Expanded cell texts, in order.
 */
function expandTableRowForLlm(row) {
  // Number.isFinite is false for undefined, so no separate undefined check is needed.
  const normalizeSpan = (value) => (Number.isFinite(value) ? Math.max(1, Math.floor(value)) : 1);
  const segments = [];
  for (const cell of row) {
    const base = cell.text;
    const rowCount = normalizeSpan(cell.rowSpan);
    const colCount = normalizeSpan(cell.colSpan);
    segments.push(rowCount > 1 ? `${base} (rows\xD7${rowCount})` : base);
    for (let filler = colCount - 1; filler > 0; filler--) {
      segments.push("");
    }
  }
  return segments;
}
|
|
400
|
+
/**
 * Build the rule line placed under a table header: `---` repeated once per
 * column (at least once) and joined with `glue`.
 *
 * @param {number} nCols - Number of columns.
 * @param {string} glue - Column separator.
 * @returns {string}
 */
function tableHeaderRuleLine(nCols, glue) {
  const columnCount = Math.max(1, nCols);
  const rules = Array.from({ length: columnCount }).fill("---");
  return rules.join(glue);
}
|
|
404
|
+
/**
 * Render a table in the line-oriented LLM text format.
 *
 * Output lines: `<tag> id=<table.id>`, an optional `Caption: …` line, then one
 * line per row with cells joined by `glue`. When `headerSep` is set and the
 * table has more than one row, a `---` rule sized to the first row follows
 * that row. A table with no rows renders "(empty table)".
 *
 * @param {{id: string, caption?: string, rows: Array}} table
 * @param {string} tag - Marker that opens the block.
 * @param {string} glue - Column separator.
 * @param {boolean} headerSep - Emit a rule line after the first row.
 * @param {boolean} sanitize - Run noise sanitising on caption and cells.
 * @returns {string}
 */
function tableToLlmBlock(table, tag, glue, headerSep, sanitize) {
  const lines = [`${tag} id=${table.id}`];
  if (table.caption) {
    const caption = sanitize ? sanitizeNoiseChars(table.caption) : table.caption;
    lines.push(`Caption: ${caption}`);
  }
  const rows = table.rows;
  if (rows.length === 0) {
    lines.push("(empty table)");
    return lines.join("\n");
  }
  const expanded = rows.map((r) => expandTableRowForLlm(r));
  const needsRule = headerSep && expanded.length > 1;
  expanded.forEach((row, rowIndex) => {
    lines.push(row.map((c) => cellToLlmSegment(c, sanitize)).join(glue));
    if (needsRule && rowIndex === 0) {
      lines.push(tableHeaderRuleLine(row.length, glue));
    }
  });
  return lines.join("\n");
}
|
|
@@ -320,11 +433,12 @@ function referencedTableIds2(blocks) {
|
|
|
320
433
|
}
|
|
321
434
|
return ids;
|
|
322
435
|
}
|
|
323
|
-
function listItemLine2(block, orderedDepthCounters) {
|
|
436
|
+
function listItemLine2(block, orderedDepthCounters, sanitize) {
|
|
324
437
|
const depth = Math.max(0, block.depth ?? 0);
|
|
325
438
|
const indent = " ".repeat(depth);
|
|
326
|
-
const
|
|
327
|
-
const text =
|
|
439
|
+
const raw = block.text.replace(/\r?\n/g, " ").trim();
|
|
440
|
+
const text = sanitize ? sanitizeNoiseChars(raw) : raw.replace(/\s+/g, " ").trim();
|
|
441
|
+
const style = block.listStyle === "ordered" ? "ordered" : "unordered";
|
|
328
442
|
if (style === "ordered") {
|
|
329
443
|
while (orderedDepthCounters.length <= depth) orderedDepthCounters.push(0);
|
|
330
444
|
orderedDepthCounters.length = depth + 1;
|
|
@@ -335,31 +449,56 @@ function listItemLine2(block, orderedDepthCounters) {
|
|
|
335
449
|
orderedDepthCounters.length = depth;
|
|
336
450
|
return `${indent}\u2022 ${text}`;
|
|
337
451
|
}
|
|
338
|
-
function
|
|
452
|
+
/**
 * LLM-text rendering of a block of unknown type.
 *
 * - raw + hint: `[UNKNOWN] <hint>` followed by the raw lines, each indented.
 * - raw only: the raw lines.
 * - hint only: `[UNKNOWN] <hint>`.
 * - neither: `[UNKNOWN]`.
 *
 * Each raw line goes through `sanitizeNoiseChars` when `sanitize` is set,
 * otherwise it is only right-trimmed.
 *
 * @param {{hint?: string, raw?: string}} block
 * @param {boolean} sanitize
 * @returns {string}
 */
function unknownToLlm(block, sanitize) {
  const hint = block.hint?.trim();
  const raw = block.raw?.trim();
  const cleanLine = (ln) => (sanitize ? sanitizeNoiseChars(ln) : ln.trimEnd());
  const cleanHint = () => (sanitize ? sanitizeNoiseChars(hint) : hint);

  if (!raw) {
    return hint ? `[UNKNOWN] ${cleanHint()}` : "[UNKNOWN]";
  }
  const body = raw.split(/\r?\n/).map((ln) => cleanLine(ln)).join("\n");
  if (!hint) return body;
  const indented = body.split("\n").map((l) => ` ${l}`).join("\n");
  return `[UNKNOWN] ${cleanHint()}\n${indented}`;
}
|
|
469
|
+
function blockToLlm(block, tables, images, tableTag, imageTag, pageMarker, orderedDepthCounters, resetListState, skipEmptyParagraphs, glue, headerSep, sanitize) {
|
|
339
470
|
switch (block.type) {
|
|
340
471
|
case "heading": {
|
|
341
472
|
resetListState();
|
|
342
473
|
const lv = clampHeadingLevel2(block.level);
|
|
343
|
-
|
|
474
|
+
const t = block.text.replace(/\r?\n/g, " ").trim();
|
|
475
|
+
if (t.length === 0) return void 0;
|
|
476
|
+
const text = sanitize ? sanitizeNoiseChars(t) : t.replace(/\s+/g, " ").trim();
|
|
477
|
+
return `[H${lv}] ${text}`;
|
|
344
478
|
}
|
|
345
479
|
case "paragraph": {
|
|
346
480
|
resetListState();
|
|
347
481
|
const t = block.text.trim();
|
|
348
482
|
if (t.length === 0 && skipEmptyParagraphs) return void 0;
|
|
349
|
-
|
|
483
|
+
const flat = t.replace(/\r?\n/g, " ").replace(/\s+/g, " ").trim();
|
|
484
|
+
return sanitize ? sanitizeNoiseChars(flat) : flat;
|
|
485
|
+
}
|
|
486
|
+
case "list-item": {
|
|
487
|
+
const t = block.text.trim();
|
|
488
|
+
if (t.length === 0) return void 0;
|
|
489
|
+
return listItemLine2(block, orderedDepthCounters, sanitize);
|
|
350
490
|
}
|
|
351
|
-
case "list-item":
|
|
352
|
-
return listItemLine2(block, orderedDepthCounters);
|
|
353
491
|
case "table": {
|
|
354
492
|
resetListState();
|
|
355
493
|
const t = resolveTable2(tables, block.tableId);
|
|
356
494
|
if (!t) return `${tableTag} MISSING id=${block.tableId}`;
|
|
357
|
-
return tableToLlmBlock(t, tableTag);
|
|
495
|
+
return tableToLlmBlock(t, tableTag, glue, headerSep, sanitize);
|
|
358
496
|
}
|
|
359
497
|
case "image-ref": {
|
|
360
498
|
resetListState();
|
|
361
499
|
const img = images.find((i) => i.id === block.imageId);
|
|
362
|
-
const
|
|
500
|
+
const altRaw = (block.alt ?? img?.alt ?? "").replace(/\r?\n/g, " ").trim();
|
|
501
|
+
const alt = sanitize ? sanitizeNoiseChars(altRaw) : altRaw;
|
|
363
502
|
if (img?.src) {
|
|
364
503
|
return `${imageTag} alt=${JSON.stringify(alt)} url=${JSON.stringify(img.src)}`;
|
|
365
504
|
}
|
|
@@ -371,9 +510,7 @@ function blockToLlm(block, tables, images, tableTag, imageTag, pageMarker, order
|
|
|
371
510
|
}
|
|
372
511
|
case "unknown": {
|
|
373
512
|
resetListState();
|
|
374
|
-
|
|
375
|
-
if (raw) return raw.replace(/\r?\n/g, "\n");
|
|
376
|
-
return block.hint ? `[UNKNOWN: ${block.hint}]` : "[UNKNOWN]";
|
|
513
|
+
return unknownToLlm(block, sanitize);
|
|
377
514
|
}
|
|
378
515
|
default: {
|
|
379
516
|
const _exhaustive = block;
|
|
@@ -393,6 +530,9 @@ function convertStructuredToLlmText(result, options) {
|
|
|
393
530
|
const appendOrphanTables = options?.appendUnreferencedTables !== false;
|
|
394
531
|
const compact = options?.compact === true;
|
|
395
532
|
const skipEmptyParagraphs = options?.skipEmptyParagraphs !== false;
|
|
533
|
+
const sanitize = options?.sanitizeNoise !== false;
|
|
534
|
+
const headerSep = options?.tableHeaderSeparator !== false;
|
|
535
|
+
const glue = options?.tableColumnSeparator ?? " | ";
|
|
396
536
|
const sep = compact ? "\n" : "\n\n";
|
|
397
537
|
const { blocks, tables, pages, images, warnings } = safeArrays2(result);
|
|
398
538
|
const hasPageModel = pages.length > 0;
|
|
@@ -437,7 +577,10 @@ function convertStructuredToLlmText(result, options) {
|
|
|
437
577
|
pageMarker,
|
|
438
578
|
orderedDepthCounters,
|
|
439
579
|
resetListState,
|
|
440
|
-
skipEmptyParagraphs
|
|
580
|
+
skipEmptyParagraphs,
|
|
581
|
+
glue,
|
|
582
|
+
headerSep,
|
|
583
|
+
sanitize
|
|
441
584
|
);
|
|
442
585
|
if (chunk !== void 0 && chunk.length > 0) parts.push(chunk);
|
|
443
586
|
}
|
|
@@ -447,16 +590,18 @@ function convertStructuredToLlmText(result, options) {
|
|
|
447
590
|
if (orphans.length > 0) {
|
|
448
591
|
parts.push(`[MORE_TABLES]`);
|
|
449
592
|
for (const t of orphans) {
|
|
450
|
-
parts.push(tableToLlmBlock(t, tableTag));
|
|
593
|
+
parts.push(tableToLlmBlock(t, tableTag, glue, headerSep, sanitize));
|
|
451
594
|
}
|
|
452
595
|
}
|
|
453
596
|
}
|
|
454
597
|
let out = parts.join(sep).replace(/\n{3,}/g, "\n\n").trim();
|
|
598
|
+
out = sanitizeLineOriented(out, sanitize);
|
|
455
599
|
if (out.length === 0 && fallback) {
|
|
456
600
|
out = typeof result.text === "string" ? result.text.trim() : "";
|
|
601
|
+
out = sanitizeLineOriented(out, sanitize);
|
|
457
602
|
}
|
|
458
603
|
if (includeWarnings && warnings.length > 0) {
|
|
459
|
-
const warnLines = warnings.map((w) => `- ${String(w).replace(/\r?\n/g, " ")}`).join("\n");
|
|
604
|
+
const warnLines = warnings.map((w) => `- ${sanitizeNoiseChars(String(w).replace(/\r?\n/g, " "))}`).join("\n");
|
|
460
605
|
const block = `[WARNINGS]
|
|
461
606
|
${warnLines}`;
|
|
462
607
|
out = out ? `${out}${sep}${block}` : block;
|
|
@@ -467,7 +612,7 @@ function structuredDocumentToLlmText(structured, options) {
|
|
|
467
612
|
return convertStructuredToLlmText(structured, options);
|
|
468
613
|
}
|
|
469
614
|
|
|
470
|
-
// src/
|
|
615
|
+
// src/chunking.ts
|
|
471
616
|
var SLICE_MARKDOWN_OPTS = {
|
|
472
617
|
includeMetadataHeader: false,
|
|
473
618
|
pageTransitionMarkers: false,
|
|
@@ -479,7 +624,9 @@ var SLICE_LLM_OPTS = {
|
|
|
479
624
|
includeDocumentMetadata: false,
|
|
480
625
|
includeWarnings: false,
|
|
481
626
|
pageTransitionMarkers: false,
|
|
482
|
-
appendUnreferencedTables: false
|
|
627
|
+
appendUnreferencedTables: false,
|
|
628
|
+
tableHeaderSeparator: true,
|
|
629
|
+
sanitizeNoise: true
|
|
483
630
|
};
|
|
484
631
|
function clampHeadingLevel3(level) {
|
|
485
632
|
if (level === void 0 || !Number.isFinite(level)) return 2;
|
|
@@ -504,15 +651,83 @@ function renderSlice(result, block, includeMarkdown) {
|
|
|
504
651
|
/**
 * Join chunk fragments into one text: each part is trimmed, empty parts are
 * dropped, and the rest are separated by a blank line.
 *
 * @param {string[]} parts
 * @returns {string}
 */
function joinChunkParts(parts) {
  const kept = parts.reduce((acc, part) => {
    const trimmed = part.trim();
    if (trimmed.length > 0) acc.push(trimmed);
    return acc;
  }, []);
  return kept.join("\n\n");
}
|
|
654
|
+
/**
 * Human-readable page label for a zero-based page range.
 *
 * @param {number|undefined} minP - First page index (zero-based).
 * @param {number|undefined} maxP - Last page index (zero-based).
 * @returns {string|undefined} "N" for a single page, "N–M" (en dash) for a
 *   range, both one-based; `undefined` when `minP` is undefined.
 */
function pageSpanLabelFromRange(minP, maxP) {
  if (minP === void 0) return void 0;
  const first = minP + 1;
  const isSinglePage = maxP === void 0 || maxP === minP;
  return isSinglePage ? String(first) : `${first}\u2013${maxP + 1}`;
}
|
|
507
661
|
/**
 * Blocks of a structured result, tolerant of a missing or malformed field.
 *
 * @param {{blocks?: unknown}} result
 * @returns {Array} `result.blocks` when it is an array, otherwise a new empty array.
 */
function safeBlocks(result) {
  const { blocks } = result;
  if (Array.isArray(blocks)) return blocks;
  return [];
}
|
|
664
|
+
/**
 * Greedily pack rendered units into chunks of at most `maxChars` of text.
 *
 * A chunk is closed when a heading starts (if `preferHeadings`) or when adding
 * the next unit would push the projected text length over `maxChars`. With
 * `preserveTables`, a table whose own text exceeds `maxChars` is emitted as a
 * chunk by itself. When `overlapChars > 0`, the tail of each emitted chunk's
 * text is prepended to the text of the next chunk; chunk markdown does not
 * receive the overlap.
 *
 * @param {Array<{text: string, md: string, isHeading?: boolean, isTable?: boolean,
 *   pageIndex?: number, headingPath?: string[]}>} units
 * @param {{maxChars: number, overlapChars: number, preferHeadings: boolean,
 *   preserveTables: boolean, includeMarkdown: boolean,
 *   includePageSpanLabel: boolean}} options
 * @returns {Array<object>} Chunks with `index`, `text`, `markdown`,
 *   `headingPath` (taken from the chunk's last unit), `pageIndex`,
 *   `pageEndIndex` and `pageSpanLabel`.
 */
function packUnitsIntoChunks(units, options) {
  const {
    maxChars,
    overlapChars,
    preferHeadings,
    preserveTables,
    includeMarkdown,
    includePageSpanLabel
  } = options;
  const chunks = [];
  let current = [];
  let pendingTextPrefix = "";

  // Prepend the overlap carried over from the previously emitted chunk, if any.
  const withOverlapPrefix = (body) => {
    if (!pendingTextPrefix) return body;
    return `${pendingTextPrefix}${body.length > 0 ? "\n\n" + body : ""}`;
  };

  const projectedTextLength = (next) => {
    const texts = current.map((u) => u.text).concat(next.text);
    return withOverlapPrefix(joinChunkParts(texts)).length;
  };

  const flush = () => {
    if (current.length === 0) return;
    const text = withOverlapPrefix(joinChunkParts(current.map((u) => u.text))).trim();
    const markdown = includeMarkdown ? joinChunkParts(current.map((u) => u.md)).trim() : void 0;
    const pages = current.map((u) => u.pageIndex).filter((n) => n !== void 0);
    const hasPages = pages.length > 0;
    const pageIndex = hasPages ? Math.min(...pages) : void 0;
    const pageEndIndex = hasPages ? Math.max(...pages) : void 0;
    const { headingPath } = current[current.length - 1];
    const pageSpanLabel = includePageSpanLabel && pageIndex !== void 0
      ? pageSpanLabelFromRange(pageIndex, pageEndIndex)
      : void 0;
    const hasMarkdown = Boolean(markdown) && markdown.length > 0;
    if (text.length > 0 || hasMarkdown) {
      chunks.push({
        index: chunks.length,
        text,
        markdown: hasMarkdown ? markdown : void 0,
        headingPath: headingPath && headingPath.length > 0 ? [...headingPath] : void 0,
        pageIndex,
        pageEndIndex,
        pageSpanLabel
      });
    }
    if (overlapChars > 0 && text.length > 0) {
      pendingTextPrefix = text.slice(Math.max(0, text.length - overlapChars)).trimStart();
    } else {
      pendingTextPrefix = "";
    }
    current = [];
  };

  for (const unit of units) {
    // flush() is a no-op on an empty buffer, so no emptiness check is needed here.
    if (preferHeadings && unit.isHeading) flush();
    const isOversizedTable = preserveTables && unit.isTable && unit.text.length > maxChars;
    if (isOversizedTable) {
      flush();
      current = [unit];
      flush();
      continue;
    }
    if (current.length > 0 && projectedTextLength(unit) > maxChars) flush();
    current.push(unit);
  }
  flush();
  return chunks;
}
|
|
510
724
|
function splitStructuredIntoChunks(result, options) {
|
|
511
725
|
const maxChars = Math.max(1, options?.maxChars ?? 4e3);
|
|
512
726
|
const overlapChars = Math.max(0, options?.overlapChars ?? 0);
|
|
513
727
|
const preferHeadings = options?.preferHeadings !== false;
|
|
514
728
|
const preserveTables = options?.preserveTables !== false;
|
|
515
729
|
const includeMarkdown = options?.includeMarkdown !== false;
|
|
730
|
+
const includePageSpanLabel = options?.includePageSpanLabel !== false;
|
|
516
731
|
const blocks = safeBlocks(result);
|
|
517
732
|
if (blocks.length === 0) {
|
|
518
733
|
const text = convertStructuredToLlmText(result, {
|
|
@@ -532,7 +747,8 @@ function splitStructuredIntoChunks(result, options) {
|
|
|
532
747
|
markdown: md && md.length > 0 ? md : void 0,
|
|
533
748
|
headingPath: void 0,
|
|
534
749
|
pageIndex: void 0,
|
|
535
|
-
pageEndIndex: void 0
|
|
750
|
+
pageEndIndex: void 0,
|
|
751
|
+
pageSpanLabel: void 0
|
|
536
752
|
}
|
|
537
753
|
];
|
|
538
754
|
}
|
|
@@ -558,54 +774,15 @@ function splitStructuredIntoChunks(result, options) {
|
|
|
558
774
|
headingPath
|
|
559
775
|
});
|
|
560
776
|
}
|
|
561
|
-
const
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
}
|
|
569
|
-
|
|
570
|
-
if (current.length === 0) return;
|
|
571
|
-
const body = joinChunkParts(current.map((u) => u.text));
|
|
572
|
-
const text = pendingTextPrefix ? `${pendingTextPrefix}${body.length > 0 ? "\n\n" + body : ""}`.trim() : body.trim();
|
|
573
|
-
const markdown = includeMarkdown && current.length > 0 ? joinChunkParts(current.map((u) => u.md)).trim() : void 0;
|
|
574
|
-
const pages = current.map((u) => u.pageIndex).filter((n) => n !== void 0);
|
|
575
|
-
const pageIndex = pages.length > 0 ? Math.min(...pages) : void 0;
|
|
576
|
-
const pageEndIndex = pages.length > 0 ? Math.max(...pages) : void 0;
|
|
577
|
-
const headingPath = current.length > 0 ? current[current.length - 1].headingPath : void 0;
|
|
578
|
-
if (text.length > 0 || markdown && markdown.length > 0) {
|
|
579
|
-
chunks.push({
|
|
580
|
-
index: chunks.length,
|
|
581
|
-
text,
|
|
582
|
-
markdown: markdown && markdown.length > 0 ? markdown : void 0,
|
|
583
|
-
headingPath: headingPath && headingPath.length > 0 ? [...headingPath] : void 0,
|
|
584
|
-
pageIndex,
|
|
585
|
-
pageEndIndex
|
|
586
|
-
});
|
|
587
|
-
}
|
|
588
|
-
pendingTextPrefix = overlapChars > 0 && text.length > 0 ? text.slice(Math.max(0, text.length - overlapChars)).trimStart() : "";
|
|
589
|
-
current = [];
|
|
590
|
-
}
|
|
591
|
-
for (let i = 0; i < units.length; i++) {
|
|
592
|
-
const unit = units[i];
|
|
593
|
-
if (preferHeadings && unit.isHeading && current.length > 0) {
|
|
594
|
-
flush();
|
|
595
|
-
}
|
|
596
|
-
if (preserveTables && unit.isTable && unit.text.length > maxChars) {
|
|
597
|
-
if (current.length > 0) flush();
|
|
598
|
-
current = [unit];
|
|
599
|
-
flush();
|
|
600
|
-
continue;
|
|
601
|
-
}
|
|
602
|
-
if (current.length > 0 && projectedTextLength(unit) > maxChars) {
|
|
603
|
-
flush();
|
|
604
|
-
}
|
|
605
|
-
current.push(unit);
|
|
606
|
-
}
|
|
607
|
-
flush();
|
|
608
|
-
if (chunks.length === 0) {
|
|
777
|
+
const packed = packUnitsIntoChunks(units, {
|
|
778
|
+
maxChars,
|
|
779
|
+
overlapChars,
|
|
780
|
+
preferHeadings,
|
|
781
|
+
preserveTables,
|
|
782
|
+
includeMarkdown,
|
|
783
|
+
includePageSpanLabel
|
|
784
|
+
});
|
|
785
|
+
if (packed.length === 0) {
|
|
609
786
|
return [
|
|
610
787
|
{
|
|
611
788
|
index: 0,
|
|
@@ -613,12 +790,14 @@ function splitStructuredIntoChunks(result, options) {
|
|
|
613
790
|
markdown: void 0,
|
|
614
791
|
headingPath: void 0,
|
|
615
792
|
pageIndex: void 0,
|
|
616
|
-
pageEndIndex: void 0
|
|
793
|
+
pageEndIndex: void 0,
|
|
794
|
+
pageSpanLabel: void 0
|
|
617
795
|
}
|
|
618
796
|
];
|
|
619
797
|
}
|
|
620
|
-
return
|
|
798
|
+
return packed.map((c, i) => ({ ...c, index: i }));
|
|
621
799
|
}
|
|
800
|
+
var extractStructuredChunks = splitStructuredIntoChunks;
|
|
622
801
|
|
|
623
802
|
// src/render.ts
|
|
624
803
|
function renderMarkdown(result, options) {
|
|
@@ -627,6 +806,9 @@ function renderMarkdown(result, options) {
|
|
|
627
806
|
/**
 * Render a structured result as LLM-oriented text.
 *
 * Delegates directly to `convertStructuredToLlmText`.
 *
 * @param {object} result - Structured document result.
 * @param {object} [options] - Passed through unchanged.
 * @returns {string}
 */
function renderLlmText(result, options) {
  const llmText = convertStructuredToLlmText(result, options);
  return llmText;
}
|
|
809
|
+
/**
 * Alternative entry point for `renderLlmText`; forwards both arguments and
 * returns its result unchanged.
 *
 * @param {object} result - Structured document result.
 * @param {object} [options] - Passed through unchanged.
 * @returns {string}
 */
function extractLlmContent(result, options) {
  const content = renderLlmText(result, options);
  return content;
}
|
|
630
812
|
function renderMarkdownSections(result, options) {
|
|
631
813
|
const chunks = splitStructuredIntoChunks(result, {
|
|
632
814
|
...options,
|
|
@@ -638,10 +820,548 @@ function renderMarkdownSections(result, options) {
|
|
|
638
820
|
headingPath: c.headingPath,
|
|
639
821
|
pageIndex: c.pageIndex,
|
|
640
822
|
pageEndIndex: c.pageEndIndex,
|
|
823
|
+
pageSpanLabel: c.pageSpanLabel,
|
|
641
824
|
text: c.text.trim().length > 0 ? c.text.trim() : void 0
|
|
642
825
|
}));
|
|
643
826
|
}
|
|
644
827
|
|
|
645
|
-
|
|
828
|
+
// src/dynamic-import-runtime.ts
|
|
829
|
+
/**
 * Detect Node.js by the presence of a string `process.versions.node`.
 *
 * @returns {boolean}
 */
function isNodeJsRuntime() {
  if (typeof process === "undefined") return false;
  const { versions } = process;
  return versions != null && typeof versions.node === "string";
}
|
|
832
|
+
/**
 * Dynamically import an ES module by id.
 *
 * Under Node.js this is a plain `import()`. Elsewhere the import is issued
 * from a function built with `new Function`; the function source is a fixed
 * string and `moduleId` is passed as an argument, never interpolated.
 * NOTE(review): presumably this hides the import from bundlers — confirm.
 * `new Function` can throw synchronously where dynamic code evaluation is
 * blocked.
 *
 * @param {string} moduleId - Module specifier.
 * @returns {Promise<object>} The module namespace.
 */
function importEsm(moduleId) {
  if (!isNodeJsRuntime()) {
    const run = new Function("id", "return import(id)");
    return run(moduleId);
  }
  return import(moduleId);
}
|
|
842
|
+
|
|
843
|
+
// src/node-runtime.ts
|
|
844
|
+
/**
 * Whether the code is running under Node.js, judged by a string
 * `process.versions.node`.
 *
 * @returns {boolean}
 */
function isNodeRuntime() {
  if (typeof process === "undefined") return false;
  return typeof process.versions?.node === "string";
}
|
|
847
|
+
/**
 * Guard for Node-only features.
 *
 * @param {string} capability - Feature name used in the error message.
 * @returns {void}
 * @throws {Error} When `isNodeRuntime()` reports a non-Node environment.
 */
function assertNodeRuntime(capability) {
  if (isNodeRuntime()) return;
  throw new Error(
    `@dragon708/docmind-markdown: ${capability} is only available in Node.js.`
  );
}
|
|
854
|
+
|
|
855
|
+
// src/docx-markdown.ts
|
|
856
|
+
var TABLE_OMITTED_HTML = "\n<p><em>(Table omitted)</em></p>\n";
|
|
857
|
+
/**
 * Reduce converter messages to plain `{ type, message }` records, dropping
 * any other properties.
 *
 * @param {Array<{type: string, message: string}>} messages
 * @returns {Array<{type: string, message: string}>}
 */
function normalizeMammothMessages(messages) {
  return messages.map(({ type, message }) => ({ type, message }));
}
|
|
860
|
+
/**
 * Convert binary input to a Node.js `Buffer`.
 *
 * An existing Buffer is returned as-is; anything else is handed to
 * `Buffer.from`.
 *
 * @param {Buffer|ArrayBuffer|Uint8Array} input
 * @returns {Promise<Buffer>}
 */
async function toNodeBuffer(input) {
  const bufferModule = await importEsm("node:buffer");
  const NodeBuffer = bufferModule.Buffer;
  return NodeBuffer.isBuffer(input) ? input : NodeBuffer.from(input);
}
|
|
866
|
+
/**
 * Replace every `<table>…</table>` element with the `TABLE_OMITTED_HTML`
 * placeholder.
 *
 * Tables are removed innermost-first and the pass is repeated until nothing
 * changes. A single lazy match would stop at the first `</table>`, leaving the
 * tail of an outer table and a stray closing tag behind when tables are
 * nested. A nested group collapses into a single placeholder.
 *
 * @param {string} html
 * @returns {string} HTML without table elements.
 */
function stripTablesFromHtml(html) {
  // Matches a table whose content contains no further opening <table tag.
  const innermostTable = /<table\b[^>]*>(?:(?!<table\b)[\s\S])*?<\/table>/gi;
  let previous;
  let current = html;
  // Each pass removes at least one complete table, and the placeholder
  // contains no table markup, so the loop terminates.
  do {
    previous = current;
    current = previous.replace(innermostTable, () => TABLE_OMITTED_HTML);
  } while (current !== previous);
  return current;
}
|
|
869
|
+
/**
 * Remove every `<img …>` tag (self-closing or not, any letter case).
 *
 * @param {string} html
 * @returns {string}
 */
function stripImagesFromHtml(html) {
  const imgTag = /<img\b[^>]*\/?>/gi;
  return html.replace(imgTag, "");
}
|
|
872
|
+
/**
 * Remove `<hr>` tags whose markup contains the word `page-break`; all other
 * `<hr>` tags are left untouched.
 *
 * @param {string} html
 * @returns {string}
 */
function stripPageBreakHrsFromHtml(html) {
  const isPageBreak = (tag) => /\bpage-break\b/i.test(tag);
  return html.replace(/<hr\b[^>]*>/gi, (tag) => {
    if (isPageBreak(tag)) return "";
    return tag;
  });
}
|
|
878
|
+
/**
 * Combine the built-in page-break style mapping with user-supplied mappings.
 *
 * @param {boolean} includePageBreaks - Add the `br[type=page]` mapping first.
 * @param {string|string[]|undefined} user - One mapping or a list of mappings;
 *   any other value is ignored.
 * @returns {string[]|undefined} The merged list, or `undefined` when empty.
 */
function mergeStyleMaps(includePageBreaks, user) {
  const builtIn = includePageBreaks ? ["br[type=page] => hr.page-break"] : [];
  let custom = [];
  if (typeof user === "string") {
    custom = [user];
  } else if (Array.isArray(user)) {
    custom = user;
  }
  const merged = [...builtIn, ...custom];
  return merged.length > 0 ? merged : void 0;
}
|
|
888
|
+
/**
 * Tighten Markdown whitespace: strip trailing spaces/tabs on each
 * `\n`-delimited line, collapse runs of three or more newlines into a single
 * blank line, and trim both ends.
 *
 * @param {string} markdown - Markdown text to compact.
 * @returns {string} The compacted Markdown.
 */
function applyCompactMarkdown(markdown) {
  const cleanedLines = [];
  for (const line of markdown.split("\n")) {
    cleanedLines.push(line.replace(/[ \t]+$/g, ""));
  }
  const collapsed = cleanedLines.join("\n").replace(/\n{3,}/g, "\n\n");
  return collapsed.trim();
}
|
|
891
|
+
/**
 * Decide whether converted Markdown is too thin to accept.
 *
 * @param {string} markdown - Markdown produced by the direct conversion.
 * @param {number|undefined} minLen - Minimum acceptable trimmed length; only
 *   applied when it is defined and greater than zero.
 * @returns {"empty"|"short"|null} `"empty"` for whitespace-only output,
 *   `"short"` when below `minLen`, otherwise `null`.
 */
function shouldTryStructuredFallback(markdown, minLen) {
  const contentLength = markdown.trim().length;
  if (contentLength === 0) return "empty";
  const hasThreshold = minLen !== void 0 && minLen > 0;
  return hasThreshold && contentLength < minLen ? "short" : null;
}
|
|
897
|
+
/**
 * Default Turndown options: ATX headings, fenced code blocks, `-` bullets.
 *
 * @returns {{headingStyle: string, codeBlockStyle: string, bulletListMarker: string}}
 *   A fresh options object on every call.
 */
function buildTurndownBaseOptions() {
  const headingStyle = "atx";
  const codeBlockStyle = "fenced";
  const bulletListMarker = "-";
  return { headingStyle, codeBlockStyle, bulletListMarker };
}
|
|
904
|
+
/**
 * Convert a DOCX document to Markdown via Mammoth (DOCX → HTML) and Turndown
 * (HTML → Markdown), optionally falling back to a structured document.
 *
 * Options read by this function:
 * - `includeTables`, `includeImages`, `includePageBreaks`: on unless set to `false`.
 * - `compactMode`: whitespace compaction, on only when exactly `true`.
 * - `minMarkdownLength`: output shorter than this triggers the fallback.
 * - `resolveStructured`: async provider of the structured fallback document.
 * - `structuredMarkdown`: options forwarded to `convertStructuredToMarkdown`.
 * - `mammoth`: options forwarded to `mammoth.convertToHtml`.
 * - `turndown`: options merged over the Turndown defaults.
 *
 * @param {Buffer|ArrayBuffer|Uint8Array} input - DOCX bytes.
 * @param {object} [options] - See the list above.
 * @returns {Promise<{markdown: string, source: string, messages: Array<{type: string, message: string}>, fallbackReason?: string}>}
 *   `source` is `"mammoth-turndown"` or `"structured-fallback"`.
 * @throws {Error} Outside Node.js; when the direct conversion fails and no
 *   `resolveStructured` is available; or when the fallback itself fails after
 *   a failed direct conversion.
 */
async function convertDocxToMarkdown(input, options) {
  assertNodeRuntime("DOCX \u2192 Markdown (Mammoth \u2192 Turndown)");
  const includeTables = options?.includeTables !== false;
  const includeImages = options?.includeImages !== false;
  const includePageBreaks = options?.includePageBreaks !== false;
  const compactMode = options?.compactMode === true;
  const minMarkdownLength = options?.minMarkdownLength;
  const resolveStructured = options?.resolveStructured;
  const structuredMdOpts = options?.structuredMarkdown;
  // The GFM plugin is only loaded when tables are kept.
  const [{ default: mammoth }, { default: TurndownService }, { gfm }, buffer] = await Promise.all([
    importEsm("mammoth"),
    importEsm("turndown"),
    includeTables ? importEsm("turndown-plugin-gfm") : Promise.resolve({ gfm: null }),
    toNodeBuffer(input)
  ]);
  const styleMap = mergeStyleMaps(includePageBreaks, options?.mammoth?.styleMap);
  const mammothOpts = {
    ...options?.mammoth,
    ...styleMap !== void 0 ? { styleMap } : {}
  };
  if (includeImages && mammothOpts.convertImage === void 0) {
    mammothOpts.convertImage = mammoth.images.dataUri;
  }

  // Direct path: Mammoth HTML, optional stripping, then Turndown.
  const runDirect = async () => {
    const htmlResult = await mammoth.convertToHtml({ buffer }, mammothOpts);
    let html = htmlResult.value;
    if (!includeTables) html = stripTablesFromHtml(html);
    if (!includeImages) html = stripImagesFromHtml(html);
    if (!includePageBreaks) html = stripPageBreakHrsFromHtml(html);
    const service = new TurndownService({
      ...buildTurndownBaseOptions(),
      ...options?.turndown
    });
    if (includeTables && gfm) {
      gfm(service);
    }
    let markdown = service.turndown(html).trim();
    if (compactMode) markdown = applyCompactMarkdown(markdown);
    return {
      markdown,
      messages: normalizeMammothMessages(htmlResult.messages)
    };
  };

  // Fallback path: render the caller-provided structured document instead.
  const runFallback = async (reason, priorMessages, err) => {
    if (!resolveStructured) {
      if (reason === "error" && err !== void 0) throw err;
      return {
        markdown: "",
        source: "mammoth-turndown",
        messages: priorMessages,
        fallbackReason: reason
      };
    }
    const structured = await resolveStructured();
    const md = convertStructuredToMarkdown(structured, structuredMdOpts);
    const extra = [];
    if (reason === "error" && err !== void 0) {
      extra.push({
        type: "warning",
        message: `DOCX direct conversion failed; used structured fallback: ${String(err)}`
      });
    }
    return {
      markdown: compactMode ? applyCompactMarkdown(md) : md.trim(),
      source: "structured-fallback",
      messages: [...priorMessages, ...extra],
      fallbackReason: reason
    };
  };

  let direct;
  try {
    direct = await runDirect();
  } catch (err) {
    return await runFallback("error", [], err);
  }
  const { markdown, messages } = direct;
  const insuff = shouldTryStructuredFallback(markdown, minMarkdownLength);
  if (insuff && resolveStructured) {
    // Kept outside the try/catch above on purpose: a failing fallback must not
    // be reported as a failed direct conversion, nor trigger a second
    // `resolveStructured()` call.
    try {
      return await runFallback(insuff, messages);
    } catch (fallbackErr) {
      const detail = fallbackErr instanceof Error ? fallbackErr.message : String(fallbackErr);
      return {
        markdown,
        source: "mammoth-turndown",
        messages: [
          ...messages,
          { type: "warning", message: `Structured fallback failed: ${detail}` }
        ],
        fallbackReason: insuff
      };
    }
  }
  return { markdown, source: "mammoth-turndown", messages };
}
|
|
990
|
+
/**
 * Wrapper around `convertDocxToMarkdown` that returns only the Markdown text
 * and the converter messages.
 *
 * @param {Buffer|ArrayBuffer|Uint8Array} input - DOCX bytes.
 * @param {object} [options] - Passed through to `convertDocxToMarkdown`.
 * @returns {Promise<{markdown: string, messages: Array<{type: string, message: string}>}>}
 */
async function convertDocxBufferToMarkdown(input, options) {
  const { markdown, messages } = await convertDocxToMarkdown(input, options);
  return { markdown, messages };
}
|
|
994
|
+
|
|
995
|
+
// src/pdf-markdown.ts
|
|
996
|
+
// Warning text returned when PDF conversion is requested outside Node.js.
const BROWSER_WARNING = "@dragon708/docmind-markdown: PDF \u2192 Markdown via @opendataloader/pdf requires Node.js. In the browser, use a server-side conversion or supply structured text/Markdown from your backend.";
|
|
997
|
+
/**
 * Trim PDF-derived Markdown and, when `clean` is truthy, collapse runs of
 * three or more newlines into a single blank line.
 *
 * @param {string} markdown - Raw Markdown text.
 * @param {boolean} clean - Whether to collapse excess blank lines.
 * @returns {string} The normalized Markdown.
 */
function normalizePdfMarkdown(markdown, clean) {
  const trimmed = markdown.trim();
  return clean ? trimmed.replace(/\n{3,}/g, "\n\n") : trimmed;
}
|
|
1002
|
+
/**
 * Copy the PDF options, leaving out the keys this package handles itself
 * (`resolveStructured`, `structuredMarkdown`, `cleanMarkdown`).
 *
 * @param {object} [options] - PDF conversion options.
 * @returns {object} A new object holding the remaining options; `{}` when
 *   `options` is falsy.
 */
function engineOptions(options) {
  if (!options) return {};
  const forwarded = { ...options };
  for (const localKey of ["resolveStructured", "structuredMarkdown", "cleanMarkdown"]) {
    delete forwarded[localKey];
  }
  return forwarded;
}
|
|
1012
|
+
/**
 * Coerce binary input into a Node.js `Buffer` (PDF-side copy of the DOCX
 * helper).
 *
 * @param {Buffer|ArrayBuffer|Uint8Array} input - Bytes to wrap.
 * @returns {Promise<Buffer>} `input` itself when already a Buffer, otherwise
 *   the result of `Buffer.from(input)`.
 */
async function toNodeBuffer2(input) {
  const { Buffer: NodeBuffer } = await importEsm("node:buffer");
  if (NodeBuffer.isBuffer(input)) {
    return input;
  }
  return NodeBuffer.from(input);
}
|
|
1018
|
+
/**
 * Convert a PDF to Markdown with `@opendataloader/pdf`, optionally falling
 * back to a structured document.
 *
 * Binary input is written to a temporary directory, which is removed again
 * before the function returns. Load failures, conversion errors and empty
 * output are reported through `warnings` and `fallbackReason` rather than
 * thrown.
 *
 * @param {string|Buffer|ArrayBuffer|Uint8Array} input - Path to a PDF file, or
 *   the PDF bytes.
 * @param {object} [options] - `cleanMarkdown` (on unless `false`),
 *   `resolveStructured` (async provider of the fallback document),
 *   `structuredMarkdown` (options for `convertStructuredToMarkdown`); all other
 *   keys are forwarded to the PDF engine's `convert`.
 * @returns {Promise<{markdown: string, warnings: string[], source: string, fallbackReason?: string}>}
 *   `source` is `"opendataloader"`, `"structured-fallback"` or
 *   `"unsupported-runtime"`.
 * @throws {Error} If preparing the temporary file fails.
 */
async function convertPdfToMarkdown(input, options) {
  const clean = options?.cleanMarkdown !== false;
  const resolveStructured = options?.resolveStructured;
  const structuredMdOpts = options?.structuredMarkdown;
  const eng = engineOptions(options);
  if (!isNodeRuntime()) {
    return {
      markdown: "",
      warnings: [BROWSER_WARNING],
      source: "unsupported-runtime",
      fallbackReason: "unsupported-runtime"
    };
  }
  const warnings = [];

  // Shared exit for every failure: try the structured fallback when one was
  // provided, otherwise (or if it fails too) return empty Markdown.
  const failWith = async (reason) => {
    if (resolveStructured) {
      try {
        const structured = await resolveStructured();
        const md = normalizePdfMarkdown(
          convertStructuredToMarkdown(structured, structuredMdOpts),
          clean
        );
        return {
          markdown: md,
          warnings,
          source: "structured-fallback",
          fallbackReason: reason
        };
      } catch (e2) {
        warnings.push(
          `Structured fallback failed: ${e2 instanceof Error ? e2.message : String(e2)}`
        );
      }
    }
    return {
      markdown: "",
      warnings,
      source: "opendataloader",
      fallbackReason: reason
    };
  };

  let cleanup;
  try {
    let inputPath;
    if (typeof input === "string") {
      inputPath = input;
    } else {
      const [{ mkdtemp, writeFile, rm }, { join }, { tmpdir }, buffer] = await Promise.all([
        importEsm("node:fs/promises"),
        importEsm("node:path"),
        importEsm("node:os"),
        toNodeBuffer2(input)
      ]);
      const dir = await mkdtemp(join(tmpdir(), "docmind-markdown-pdf-"));
      // Register cleanup before writing, so a failed write cannot leak the directory.
      cleanup = async () => rm(dir, { recursive: true, force: true });
      inputPath = join(dir, "document.pdf");
      await writeFile(inputPath, buffer);
    }

    let convert;
    try {
      ({ convert } = await importEsm("@opendataloader/pdf"));
    } catch (e) {
      const detail = e instanceof Error ? e.message : String(e);
      // CommonJS reports "Cannot find module" / MODULE_NOT_FOUND; the ESM
      // loader reports "Cannot find package" with code ERR_MODULE_NOT_FOUND.
      const missing = e instanceof Error && (
        /Cannot find (?:module|package)|MODULE_NOT_FOUND/i.test(e.message) ||
        e.code === "ERR_MODULE_NOT_FOUND" ||
        e.code === "MODULE_NOT_FOUND"
      );
      const hint = missing ? " Install `@opendataloader/pdf` in your project." : "";
      warnings.push(`@opendataloader/pdf could not be loaded (${detail}).${hint}`);
      return await failWith("module-not-found");
    }

    let rawMarkdown;
    try {
      rawMarkdown = String(
        await convert(inputPath, {
          ...eng,
          format: "markdown",
          toStdout: true,
          quiet: eng.quiet !== false
        })
      );
    } catch (e) {
      warnings.push(`PDF conversion failed: ${e instanceof Error ? e.message : String(e)}`);
      return await failWith("error");
    }

    const markdown = normalizePdfMarkdown(rawMarkdown, clean);
    if (markdown.length === 0) {
      warnings.push("OpenDataLoader returned empty Markdown for this PDF.");
      return await failWith("empty");
    }
    return { markdown, warnings, source: "opendataloader" };
  } finally {
    if (cleanup) {
      // Best-effort removal: a cleanup failure must not mask the result.
      await cleanup().catch(() => {
      });
    }
  }
}
|
|
1158
|
+
/**
 * Turn a soft-failure PDF result into an exception for the legacy entry
 * points.
 *
 * @param {{source: string, markdown: string, warnings: string[]}} r - Result
 *   of `convertPdfToMarkdown`.
 * @returns {void}
 * @throws {Error} When the runtime is unsupported, or when the Markdown is
 *   blank and did not come from the structured fallback.
 */
function throwIfLegacyFailure(r) {
  let failure = null;
  if (r.source === "unsupported-runtime") {
    failure = r.warnings[0] ?? "PDF \u2192 Markdown requires Node.js.";
  } else if (r.markdown.trim().length === 0 && r.source !== "structured-fallback") {
    failure = r.warnings.length > 0 ? r.warnings.join("; ") : "PDF conversion produced no Markdown.";
  }
  if (failure !== null) {
    throw new Error(failure);
  }
}
|
|
1168
|
+
/**
 * Legacy path-based PDF conversion that throws on failure instead of
 * returning warnings.
 *
 * @param {string} inputPath - Path of the PDF file.
 * @param {object} [options] - Passed through to `convertPdfToMarkdown`.
 * @returns {Promise<{markdown: string}>} The converted Markdown.
 * @throws {Error} Outside Node.js, or when `throwIfLegacyFailure` rejects the result.
 */
async function convertPdfPathToMarkdown(inputPath, options) {
  assertNodeRuntime("PDF \u2192 Markdown (@opendataloader/pdf)");
  const outcome = await convertPdfToMarkdown(inputPath, options);
  throwIfLegacyFailure(outcome);
  const { markdown } = outcome;
  return { markdown };
}
|
|
1174
|
+
/**
 * Legacy bytes-based PDF conversion that throws on failure instead of
 * returning warnings.
 *
 * @param {Buffer|ArrayBuffer|Uint8Array} input - PDF bytes.
 * @param {object} [options] - Passed through to `convertPdfToMarkdown`.
 * @returns {Promise<{markdown: string}>} The converted Markdown.
 * @throws {Error} Outside Node.js, or when `throwIfLegacyFailure` rejects the result.
 */
async function convertPdfBufferToMarkdown(input, options) {
  assertNodeRuntime("PDF \u2192 Markdown (@opendataloader/pdf)");
  const outcome = await convertPdfToMarkdown(input, options);
  throwIfLegacyFailure(outcome);
  const { markdown } = outcome;
  return { markdown };
}
|
|
1180
|
+
/**
 * Check whether a value is an `ArrayBuffer`, a `Uint8Array` or a Node.js
 * `Buffer`.
 *
 * @param {unknown} data - Value to inspect.
 * @returns {boolean} `true` for any of the three accepted byte containers.
 */
function isArrayBufferLike(data) {
  return (
    data instanceof ArrayBuffer ||
    (typeof Uint8Array !== "undefined" && data instanceof Uint8Array) ||
    (typeof Buffer !== "undefined" && Buffer.isBuffer(data))
  );
}
|
|
1186
|
+
/**
 * Type guard for `{ data, … }` inputs: an object whose `data` property passes
 * `isArrayBufferLike`.
 *
 * @param {unknown} value - Candidate input.
 * @returns {boolean} `true` when `value` carries binary `data`.
 */
function isExtractMarkdownFileInput(value) {
  const isObject = value !== null && typeof value === "object";
  return isObject && "data" in value && isArrayBufferLike(value.data);
}
|
|
1190
|
+
/**
 * Type guard for `{ path, … }` inputs: an object whose `path` property is a
 * string.
 *
 * @param {unknown} value - Candidate input.
 * @returns {boolean} `true` when `value` carries a string `path`.
 */
function isExtractMarkdownPathInput(value) {
  const isObject = value !== null && typeof value === "object";
  return isObject && "path" in value && typeof value.path === "string";
}
|
|
1194
|
+
/**
 * Copy the extract options, leaving out the routing keys
 * (`structuredFallback`, `docx`, `pdf`).
 *
 * @param {object} [options] - Options given to `extractMarkdown`.
 * @returns {object} A new object holding the remaining options; `{}` when
 *   `options` is falsy.
 */
function pickStructuredMarkdownOptions(options) {
  if (!options) return {};
  const remaining = { ...options };
  delete remaining.structuredFallback;
  delete remaining.docx;
  delete remaining.pdf;
  return remaining;
}
|
|
1199
|
+
/**
 * Derive the options for `convertDocxToMarkdown` from the `extractMarkdown`
 * options.
 *
 * `docx.resolveStructured` wins; otherwise a truthy `structuredFallback` is
 * wrapped in a resolver. Top-level structured-Markdown options serve as
 * defaults that `docx.structuredMarkdown` overrides.
 *
 * @param {object} [extract] - Options given to `extractMarkdown`.
 * @returns {object} Options for the DOCX converter.
 */
function buildDocxOptions(extract) {
  const docxOptions = extract?.docx;
  const fallbackDoc = extract?.structuredFallback;
  const inherited = pickStructuredMarkdownOptions(extract);
  let resolveStructured = docxOptions?.resolveStructured;
  if (resolveStructured == null) {
    resolveStructured = fallbackDoc ? () => Promise.resolve(fallbackDoc) : void 0;
  }
  return {
    ...docxOptions,
    resolveStructured,
    structuredMarkdown: { ...inherited, ...docxOptions?.structuredMarkdown }
  };
}
|
|
1209
|
+
/**
 * Derive the options for `convertPdfToMarkdown` from the `extractMarkdown`
 * options.
 *
 * `pdf.resolveStructured` wins; otherwise a truthy `structuredFallback` is
 * wrapped in a resolver. Top-level structured-Markdown options serve as
 * defaults that `pdf.structuredMarkdown` overrides.
 *
 * @param {object} [extract] - Options given to `extractMarkdown`.
 * @returns {object} Options for the PDF converter.
 */
function buildPdfOptions(extract) {
  const pdfOptions = extract?.pdf;
  const fallbackDoc = extract?.structuredFallback;
  const inherited = pickStructuredMarkdownOptions(extract);
  let resolveStructured = pdfOptions?.resolveStructured;
  if (resolveStructured == null) {
    resolveStructured = fallbackDoc ? () => Promise.resolve(fallbackDoc) : void 0;
  }
  return {
    ...pdfOptions,
    resolveStructured,
    structuredMarkdown: { ...inherited, ...pdfOptions?.structuredMarkdown }
  };
}
|
|
1219
|
+
/**
 * Present binary data as a `Uint8Array`.
 *
 * An existing `Uint8Array` is returned as-is; anything else is passed to the
 * `Uint8Array` constructor.
 *
 * @param {Uint8Array|ArrayBuffer|ArrayLike<number>} data - Bytes to view.
 * @returns {Uint8Array} Byte view of `data`.
 */
function toUint8View(data) {
  return data instanceof Uint8Array ? data : new Uint8Array(data);
}
|
|
1224
|
+
/**
 * Guess whether binary data is a PDF or a DOCX file.
 *
 * Hints are checked first (MIME type, then file extension, case-insensitive);
 * only without a matching hint are the leading bytes inspected. Any data that
 * starts with a ZIP signature is reported as `"docx"`, because the container
 * contents are not examined.
 *
 * @param {Uint8Array|ArrayBuffer} data - File bytes.
 * @param {string} [filename] - Optional file name used for its extension.
 * @param {string} [mimeType] - Optional MIME type.
 * @returns {"pdf"|"docx"|"unknown"} The detected format.
 */
function detectBinaryFormat(data, filename, mimeType) {
  const bytes = toUint8View(data);
  const lowerName = filename?.toLowerCase() ?? "";
  const lowerMime = mimeType?.toLowerCase() ?? "";
  if (lowerMime.includes("pdf") || lowerName.endsWith(".pdf")) return "pdf";
  if (lowerMime.includes("wordprocessingml") || lowerName.endsWith(".docx")) return "docx";
  if (bytes.length >= 4) {
    const b0 = bytes[0];
    const b1 = bytes[1];
    const b2 = bytes[2];
    const b3 = bytes[3];
    // "%PDF"
    if (b0 === 0x25 && b1 === 0x50 && b2 === 0x44 && b3 === 0x46) return "pdf";
    // ZIP signatures are four bytes: PK\x03\x04 (local file header),
    // PK\x05\x06 (empty archive) and PK\x07\x08 (spanned archive). The
    // fourth byte is checked too, so arbitrary "PK\x03?" data is not accepted.
    const zipTail = (b2 === 0x03 && b3 === 0x04) || (b2 === 0x05 && b3 === 0x06) || (b2 === 0x07 && b3 === 0x08);
    if (b0 === 0x50 && b1 === 0x4b && zipTail) return "docx";
  }
  return "unknown";
}
|
|
1240
|
+
/**
 * Map a DOCX conversion `source` onto the strategy label reported by
 * `extractMarkdown`.
 *
 * @param {string} source - `source` field of the DOCX conversion result.
 * @returns {"docx-structured-fallback"|"docx-mammoth"} The strategy label.
 */
function docxStrategyFromSource(source) {
  if (source === "structured-fallback") {
    return "docx-structured-fallback";
  }
  return "docx-mammoth";
}
|
|
1243
|
+
/**
 * Map a PDF conversion result onto the strategy label reported by
 * `extractMarkdown`.
 *
 * @param {{source: string}} r - Result of `convertPdfToMarkdown`.
 * @returns {"pdf-structured-fallback"|"pdf-unsupported-runtime"|"pdf-opendataloader"}
 *   The strategy label.
 */
function pdfStrategyFromResult(r) {
  switch (r.source) {
    case "structured-fallback":
      return "pdf-structured-fallback";
    case "unsupported-runtime":
      return "pdf-unsupported-runtime";
    default:
      return "pdf-opendataloader";
  }
}
|
|
1248
|
+
/**
 * Concatenate warning lists into a new array, skipping falsy entries.
 *
 * @param {string[]} base - Warnings that come first.
 * @param {...(string[]|undefined|null)} more - Further lists to append in order.
 * @returns {string[]} A new array; `base` is not modified.
 */
function mergeWarnings(base, ...more) {
  const appended = more.flatMap((list) => (list ? [...list] : []));
  return [...base, ...appended];
}
|
|
1255
|
+
/**
 * Single entry point that turns any supported input into Markdown.
 *
 * Accepted inputs:
 * - a structured document result, rendered directly;
 * - `{ path, filename?, mimeType? }`, read from disk (Node.js only);
 * - `{ data, filename?, mimeType? }`, binary bytes.
 *
 * Binary content is routed to the DOCX or PDF converter according to
 * `detectBinaryFormat`. Whenever a route is unavailable or the input is not
 * recognised, `options.structuredFallback` is rendered instead when provided;
 * otherwise empty Markdown is returned. Those cases are reported through
 * `warnings` and `strategy`.
 *
 * @param {object} input - One of the input shapes listed above.
 * @param {object} [options] - `structuredFallback`, `docx`, `pdf`, plus
 *   structured-Markdown options applied when rendering structured documents.
 * @returns {Promise<{markdown: string, warnings: string[], strategy: string}>}
 *   The Markdown, the collected warnings and the route that produced it.
 */
async function extractMarkdown(input, options) {
  const smOpts = pickStructuredMarkdownOptions(options);
  const fb = options?.structuredFallback;
  if (isStructuredDocumentResult(input)) {
    return {
      markdown: convertStructuredToMarkdown(input, smOpts),
      warnings: mergeWarnings([], input.warnings),
      strategy: "structured"
    };
  }

  const warnings = [];
  // Shared exit for unavailable routes: record the warning, then render the
  // structured fallback if one was supplied, else return empty Markdown.
  const bail = (message, fallbackStrategy, emptyStrategy = fallbackStrategy) => {
    warnings.push(message);
    if (fb) {
      return {
        markdown: convertStructuredToMarkdown(fb, smOpts),
        warnings: mergeWarnings(warnings, fb.warnings),
        strategy: fallbackStrategy
      };
    }
    return { markdown: "", warnings, strategy: emptyStrategy };
  };

  let data;
  let filename;
  let mimeType;
  if (isExtractMarkdownPathInput(input)) {
    if (!isNodeRuntime()) {
      return bail(
        "@dragon708/docmind-markdown: `path` input requires Node.js to read the file. Provide `data` bytes or a StructuredDocumentResult instead.",
        "path-requires-node"
      );
    }
    const { readFile } = await importEsm("node:fs/promises");
    const { basename } = await importEsm("node:path");
    data = await readFile(input.path);
    filename = input.filename ?? basename(input.path);
    mimeType = input.mimeType;
  } else if (isExtractMarkdownFileInput(input)) {
    ({ data, filename, mimeType } = input);
  } else {
    return bail(
      "@dragon708/docmind-markdown: extractMarkdown input must be a StructuredDocumentResult, { data, \u2026 }, or { path, \u2026 }.",
      "binary-unidentified-structured-fallback",
      "binary-unidentified"
    );
  }

  const fmt = detectBinaryFormat(data, filename, mimeType);

  if (fmt === "docx") {
    if (!isNodeRuntime()) {
      return bail(
        "@dragon708/docmind-markdown: DOCX binary conversion needs Node.js (Mammoth/Turndown). Use structured input or run on the server.",
        "docx-requires-node"
      );
    }
    const docxResult = await convertDocxToMarkdown(data, buildDocxOptions(options));
    const messageTexts = docxResult.messages.map((m) => m.message);
    return {
      markdown: docxResult.markdown,
      warnings: mergeWarnings(warnings, messageTexts),
      strategy: docxStrategyFromSource(docxResult.source)
    };
  }

  if (fmt === "pdf") {
    const pdfResult = await convertPdfToMarkdown(data, buildPdfOptions(options));
    const strategy = pdfStrategyFromResult(pdfResult);
    const pdfWarnings = mergeWarnings(warnings, pdfResult.warnings);
    const routeUnavailable = strategy === "pdf-unsupported-runtime" && pdfResult.markdown === "";
    if (routeUnavailable && fb) {
      return {
        markdown: convertStructuredToMarkdown(fb, smOpts),
        warnings: mergeWarnings(pdfWarnings, fb.warnings, [
          "extractMarkdown: PDF route unavailable in this runtime; used structuredFallback."
        ]),
        strategy: "pdf-structured-fallback"
      };
    }
    return { markdown: pdfResult.markdown, warnings: pdfWarnings, strategy };
  }

  return bail(
    "@dragon708/docmind-markdown: Unidentified binary format (expected PDF magic or ZIP/DOCX). Using structured fallback if provided.",
    "binary-unidentified-structured-fallback",
    "binary-unidentified"
  );
}
|
|
1364
|
+
|
|
1365
|
+
export { convertDocxBufferToMarkdown, convertDocxToMarkdown, convertPdfBufferToMarkdown, convertPdfPathToMarkdown, convertPdfToMarkdown, convertStructuredToLlmText, convertStructuredToMarkdown, detectBinaryFormat, extractLlmContent, extractMarkdown, extractStructuredChunks, isExtractMarkdownFileInput, renderLlmText, renderMarkdown, renderMarkdownSections, splitStructuredIntoChunks, structuredDocumentToLlmText, structuredDocumentToMarkdown };
|
|
646
1366
|
//# sourceMappingURL=index.js.map
|
|
647
1367
|
//# sourceMappingURL=index.js.map
|