@dragon708/docmind-markdown 1.0.0 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. package/dist/index.d.ts +306 -9
  2. package/dist/index.js +819 -99
  3. package/package.json +19 -2
package/dist/index.js CHANGED
@@ -1,4 +1,6 @@
1
- // src/structuredToMarkdown.ts
1
+ import { isStructuredDocumentResult } from '@dragon708/docmind-shared';
2
+
3
+ // src/structured-markdown.ts
2
4
  function clampHeadingLevel(level) {
3
5
  if (level === void 0 || !Number.isFinite(level)) return 2;
4
6
  const n = Math.floor(level);
@@ -12,6 +14,9 @@ function escapeTableCell(text) {
12
14
  function safeString(s) {
13
15
  return typeof s === "string" ? s : "";
14
16
  }
17
+ function blockText(s) {
18
+ return safeString(s).trim();
19
+ }
15
20
  function safeArrays(result) {
16
21
  return {
17
22
  blocks: Array.isArray(result.blocks) ? result.blocks : [],
@@ -41,6 +46,21 @@ function metadataHeaderLines(meta) {
41
46
  }
42
47
  return lines;
43
48
  }
49
+ function expandTableRowForMarkdown(row) {
50
+ const out = [];
51
+ for (const cell of row) {
52
+ const base = escapeTableCell(cell.text);
53
+ const rs = cell.rowSpan !== void 0 && Number.isFinite(cell.rowSpan) ? Math.max(1, Math.floor(cell.rowSpan)) : 1;
54
+ const cs = cell.colSpan !== void 0 && Number.isFinite(cell.colSpan) ? Math.max(1, Math.floor(cell.colSpan)) : 1;
55
+ const note = rs > 1 ? `${base} *(rows: ${rs})*` : base;
56
+ out.push({ text: note });
57
+ for (let i = 1; i < cs; i++) out.push({ text: "" });
58
+ }
59
+ return out;
60
+ }
61
+ function tableRowWidth(row) {
62
+ return row.length;
63
+ }
44
64
  function tableToMarkdown(table) {
45
65
  const rows = table.rows;
46
66
  if (rows.length === 0) {
@@ -49,10 +69,12 @@ function tableToMarkdown(table) {
49
69
  *(empty table)*
50
70
  ` : "*(empty table)*\n";
51
71
  }
52
- const width = Math.max(...rows.map((r) => r.length));
53
- const header = rows[0] ?? [];
72
+ const expanded = rows.map((r) => expandTableRowForMarkdown(r));
73
+ const width = Math.max(1, ...expanded.map(tableRowWidth));
74
+ const padRow = (cells) => Array.from({ length: width }, (_, i) => escapeTableCell(cells[i]?.text ?? ""));
75
+ const line = (cells) => `| ${padRow(cells).join(" | ")} |`;
76
+ const header = expanded[0] ?? [];
54
77
  const sep = Array.from({ length: width }, () => "---");
55
- const line = (cells) => `| ${Array.from({ length: width }, (_, i) => escapeTableCell(cells[i]?.text ?? "")).join(" | ")} |`;
56
78
  const out = [];
57
79
  if (table.caption) {
58
80
  out.push(`**${escapeTableCell(table.caption)}**`);
@@ -60,8 +82,8 @@ function tableToMarkdown(table) {
60
82
  }
61
83
  out.push(line(header));
62
84
  out.push(`| ${sep.join(" | ")} |`);
63
- for (let r = 1; r < rows.length; r++) {
64
- out.push(line(rows[r]));
85
+ for (let r = 1; r < expanded.length; r++) {
86
+ out.push(line(expanded[r]));
65
87
  }
66
88
  return `${out.join("\n")}
67
89
  `;
@@ -88,10 +110,12 @@ function referencedImageIds(blocks) {
88
110
  }
89
111
  function convertStructuredToMarkdown(result, options) {
90
112
  const imagePlaceholder = options?.imagePlaceholder ?? "<!-- image: no src -->";
113
+ const imageMissingSrcMode = options?.imageMissingSrcMode ?? "placeholder";
91
114
  const pageSep = (options?.pageSeparator ?? "---").trimEnd();
92
115
  const pageTransitions = options?.pageTransitionMarkers !== false;
93
116
  const appendOrphanTables = options?.appendUnreferencedTables !== false;
94
117
  const appendOrphanImages = options?.appendUnreferencedImages === true;
118
+ const appendWarningsSection = options?.appendWarningsSection === true;
95
119
  const { blocks, tables, pages, images } = safeArrays(result);
96
120
  const hasPageModel = pages.length > 0;
97
121
  const parts = [];
@@ -139,11 +163,12 @@ function convertStructuredToMarkdown(result, options) {
139
163
  tables,
140
164
  images,
141
165
  imagePlaceholder,
166
+ imageMissingSrcMode,
142
167
  orderedDepthCounters,
143
168
  resetListState,
144
169
  pageSep
145
170
  );
146
- parts.push(chunk);
171
+ if (chunk.length > 0) parts.push(chunk);
147
172
  }
148
173
  if (appendOrphanTables) {
149
174
  const used = referencedTableIds(blocks);
@@ -176,6 +201,18 @@ function convertStructuredToMarkdown(result, options) {
176
201
  }
177
202
  }
178
203
  }
204
+ if (appendWarningsSection) {
205
+ const warns = Array.isArray(result.warnings) ? result.warnings : [];
206
+ if (warns.length > 0) {
207
+ parts.push("");
208
+ parts.push("### Extraction warnings");
209
+ parts.push("");
210
+ for (const w of warns) {
211
+ const line = String(w).replace(/\r?\n/g, " ").trim();
212
+ if (line.length > 0) parts.push(`- ${line}`);
213
+ }
214
+ }
215
+ }
179
216
  let body = parts.join("\n\n").replace(/\n{3,}/g, "\n\n").trimEnd();
180
217
  if (body.length === 0) {
181
218
  body = safeString(result.text).trim();
@@ -188,49 +225,93 @@ function escapeCommentText(s) {
188
225
  function listItemLine(block, orderedDepthCounters) {
189
226
  const depth = Math.max(0, block.depth ?? 0);
190
227
  const indent = " ".repeat(depth);
191
- const style = block.listStyle ?? "unordered";
228
+ const style = block.listStyle === "ordered" ? "ordered" : "unordered";
192
229
  if (style === "ordered") {
193
230
  while (orderedDepthCounters.length <= depth) orderedDepthCounters.push(0);
194
231
  orderedDepthCounters.length = depth + 1;
195
232
  orderedDepthCounters[depth] = (orderedDepthCounters[depth] ?? 0) + 1;
196
233
  const n = orderedDepthCounters[depth];
197
- return `${indent}${n}. ${block.text.trim()}`;
234
+ return `${indent}${n}. ${blockText(block.text)}`;
198
235
  }
199
236
  orderedDepthCounters.length = depth;
200
- return `${indent}- ${block.text.trim()}`;
237
+ return `${indent}- ${blockText(block.text)}`;
238
+ }
239
+ function imageRefWithoutSrcMarkdown(imageId, altDisplay, imagePlaceholder, imageMissingSrcMode, kind) {
240
+ if (imageMissingSrcMode === "llm-label") {
241
+ const altPart = altDisplay.length > 0 ? ` \u2014 _${altDisplay.replace(/_/g, "\\_")}_` : "";
242
+ return `*[Image: \`${escapeBackticks(imageId)}\`${altPart}]*`;
243
+ }
244
+ const hint = kind === "placeholder" ? " (placeholder)" : kind === "embedded" ? " (embedded)" : kind === "external" ? " (external, no URL)" : "";
245
+ return `${imagePlaceholder}${hint}`;
246
+ }
247
+ function escapeBackticks(s) {
248
+ return s.replace(/`/g, "\\`");
201
249
  }
202
- function blockToMarkdown(block, tables, images, imagePlaceholder, orderedDepthCounters, resetListState, pageSep) {
250
+ function quoteMarkdownLines(text, prefix) {
251
+ return text.split(/\r?\n/).map((ln) => `${prefix}${ln}`).join("\n");
252
+ }
253
+ function unknownBlockToMarkdown(block) {
254
+ const hint = block.hint?.trim();
255
+ const raw = block.raw?.trim();
256
+ if (raw && hint) {
257
+ return `> _Unrecognized block:_ ${hint}
258
+ >
259
+ ${quoteMarkdownLines(raw, "> ")}`;
260
+ }
261
+ if (raw) {
262
+ return raw.includes("\n") ? quoteMarkdownLines(raw, "> ") : raw;
263
+ }
264
+ if (hint) {
265
+ return `> _Unrecognized:_ ${hint}`;
266
+ }
267
+ return "<!-- unknown block -->";
268
+ }
269
+ function blockToMarkdown(block, tables, images, imagePlaceholder, imageMissingSrcMode, orderedDepthCounters, resetListState, pageSep) {
203
270
  switch (block.type) {
204
271
  case "heading": {
205
272
  resetListState();
206
273
  const level = clampHeadingLevel(block.level);
207
274
  const hashes = "#".repeat(level);
208
- return `${hashes} ${block.text.trim()}`;
275
+ const t = blockText(block.text);
276
+ if (t.length === 0) return "";
277
+ return `${hashes} ${t}`;
209
278
  }
210
279
  case "paragraph": {
211
280
  resetListState();
212
- return block.text.trim();
281
+ return blockText(block.text);
213
282
  }
214
283
  case "list-item":
215
- return listItemLine(block, orderedDepthCounters);
284
+ return blockText(block.text).length === 0 ? "" : listItemLine(block, orderedDepthCounters);
216
285
  case "table": {
217
286
  resetListState();
218
- const t = resolveTable(tables, block.tableId);
287
+ const tid = safeString(block.tableId);
288
+ if (!tid) {
289
+ return `<!-- table block: missing tableId -->`;
290
+ }
291
+ const t = resolveTable(tables, tid);
219
292
  if (!t) {
220
- return `<!-- table not found: ${escapeCommentText(block.tableId)} -->`;
293
+ return `<!-- table not found: ${escapeCommentText(tid)} -->`;
221
294
  }
222
295
  return tableToMarkdown(t).trimEnd();
223
296
  }
224
297
  case "image-ref": {
225
298
  resetListState();
226
- const img = resolveImage(images, block.imageId);
299
+ const iid = safeString(block.imageId);
300
+ if (!iid) {
301
+ return `<!-- image-ref: missing imageId -->`;
302
+ }
303
+ const img = resolveImage(images, iid);
227
304
  const altRaw = block.alt ?? img?.alt ?? "";
228
305
  const alt = altRaw.replace(/]/g, "\\]");
229
306
  const src = img?.src;
230
307
  if (src) return `![${alt}](${src})`;
231
- const kind = img?.kind;
232
- const hint = kind === "placeholder" ? " (placeholder)" : kind === "embedded" ? " (embedded)" : "";
233
- return `${imagePlaceholder}${hint}`;
308
+ return imageRefWithoutSrcMarkdown(
309
+ iid,
310
+ altRaw.trim(),
311
+ imagePlaceholder,
312
+ imageMissingSrcMode,
313
+ img?.kind
314
+ );
234
315
  }
235
316
  case "page-break": {
236
317
  resetListState();
@@ -238,10 +319,7 @@ function blockToMarkdown(block, tables, images, imagePlaceholder, orderedDepthCo
238
319
  }
239
320
  case "unknown": {
240
321
  resetListState();
241
- const raw = block.raw?.trim();
242
- if (raw) return raw;
243
- if (block.hint) return `<!-- ${escapeCommentText(block.hint)} -->`;
244
- return "<!-- unknown block -->";
322
+ return unknownBlockToMarkdown(block);
245
323
  }
246
324
  default: {
247
325
  const _exhaustive = block;
@@ -253,7 +331,7 @@ function structuredDocumentToMarkdown(structured, options) {
253
331
  return convertStructuredToMarkdown(structured, options);
254
332
  }
255
333
 
256
- // src/structuredToLlmText.ts
334
+ // src/llm-text.ts
257
335
  function clampHeadingLevel2(level) {
258
336
  if (level === void 0 || !Number.isFinite(level)) return 2;
259
337
  const n = Math.floor(level);
@@ -261,6 +339,13 @@ function clampHeadingLevel2(level) {
261
339
  if (n > 6) return 6;
262
340
  return n;
263
341
  }
342
+ function sanitizeNoiseChars(s) {
343
+ return s.replace(/[\u200B-\u200D\uFEFF\u2060]/g, "").replace(/\u00A0/g, " ").replace(/[ \t\f\v]+/g, " ").trim();
344
+ }
345
+ function sanitizeLineOriented(text, enabled) {
346
+ if (!enabled) return text;
347
+ return text.split("\n").map((line) => sanitizeNoiseChars(line)).join("\n");
348
+ }
264
349
  function safeArrays2(result) {
265
350
  return {
266
351
  blocks: Array.isArray(result.blocks) ? result.blocks : [],
@@ -296,17 +381,45 @@ function metadataDocBlock(meta, extraMax) {
296
381
  return `[DOC]
297
382
  ${lines.join("\n")}`;
298
383
  }
299
- function tableToLlmBlock(table, tag) {
384
+ function cellToLlmSegment(text, sanitize) {
385
+ const one = text.replace(/\r?\n/g, " ").replace(/\|/g, "\xB7").replace(/\s+/g, " ").trim();
386
+ return sanitize ? sanitizeNoiseChars(one) : one;
387
+ }
388
+ function expandTableRowForLlm(row) {
389
+ const out = [];
390
+ for (const cell of row) {
391
+ const base = cell.text;
392
+ const rs = cell.rowSpan !== void 0 && Number.isFinite(cell.rowSpan) ? Math.max(1, Math.floor(cell.rowSpan)) : 1;
393
+ const cs = cell.colSpan !== void 0 && Number.isFinite(cell.colSpan) ? Math.max(1, Math.floor(cell.colSpan)) : 1;
394
+ const note = rs > 1 ? `${base} (rows\xD7${rs})` : base;
395
+ out.push(note);
396
+ for (let i = 1; i < cs; i++) out.push("");
397
+ }
398
+ return out;
399
+ }
400
+ function tableHeaderRuleLine(nCols, glue) {
401
+ const unit = "---";
402
+ return Array.from({ length: Math.max(1, nCols) }, () => unit).join(glue);
403
+ }
404
+ function tableToLlmBlock(table, tag, glue, headerSep, sanitize) {
300
405
  const lines = [];
301
406
  lines.push(`${tag} id=${table.id}`);
302
- if (table.caption) lines.push(`Caption: ${table.caption}`);
407
+ if (table.caption) {
408
+ lines.push(`Caption: ${sanitize ? sanitizeNoiseChars(table.caption) : table.caption}`);
409
+ }
303
410
  const rows = table.rows;
304
411
  if (rows.length === 0) {
305
412
  lines.push("(empty table)");
306
413
  return lines.join("\n");
307
414
  }
308
- for (const row of rows) {
309
- lines.push(row.map((c) => c.text.replace(/\r?\n/g, " ").trim()).join(" | "));
415
+ const expanded = rows.map((r) => expandTableRowForLlm(r));
416
+ for (let ri = 0; ri < expanded.length; ri++) {
417
+ const row = expanded[ri];
418
+ const rendered = row.map((c) => cellToLlmSegment(c, sanitize)).join(glue);
419
+ lines.push(rendered);
420
+ if (headerSep && ri === 0 && expanded.length > 1) {
421
+ lines.push(tableHeaderRuleLine(row.length, glue));
422
+ }
310
423
  }
311
424
  return lines.join("\n");
312
425
  }
@@ -320,11 +433,12 @@ function referencedTableIds2(blocks) {
320
433
  }
321
434
  return ids;
322
435
  }
323
- function listItemLine2(block, orderedDepthCounters) {
436
+ function listItemLine2(block, orderedDepthCounters, sanitize) {
324
437
  const depth = Math.max(0, block.depth ?? 0);
325
438
  const indent = " ".repeat(depth);
326
- const style = block.listStyle ?? "unordered";
327
- const text = block.text.replace(/\r?\n/g, " ").trim();
439
+ const raw = block.text.replace(/\r?\n/g, " ").trim();
440
+ const text = sanitize ? sanitizeNoiseChars(raw) : raw.replace(/\s+/g, " ").trim();
441
+ const style = block.listStyle === "ordered" ? "ordered" : "unordered";
328
442
  if (style === "ordered") {
329
443
  while (orderedDepthCounters.length <= depth) orderedDepthCounters.push(0);
330
444
  orderedDepthCounters.length = depth + 1;
@@ -335,31 +449,56 @@ function listItemLine2(block, orderedDepthCounters) {
335
449
  orderedDepthCounters.length = depth;
336
450
  return `${indent}\u2022 ${text}`;
337
451
  }
338
- function blockToLlm(block, tables, images, tableTag, imageTag, pageMarker, orderedDepthCounters, resetListState, skipEmptyParagraphs) {
452
+ function unknownToLlm(block, sanitize) {
453
+ const hint = block.hint?.trim();
454
+ const raw = block.raw?.trim();
455
+ if (raw && hint) {
456
+ const h = sanitize ? sanitizeNoiseChars(hint) : hint;
457
+ const body = raw.split(/\r?\n/).map((ln) => sanitize ? sanitizeNoiseChars(ln) : ln.trimEnd()).join("\n");
458
+ return `[UNKNOWN] ${h}
459
+ ${body.split("\n").map((l) => ` ${l}`).join("\n")}`;
460
+ }
461
+ if (raw) {
462
+ return raw.split(/\r?\n/).map((ln) => sanitize ? sanitizeNoiseChars(ln) : ln.trimEnd()).join("\n");
463
+ }
464
+ if (hint) {
465
+ return `[UNKNOWN] ${sanitize ? sanitizeNoiseChars(hint) : hint}`;
466
+ }
467
+ return "[UNKNOWN]";
468
+ }
469
+ function blockToLlm(block, tables, images, tableTag, imageTag, pageMarker, orderedDepthCounters, resetListState, skipEmptyParagraphs, glue, headerSep, sanitize) {
339
470
  switch (block.type) {
340
471
  case "heading": {
341
472
  resetListState();
342
473
  const lv = clampHeadingLevel2(block.level);
343
- return `[H${lv}] ${block.text.replace(/\r?\n/g, " ").trim()}`;
474
+ const t = block.text.replace(/\r?\n/g, " ").trim();
475
+ if (t.length === 0) return void 0;
476
+ const text = sanitize ? sanitizeNoiseChars(t) : t.replace(/\s+/g, " ").trim();
477
+ return `[H${lv}] ${text}`;
344
478
  }
345
479
  case "paragraph": {
346
480
  resetListState();
347
481
  const t = block.text.trim();
348
482
  if (t.length === 0 && skipEmptyParagraphs) return void 0;
349
- return t.replace(/\r?\n/g, " ").replace(/\s+/g, " ").trim();
483
+ const flat = t.replace(/\r?\n/g, " ").replace(/\s+/g, " ").trim();
484
+ return sanitize ? sanitizeNoiseChars(flat) : flat;
485
+ }
486
+ case "list-item": {
487
+ const t = block.text.trim();
488
+ if (t.length === 0) return void 0;
489
+ return listItemLine2(block, orderedDepthCounters, sanitize);
350
490
  }
351
- case "list-item":
352
- return listItemLine2(block, orderedDepthCounters);
353
491
  case "table": {
354
492
  resetListState();
355
493
  const t = resolveTable2(tables, block.tableId);
356
494
  if (!t) return `${tableTag} MISSING id=${block.tableId}`;
357
- return tableToLlmBlock(t, tableTag);
495
+ return tableToLlmBlock(t, tableTag, glue, headerSep, sanitize);
358
496
  }
359
497
  case "image-ref": {
360
498
  resetListState();
361
499
  const img = images.find((i) => i.id === block.imageId);
362
- const alt = (block.alt ?? img?.alt ?? "").replace(/\r?\n/g, " ").trim();
500
+ const altRaw = (block.alt ?? img?.alt ?? "").replace(/\r?\n/g, " ").trim();
501
+ const alt = sanitize ? sanitizeNoiseChars(altRaw) : altRaw;
363
502
  if (img?.src) {
364
503
  return `${imageTag} alt=${JSON.stringify(alt)} url=${JSON.stringify(img.src)}`;
365
504
  }
@@ -371,9 +510,7 @@ function blockToLlm(block, tables, images, tableTag, imageTag, pageMarker, order
371
510
  }
372
511
  case "unknown": {
373
512
  resetListState();
374
- const raw = block.raw?.trim();
375
- if (raw) return raw.replace(/\r?\n/g, "\n");
376
- return block.hint ? `[UNKNOWN: ${block.hint}]` : "[UNKNOWN]";
513
+ return unknownToLlm(block, sanitize);
377
514
  }
378
515
  default: {
379
516
  const _exhaustive = block;
@@ -393,6 +530,9 @@ function convertStructuredToLlmText(result, options) {
393
530
  const appendOrphanTables = options?.appendUnreferencedTables !== false;
394
531
  const compact = options?.compact === true;
395
532
  const skipEmptyParagraphs = options?.skipEmptyParagraphs !== false;
533
+ const sanitize = options?.sanitizeNoise !== false;
534
+ const headerSep = options?.tableHeaderSeparator !== false;
535
+ const glue = options?.tableColumnSeparator ?? " | ";
396
536
  const sep = compact ? "\n" : "\n\n";
397
537
  const { blocks, tables, pages, images, warnings } = safeArrays2(result);
398
538
  const hasPageModel = pages.length > 0;
@@ -437,7 +577,10 @@ function convertStructuredToLlmText(result, options) {
437
577
  pageMarker,
438
578
  orderedDepthCounters,
439
579
  resetListState,
440
- skipEmptyParagraphs
580
+ skipEmptyParagraphs,
581
+ glue,
582
+ headerSep,
583
+ sanitize
441
584
  );
442
585
  if (chunk !== void 0 && chunk.length > 0) parts.push(chunk);
443
586
  }
@@ -447,16 +590,18 @@ function convertStructuredToLlmText(result, options) {
447
590
  if (orphans.length > 0) {
448
591
  parts.push(`[MORE_TABLES]`);
449
592
  for (const t of orphans) {
450
- parts.push(tableToLlmBlock(t, tableTag));
593
+ parts.push(tableToLlmBlock(t, tableTag, glue, headerSep, sanitize));
451
594
  }
452
595
  }
453
596
  }
454
597
  let out = parts.join(sep).replace(/\n{3,}/g, "\n\n").trim();
598
+ out = sanitizeLineOriented(out, sanitize);
455
599
  if (out.length === 0 && fallback) {
456
600
  out = typeof result.text === "string" ? result.text.trim() : "";
601
+ out = sanitizeLineOriented(out, sanitize);
457
602
  }
458
603
  if (includeWarnings && warnings.length > 0) {
459
- const warnLines = warnings.map((w) => `- ${String(w).replace(/\r?\n/g, " ")}`).join("\n");
604
+ const warnLines = warnings.map((w) => `- ${sanitizeNoiseChars(String(w).replace(/\r?\n/g, " "))}`).join("\n");
460
605
  const block = `[WARNINGS]
461
606
  ${warnLines}`;
462
607
  out = out ? `${out}${sep}${block}` : block;
@@ -467,7 +612,7 @@ function structuredDocumentToLlmText(structured, options) {
467
612
  return convertStructuredToLlmText(structured, options);
468
613
  }
469
614
 
470
- // src/splitStructuredIntoChunks.ts
615
+ // src/chunking.ts
471
616
  var SLICE_MARKDOWN_OPTS = {
472
617
  includeMetadataHeader: false,
473
618
  pageTransitionMarkers: false,
@@ -479,7 +624,9 @@ var SLICE_LLM_OPTS = {
479
624
  includeDocumentMetadata: false,
480
625
  includeWarnings: false,
481
626
  pageTransitionMarkers: false,
482
- appendUnreferencedTables: false
627
+ appendUnreferencedTables: false,
628
+ tableHeaderSeparator: true,
629
+ sanitizeNoise: true
483
630
  };
484
631
  function clampHeadingLevel3(level) {
485
632
  if (level === void 0 || !Number.isFinite(level)) return 2;
@@ -504,15 +651,83 @@ function renderSlice(result, block, includeMarkdown) {
504
651
  function joinChunkParts(parts) {
505
652
  return parts.map((p) => p.trim()).filter((p) => p.length > 0).join("\n\n");
506
653
  }
654
+ function pageSpanLabelFromRange(minP, maxP) {
655
+ if (minP === void 0) return void 0;
656
+ const a = minP + 1;
657
+ if (maxP === void 0 || maxP === minP) return String(a);
658
+ const b = maxP + 1;
659
+ return `${a}\u2013${b}`;
660
+ }
507
661
  function safeBlocks(result) {
508
662
  return Array.isArray(result.blocks) ? result.blocks : [];
509
663
  }
664
+ function packUnitsIntoChunks(units, options) {
665
+ const {
666
+ maxChars,
667
+ overlapChars,
668
+ preferHeadings,
669
+ preserveTables,
670
+ includeMarkdown,
671
+ includePageSpanLabel
672
+ } = options;
673
+ const chunks = [];
674
+ let current = [];
675
+ let pendingTextPrefix = "";
676
+ function projectedTextLength(next) {
677
+ const body = joinChunkParts(current.map((u) => u.text).concat(next.text));
678
+ const full = pendingTextPrefix ? `${pendingTextPrefix}${body.length > 0 ? "\n\n" + body : ""}` : body;
679
+ return full.length;
680
+ }
681
+ function flush() {
682
+ if (current.length === 0) return;
683
+ const body = joinChunkParts(current.map((u) => u.text));
684
+ const text = pendingTextPrefix ? `${pendingTextPrefix}${body.length > 0 ? "\n\n" + body : ""}`.trim() : body.trim();
685
+ const markdown = includeMarkdown && current.length > 0 ? joinChunkParts(current.map((u) => u.md)).trim() : void 0;
686
+ const pages = current.map((u) => u.pageIndex).filter((n) => n !== void 0);
687
+ const pageIndex = pages.length > 0 ? Math.min(...pages) : void 0;
688
+ const pageEndIndex = pages.length > 0 ? Math.max(...pages) : void 0;
689
+ const headingPath = current.length > 0 ? current[current.length - 1].headingPath : void 0;
690
+ const pageSpanLabel = includePageSpanLabel && pageIndex !== void 0 ? pageSpanLabelFromRange(pageIndex, pageEndIndex) : void 0;
691
+ if (text.length > 0 || markdown && markdown.length > 0) {
692
+ chunks.push({
693
+ index: chunks.length,
694
+ text,
695
+ markdown: markdown && markdown.length > 0 ? markdown : void 0,
696
+ headingPath: headingPath && headingPath.length > 0 ? [...headingPath] : void 0,
697
+ pageIndex,
698
+ pageEndIndex,
699
+ pageSpanLabel
700
+ });
701
+ }
702
+ pendingTextPrefix = overlapChars > 0 && text.length > 0 ? text.slice(Math.max(0, text.length - overlapChars)).trimStart() : "";
703
+ current = [];
704
+ }
705
+ for (let i = 0; i < units.length; i++) {
706
+ const unit = units[i];
707
+ if (preferHeadings && unit.isHeading && current.length > 0) {
708
+ flush();
709
+ }
710
+ if (preserveTables && unit.isTable && unit.text.length > maxChars) {
711
+ if (current.length > 0) flush();
712
+ current = [unit];
713
+ flush();
714
+ continue;
715
+ }
716
+ if (current.length > 0 && projectedTextLength(unit) > maxChars) {
717
+ flush();
718
+ }
719
+ current.push(unit);
720
+ }
721
+ flush();
722
+ return chunks;
723
+ }
510
724
  function splitStructuredIntoChunks(result, options) {
511
725
  const maxChars = Math.max(1, options?.maxChars ?? 4e3);
512
726
  const overlapChars = Math.max(0, options?.overlapChars ?? 0);
513
727
  const preferHeadings = options?.preferHeadings !== false;
514
728
  const preserveTables = options?.preserveTables !== false;
515
729
  const includeMarkdown = options?.includeMarkdown !== false;
730
+ const includePageSpanLabel = options?.includePageSpanLabel !== false;
516
731
  const blocks = safeBlocks(result);
517
732
  if (blocks.length === 0) {
518
733
  const text = convertStructuredToLlmText(result, {
@@ -532,7 +747,8 @@ function splitStructuredIntoChunks(result, options) {
532
747
  markdown: md && md.length > 0 ? md : void 0,
533
748
  headingPath: void 0,
534
749
  pageIndex: void 0,
535
- pageEndIndex: void 0
750
+ pageEndIndex: void 0,
751
+ pageSpanLabel: void 0
536
752
  }
537
753
  ];
538
754
  }
@@ -558,54 +774,15 @@ function splitStructuredIntoChunks(result, options) {
558
774
  headingPath
559
775
  });
560
776
  }
561
- const chunks = [];
562
- let current = [];
563
- let pendingTextPrefix = "";
564
- function projectedTextLength(next) {
565
- const body = joinChunkParts(current.map((u) => u.text).concat(next.text));
566
- const full = pendingTextPrefix ? `${pendingTextPrefix}${body.length > 0 ? "\n\n" + body : ""}` : body;
567
- return full.length;
568
- }
569
- function flush() {
570
- if (current.length === 0) return;
571
- const body = joinChunkParts(current.map((u) => u.text));
572
- const text = pendingTextPrefix ? `${pendingTextPrefix}${body.length > 0 ? "\n\n" + body : ""}`.trim() : body.trim();
573
- const markdown = includeMarkdown && current.length > 0 ? joinChunkParts(current.map((u) => u.md)).trim() : void 0;
574
- const pages = current.map((u) => u.pageIndex).filter((n) => n !== void 0);
575
- const pageIndex = pages.length > 0 ? Math.min(...pages) : void 0;
576
- const pageEndIndex = pages.length > 0 ? Math.max(...pages) : void 0;
577
- const headingPath = current.length > 0 ? current[current.length - 1].headingPath : void 0;
578
- if (text.length > 0 || markdown && markdown.length > 0) {
579
- chunks.push({
580
- index: chunks.length,
581
- text,
582
- markdown: markdown && markdown.length > 0 ? markdown : void 0,
583
- headingPath: headingPath && headingPath.length > 0 ? [...headingPath] : void 0,
584
- pageIndex,
585
- pageEndIndex
586
- });
587
- }
588
- pendingTextPrefix = overlapChars > 0 && text.length > 0 ? text.slice(Math.max(0, text.length - overlapChars)).trimStart() : "";
589
- current = [];
590
- }
591
- for (let i = 0; i < units.length; i++) {
592
- const unit = units[i];
593
- if (preferHeadings && unit.isHeading && current.length > 0) {
594
- flush();
595
- }
596
- if (preserveTables && unit.isTable && unit.text.length > maxChars) {
597
- if (current.length > 0) flush();
598
- current = [unit];
599
- flush();
600
- continue;
601
- }
602
- if (current.length > 0 && projectedTextLength(unit) > maxChars) {
603
- flush();
604
- }
605
- current.push(unit);
606
- }
607
- flush();
608
- if (chunks.length === 0) {
777
+ const packed = packUnitsIntoChunks(units, {
778
+ maxChars,
779
+ overlapChars,
780
+ preferHeadings,
781
+ preserveTables,
782
+ includeMarkdown,
783
+ includePageSpanLabel
784
+ });
785
+ if (packed.length === 0) {
609
786
  return [
610
787
  {
611
788
  index: 0,
@@ -613,12 +790,14 @@ function splitStructuredIntoChunks(result, options) {
613
790
  markdown: void 0,
614
791
  headingPath: void 0,
615
792
  pageIndex: void 0,
616
- pageEndIndex: void 0
793
+ pageEndIndex: void 0,
794
+ pageSpanLabel: void 0
617
795
  }
618
796
  ];
619
797
  }
620
- return chunks.map((c, i) => ({ ...c, index: i }));
798
+ return packed.map((c, i) => ({ ...c, index: i }));
621
799
  }
800
+ var extractStructuredChunks = splitStructuredIntoChunks;
622
801
 
623
802
  // src/render.ts
624
803
  function renderMarkdown(result, options) {
@@ -627,6 +806,9 @@ function renderMarkdown(result, options) {
627
806
  function renderLlmText(result, options) {
628
807
  return convertStructuredToLlmText(result, options);
629
808
  }
809
+ function extractLlmContent(result, options) {
810
+ return renderLlmText(result, options);
811
+ }
630
812
  function renderMarkdownSections(result, options) {
631
813
  const chunks = splitStructuredIntoChunks(result, {
632
814
  ...options,
@@ -638,10 +820,548 @@ function renderMarkdownSections(result, options) {
638
820
  headingPath: c.headingPath,
639
821
  pageIndex: c.pageIndex,
640
822
  pageEndIndex: c.pageEndIndex,
823
+ pageSpanLabel: c.pageSpanLabel,
641
824
  text: c.text.trim().length > 0 ? c.text.trim() : void 0
642
825
  }));
643
826
  }
644
827
 
645
- export { convertStructuredToLlmText, convertStructuredToMarkdown, renderLlmText, renderMarkdown, renderMarkdownSections, splitStructuredIntoChunks, structuredDocumentToLlmText, structuredDocumentToMarkdown };
828
+ // src/dynamic-import-runtime.ts
829
/**
 * Report whether the code is running under Node.js.
 *
 * @returns {boolean} true when a global `process` exists and `process.versions.node` is a string.
 */
function isNodeJsRuntime() {
  if (typeof process === "undefined") return false;
  const versions = process.versions;
  if (versions == null) return false;
  return typeof versions.node === "string";
}
832
/**
 * Dynamically import a module by id.
 *
 * Under Node.js a plain `import()` expression is used. Elsewhere the import is
 * issued from a function built with `new Function`, which needs a runtime that
 * permits dynamic code evaluation.
 * NOTE(review): the indirection is presumably there to keep bundlers from
 * statically resolving the specifier — confirm against the TypeScript source.
 *
 * @param moduleId Module specifier handed to `import()`.
 * @returns {Promise<object>} The module namespace promise.
 */
function importEsm(moduleId) {
  if (!isNodeJsRuntime()) {
    const indirectImport = new Function("id", "return import(id)");
    return indirectImport(moduleId);
  }
  return import(moduleId);
}
842
+
843
+ // src/node-runtime.ts
844
/**
 * Report whether a Node.js `process` global is present.
 *
 * @returns {boolean} true when `process.versions.node` is a string.
 */
function isNodeRuntime() {
  if (typeof process === "undefined") return false;
  const nodeVersion = process.versions?.node;
  return typeof nodeVersion === "string";
}
847
/**
 * Guard for Node-only features.
 *
 * @param capability Human-readable feature name interpolated into the error message.
 * @throws {Error} When `isNodeRuntime()` is false.
 */
function assertNodeRuntime(capability) {
  if (isNodeRuntime()) return;
  const message = `@dragon708/docmind-markdown: ${capability} is only available in Node.js.`;
  throw new Error(message);
}
854
+
855
+ // src/docx-markdown.ts
856
// HTML snippet substituted for each removed table.
var TABLE_OMITTED_HTML = "\n<p><em>(Table omitted)</em></p>\n";

/**
 * Reduce conversion messages to their `type` and `message` fields.
 *
 * @param messages Array of message objects (as returned in `htmlResult.messages`).
 * @returns {{type: *, message: *}[]} New array of two-field objects, same order.
 */
function normalizeMammothMessages(messages) {
  const keepPublicFields = (entry) => {
    const { type, message } = entry;
    return { type, message };
  };
  return messages.map(keepPublicFields);
}
860
/**
 * Coerce binary input into a Node.js `Buffer`.
 *
 * An existing Buffer is returned as-is; anything else goes through `Buffer.from`.
 * The `Buffer` class is loaded lazily from `node:buffer`.
 *
 * @param input Buffer, ArrayBuffer, or other value accepted by `Buffer.from`.
 * @returns {Promise<Buffer>}
 */
async function toNodeBuffer(input) {
  const bufferModule = await importEsm("node:buffer");
  const NodeBuffer = bufferModule.Buffer;
  return NodeBuffer.isBuffer(input) ? input : NodeBuffer.from(input);
}
866
/**
 * Replace every top-level `<table>…</table>` element with `TABLE_OMITTED_HTML`.
 *
 * Tags are scanned with a depth counter so a table nested inside another table
 * is removed together with its outermost ancestor. (A single lazy regex would
 * stop at the first `</table>` and leave the outer table's closing markup behind.)
 *
 * @param {string} html HTML to filter.
 * @returns {string} HTML with each outermost table replaced by the placeholder.
 *   An opening tag that is never closed is left untouched.
 */
function stripTablesFromHtml(html) {
  const tableTag = /<table\b[^>]*>|<\/table>/gi;
  let output = "";
  let depth = 0;
  let cursor = 0; // start of the text not yet copied to `output`
  let outerStart = 0; // index of the currently open outermost <table>
  let match;
  while ((match = tableTag.exec(html)) !== null) {
    const isOpening = match[0][1] !== "/";
    if (isOpening) {
      if (depth === 0) {
        output += html.slice(cursor, match.index);
        outerStart = match.index;
      }
      depth += 1;
    } else if (depth > 0) {
      depth -= 1;
      if (depth === 0) {
        output += TABLE_OMITTED_HTML;
        cursor = tableTag.lastIndex;
      }
    }
    // A stray closing tag at depth 0 stays in place as ordinary text.
  }
  // Unbalanced input: keep everything from the unclosed outer tag verbatim.
  output += depth > 0 ? html.slice(outerStart) : html.slice(cursor);
  return output;
}
869
/**
 * Remove every `<img …>` tag from an HTML string.
 *
 * @param {string} html HTML to filter.
 * @returns {string} HTML without image tags.
 */
function stripImagesFromHtml(html) {
  const imgTag = /<img\b[^>]*\/?>/gi;
  const withoutImages = html.replace(imgTag, "");
  return withoutImages;
}
872
/**
 * Drop `<hr>` tags whose markup contains the word `page-break`; other rules are kept.
 *
 * @param {string} html HTML to filter.
 * @returns {string} HTML without page-break rules.
 */
function stripPageBreakHrsFromHtml(html) {
  const dropIfPageBreak = (hrTag) => {
    if (/\bpage-break\b/i.test(hrTag)) return "";
    return hrTag;
  };
  return html.replace(/<hr\b[^>]*>/gi, dropIfPageBreak);
}
878
/**
 * Build the style-map list handed to the DOCX converter.
 *
 * @param includePageBreaks When truthy, a rule mapping page breaks to `hr.page-break` comes first.
 * @param user Caller style map: a single string, an array of strings, or anything else (ignored).
 * @returns {string[]|undefined} Combined rules, or undefined when there are none.
 */
function mergeStyleMaps(includePageBreaks, user) {
  const rules = includePageBreaks ? ["br[type=page] => hr.page-break"] : [];
  if (typeof user === "string") {
    rules.push(user);
  } else if (Array.isArray(user)) {
    for (const rule of user) rules.push(rule);
  }
  return rules.length > 0 ? rules : void 0;
}
888
/**
 * Compact Markdown: strip trailing spaces/tabs per line, collapse runs of three
 * or more newlines to a single blank line, and trim the result.
 *
 * @param {string} markdown
 * @returns {string}
 */
function applyCompactMarkdown(markdown) {
  const lines = markdown.split("\n");
  const rightTrimmed = lines.map((line) => line.replace(/[ \t]+$/g, ""));
  const collapsed = rightTrimmed.join("\n").replace(/\n{3,}/g, "\n\n");
  return collapsed.trim();
}
891
/**
 * Decide whether converted Markdown is too thin to keep.
 *
 * @param {string} markdown Converted Markdown.
 * @param {number} [minLen] Minimum trimmed length; ignored when undefined or not positive.
 * @returns {"empty"|"short"|null} "empty" for blank output, "short" when below `minLen`, else null.
 */
function shouldTryStructuredFallback(markdown, minLen) {
  const trimmedLength = markdown.trim().length;
  if (trimmedLength === 0) return "empty";
  const hasThreshold = minLen !== void 0 && minLen > 0;
  return hasThreshold && trimmedLength < minLen ? "short" : null;
}
897
/**
 * Default HTML→Markdown converter options: ATX headings, fenced code blocks, `-` bullets.
 *
 * @returns {{headingStyle: string, codeBlockStyle: string, bulletListMarker: string}} A fresh object per call.
 */
function buildTurndownBaseOptions() {
  const defaults = {};
  defaults.headingStyle = "atx";
  defaults.codeBlockStyle = "fenced";
  defaults.bulletListMarker = "-";
  return defaults;
}
904
/**
 * Convert DOCX bytes to Markdown: DOCX → HTML via Mammoth, then HTML → Markdown via Turndown.
 *
 * When the direct route throws, or yields empty/too-short Markdown, and
 * `options.resolveStructured` is supplied, the structured document it resolves
 * to is rendered with `convertStructuredToMarkdown` instead.
 *
 * The structured fallback is attempted at most once: only the direct conversion
 * is wrapped in the `try`, so a failing `resolveStructured` is neither retried
 * nor misreported as a direct-conversion failure.
 *
 * @param input DOCX bytes (anything `toNodeBuffer` accepts).
 * @param [options] Recognised fields: `includeTables`, `includeImages`,
 *   `includePageBreaks` (each on unless strictly `false`), `compactMode`
 *   (on only when strictly `true`), `minMarkdownLength`, `resolveStructured`,
 *   `structuredMarkdown`, `mammoth` (extra converter options incl. `styleMap`),
 *   `turndown` (overrides for the Markdown serializer).
 * @returns {Promise<{markdown: string, source: string, messages: object[], fallbackReason?: string}>}
 *   `source` is "mammoth-turndown" or "structured-fallback".
 * @throws {Error} Outside Node.js; when the direct conversion fails and no
 *   `resolveStructured` was given (the original error is rethrown); or when the
 *   structured fallback itself fails.
 */
async function convertDocxToMarkdown(input, options) {
  assertNodeRuntime("DOCX \u2192 Markdown (Mammoth \u2192 Turndown)");
  const includeTables = options?.includeTables !== false;
  const includeImages = options?.includeImages !== false;
  const includePageBreaks = options?.includePageBreaks !== false;
  const compactMode = options?.compactMode === true;
  const minMarkdownLength = options?.minMarkdownLength;
  const resolveStructured = options?.resolveStructured;
  const structuredMdOpts = options?.structuredMarkdown;

  // Load the converters and normalise the input concurrently; the GFM plugin is
  // only needed when tables are kept.
  const [{ default: mammoth }, { default: TurndownService }, { gfm }, buffer] = await Promise.all([
    importEsm("mammoth"),
    importEsm("turndown"),
    includeTables ? importEsm("turndown-plugin-gfm") : Promise.resolve({ gfm: null }),
    toNodeBuffer(input)
  ]);

  const styleMap = mergeStyleMaps(includePageBreaks, options?.mammoth?.styleMap);
  const mammothOpts = {
    ...options?.mammoth,
    ...(styleMap !== void 0 ? { styleMap } : {})
  };
  if (includeImages && mammothOpts.convertImage === void 0) {
    mammothOpts.convertImage = mammoth.images.dataUri;
  }

  // Direct route: DOCX → HTML → (optional stripping) → Markdown.
  const runDirect = async () => {
    const htmlResult = await mammoth.convertToHtml({ buffer }, mammothOpts);
    let html = htmlResult.value;
    if (!includeTables) html = stripTablesFromHtml(html);
    if (!includeImages) html = stripImagesFromHtml(html);
    if (!includePageBreaks) html = stripPageBreakHrsFromHtml(html);
    const service = new TurndownService({
      ...buildTurndownBaseOptions(),
      ...options?.turndown
    });
    if (includeTables && gfm) {
      gfm(service);
    }
    let markdown = service.turndown(html).trim();
    if (compactMode) markdown = applyCompactMarkdown(markdown);
    return {
      markdown,
      messages: normalizeMammothMessages(htmlResult.messages)
    };
  };

  // Fallback route: render the caller-provided structured document.
  const runFallback = async (reason, priorMessages, err) => {
    if (!resolveStructured) {
      if (reason === "error" && err !== void 0) throw err;
      return {
        markdown: "",
        source: "mammoth-turndown",
        messages: priorMessages,
        fallbackReason: reason
      };
    }
    const structured = await resolveStructured();
    const md = convertStructuredToMarkdown(structured, structuredMdOpts);
    const extra = [];
    if (reason === "error" && err !== void 0) {
      extra.push({
        type: "warning",
        message: `DOCX direct conversion failed; used structured fallback: ${String(err)}`
      });
    }
    return {
      markdown: compactMode ? applyCompactMarkdown(md) : md.trim(),
      source: "structured-fallback",
      messages: [...priorMessages, ...extra],
      fallbackReason: reason
    };
  };

  let direct;
  try {
    direct = await runDirect();
  } catch (err) {
    return await runFallback("error", [], err);
  }

  const { markdown, messages } = direct;
  const insufficiency = shouldTryStructuredFallback(markdown, minMarkdownLength);
  if (insufficiency && resolveStructured) {
    // Deliberately outside the try above: errors here propagate unchanged.
    return await runFallback(insufficiency, messages);
  }
  return { markdown, source: "mammoth-turndown", messages };
}
990
/**
 * Narrow wrapper over `convertDocxToMarkdown` that exposes only the Markdown
 * and the converter messages.
 *
 * @param input   DOCX bytes, forwarded unchanged.
 * @param options Conversion options, forwarded unchanged.
 * @returns {Promise<{markdown: string, messages: object[]}>}
 */
async function convertDocxBufferToMarkdown(input, options) {
  const { markdown, messages } = await convertDocxToMarkdown(input, options);
  return { markdown, messages };
}
994
+
995
+ // src/pdf-markdown.ts
996
// Warning returned to callers that request PDF conversion outside Node.js.
var BROWSER_WARNING = "@dragon708/docmind-markdown: PDF \u2192 Markdown via @opendataloader/pdf requires Node.js. In the browser, use a server-side conversion or supply structured text/Markdown from your backend.";

/**
 * Trim Markdown and, when `clean` is truthy, collapse runs of three or more
 * newlines into a single blank line.
 *
 * @param {string} markdown
 * @param clean Whether to collapse excess blank lines.
 * @returns {string}
 */
function normalizePdfMarkdown(markdown, clean) {
  const trimmed = markdown.trim();
  return clean ? trimmed.replace(/\n{3,}/g, "\n\n") : trimmed;
}
1002
/**
 * Copy of `options` without the keys handled by this module itself
 * (`resolveStructured`, `structuredMarkdown`, `cleanMarkdown`); the remainder is
 * what gets spread into the PDF engine call.
 *
 * @param [options] Caller options; falsy yields an empty object.
 * @returns {object} Shallow copy minus the three wrapper-only keys.
 */
function engineOptions(options) {
  if (!options) return {};
  const passthrough = { ...options };
  delete passthrough.resolveStructured;
  delete passthrough.structuredMarkdown;
  delete passthrough.cleanMarkdown;
  return passthrough;
}
1012
/**
 * Coerce binary input into a Node.js `Buffer` (PDF module's copy of the helper).
 *
 * @param input Buffer (returned unchanged) or any value accepted by `Buffer.from`.
 * @returns {Promise<Buffer>}
 */
async function toNodeBuffer2(input) {
  const { Buffer: NodeBuffer } = await importEsm("node:buffer");
  if (NodeBuffer.isBuffer(input)) {
    return input;
  }
  return NodeBuffer.from(input);
}
1018
/**
 * Convert a PDF (file path or bytes) to Markdown with `@opendataloader/pdf`.
 *
 * Byte input is written to a temporary directory first because the engine is
 * invoked with a file path. The directory is registered for removal as soon as
 * it exists, so it is cleaned up even when writing the file fails.
 *
 * Engine problems do not throw: a missing module, a conversion error, or empty
 * output each add a warning and then either render `options.resolveStructured()`
 * through `convertStructuredToMarkdown` or return empty Markdown.
 *
 * @param {string|*} input Path to a PDF, or bytes accepted by `toNodeBuffer2`.
 * @param [options] `cleanMarkdown` (on unless strictly `false`), `resolveStructured`,
 *   `structuredMarkdown`; every other key is forwarded to the engine.
 * @returns {Promise<{markdown: string, warnings: string[], source: string, fallbackReason?: string}>}
 *   `source` is "opendataloader", "structured-fallback", or "unsupported-runtime".
 * @throws Errors from creating or writing the temporary file propagate.
 */
async function convertPdfToMarkdown(input, options) {
  const clean = options?.cleanMarkdown !== false;
  const resolveStructured = options?.resolveStructured;
  const structuredMdOpts = options?.structuredMarkdown;
  const eng = engineOptions(options);
  if (!isNodeRuntime()) {
    return {
      markdown: "",
      warnings: [BROWSER_WARNING],
      source: "unsupported-runtime",
      fallbackReason: "unsupported-runtime"
    };
  }

  const warnings = [];
  const describeError = (e) => (e instanceof Error ? e.message : String(e));

  // Shared tail of every failure path: try the structured fallback when one is
  // available, otherwise (or when it fails too) return empty engine output.
  const fallbackOrEmpty = async (reason) => {
    if (resolveStructured) {
      try {
        const structured = await resolveStructured();
        const md = normalizePdfMarkdown(
          convertStructuredToMarkdown(structured, structuredMdOpts),
          clean
        );
        return {
          markdown: md,
          warnings,
          source: "structured-fallback",
          fallbackReason: reason
        };
      } catch (e2) {
        warnings.push(`Structured fallback failed: ${describeError(e2)}`);
      }
    }
    return {
      markdown: "",
      warnings,
      source: "opendataloader",
      fallbackReason: reason
    };
  };

  let cleanup;
  try {
    let inputPath;
    if (typeof input === "string") {
      inputPath = input;
    } else {
      const [{ mkdtemp, writeFile, rm }, { join }, { tmpdir }, buffer] = await Promise.all([
        importEsm("node:fs/promises"),
        importEsm("node:path"),
        importEsm("node:os"),
        toNodeBuffer2(input)
      ]);
      const dir = await mkdtemp(join(tmpdir(), "docmind-markdown-pdf-"));
      // Register removal before writing so a failed write cannot leak the directory.
      cleanup = async () => rm(dir, { recursive: true, force: true });
      inputPath = join(dir, "document.pdf");
      await writeFile(inputPath, buffer);
    }

    let convert;
    try {
      ({ convert } = await importEsm("@opendataloader/pdf"));
    } catch (e) {
      const hint = e instanceof Error && /Cannot find module|MODULE_NOT_FOUND/i.test(e.message) ? " Install `@opendataloader/pdf` in your project." : "";
      warnings.push(`@opendataloader/pdf could not be loaded (${describeError(e)}).${hint}`);
      return await fallbackOrEmpty("module-not-found");
    }

    let rawMarkdown;
    try {
      rawMarkdown = await convert(inputPath, {
        ...eng,
        format: "markdown",
        toStdout: true,
        quiet: eng.quiet !== false
      }).then((s) => String(s));
    } catch (e) {
      warnings.push(`PDF conversion failed: ${describeError(e)}`);
      return await fallbackOrEmpty("error");
    }

    const markdown = normalizePdfMarkdown(rawMarkdown, clean);
    if (markdown.length === 0) {
      warnings.push("OpenDataLoader returned empty Markdown for this PDF.");
      return await fallbackOrEmpty("empty");
    }
    return { markdown, warnings, source: "opendataloader" };
  } finally {
    if (cleanup) {
      // Best-effort: failing to delete the temp directory must not mask the result.
      await cleanup().catch(() => {
      });
    }
  }
}
1158
/**
 * Turn a soft-failure conversion result into an exception for the throwing wrappers.
 *
 * @param r Result with `source`, `warnings`, and `markdown`.
 * @throws {Error} When `source` is "unsupported-runtime", or when the Markdown is
 *   blank and did not come from the structured fallback.
 */
function throwIfLegacyFailure(r) {
  const { source, warnings } = r;
  if (source === "unsupported-runtime") {
    throw new Error(warnings[0] ?? "PDF \u2192 Markdown requires Node.js.");
  }
  const isBlank = r.markdown.trim().length === 0;
  if (!isBlank || source === "structured-fallback") return;
  const detail = warnings.length > 0 ? warnings.join("; ") : "PDF conversion produced no Markdown.";
  throw new Error(detail);
}
1168
/**
 * Throwing variant of `convertPdfToMarkdown` for a PDF file path.
 *
 * @param inputPath Path forwarded to `convertPdfToMarkdown`.
 * @param options   Options forwarded unchanged.
 * @returns {Promise<{markdown: string}>}
 * @throws {Error} Outside Node.js, or when `throwIfLegacyFailure` rejects the result.
 */
async function convertPdfPathToMarkdown(inputPath, options) {
  assertNodeRuntime("PDF \u2192 Markdown (@opendataloader/pdf)");
  const outcome = await convertPdfToMarkdown(inputPath, options);
  throwIfLegacyFailure(outcome);
  const { markdown } = outcome;
  return { markdown };
}
1174
/**
 * Throwing variant of `convertPdfToMarkdown` for in-memory PDF bytes.
 *
 * @param input   Bytes forwarded to `convertPdfToMarkdown`.
 * @param options Options forwarded unchanged.
 * @returns {Promise<{markdown: string}>}
 * @throws {Error} Outside Node.js, or when `throwIfLegacyFailure` rejects the result.
 */
async function convertPdfBufferToMarkdown(input, options) {
  assertNodeRuntime("PDF \u2192 Markdown (@opendataloader/pdf)");
  const converted = await convertPdfToMarkdown(input, options);
  throwIfLegacyFailure(converted);
  return { markdown: converted.markdown };
}
1180
/**
 * Check whether a value is raw binary data: an ArrayBuffer, a Uint8Array, or a
 * Node.js Buffer (the latter two only where those globals exist).
 *
 * @param data Value to test.
 * @returns {boolean}
 */
function isArrayBufferLike(data) {
  return (
    data instanceof ArrayBuffer ||
    (typeof Uint8Array !== "undefined" && data instanceof Uint8Array) ||
    (typeof Buffer !== "undefined" && Buffer.isBuffer(data))
  );
}
1186
/**
 * Type guard for `{ data, … }` inputs: a non-null object whose `data` property
 * passes `isArrayBufferLike`.
 *
 * @param value Candidate input.
 * @returns {boolean}
 */
function isExtractMarkdownFileInput(value) {
  const isObject = value !== null && typeof value === "object";
  if (!isObject) return false;
  return "data" in value ? isArrayBufferLike(value.data) : false;
}
1190
/**
 * Type guard for `{ path, … }` inputs: a non-null object whose `path` is a string.
 *
 * @param value Candidate input.
 * @returns {boolean}
 */
function isExtractMarkdownPathInput(value) {
  return (
    value !== null &&
    typeof value === "object" &&
    "path" in value &&
    typeof value.path === "string"
  );
}
1194
/**
 * Copy of the extract options without the routing keys (`structuredFallback`,
 * `docx`, `pdf`); what remains is passed on as structured-Markdown options.
 *
 * @param [options] Extract options; falsy yields an empty object.
 * @returns {object} Shallow copy minus the three routing keys.
 */
function pickStructuredMarkdownOptions(options) {
  if (!options) return {};
  const rendererOptions = { ...options };
  for (const routingKey of ["structuredFallback", "docx", "pdf"]) {
    delete rendererOptions[routingKey];
  }
  return rendererOptions;
}
1199
/**
 * Derive the options for `convertDocxToMarkdown` from the extract options.
 *
 * - Starts from `extract.docx`.
 * - `resolveStructured` falls back to a resolver for `extract.structuredFallback`
 *   when the DOCX options do not provide one.
 * - `structuredMarkdown` merges the shared renderer options with the
 *   DOCX-specific ones; the DOCX-specific keys win.
 *
 * @param [extract] Options given to `extractMarkdown`.
 * @returns {object}
 */
function buildDocxOptions(extract) {
  const docxOptions = extract?.docx;
  const fallbackDoc = extract?.structuredFallback;
  const sharedRendererOptions = pickStructuredMarkdownOptions(extract);
  let resolveStructured = docxOptions?.resolveStructured;
  if (resolveStructured == null) {
    resolveStructured = fallbackDoc ? () => Promise.resolve(fallbackDoc) : void 0;
  }
  const structuredMarkdown = { ...sharedRendererOptions, ...docxOptions?.structuredMarkdown };
  return { ...docxOptions, resolveStructured, structuredMarkdown };
}
1209
/**
 * Derive the options for `convertPdfToMarkdown` from the extract options.
 *
 * Starts from `extract.pdf`, supplies a `resolveStructured` backed by
 * `extract.structuredFallback` when the PDF options lack one, and merges the
 * shared renderer options beneath the PDF-specific `structuredMarkdown`.
 *
 * @param [extract] Options given to `extractMarkdown`.
 * @returns {object}
 */
function buildPdfOptions(extract) {
  const pdfOptions = extract?.pdf;
  const fallbackDoc = extract?.structuredFallback;
  const sharedRendererOptions = pickStructuredMarkdownOptions(extract);
  const resolveViaFallback = fallbackDoc ? () => Promise.resolve(fallbackDoc) : void 0;
  return {
    ...pdfOptions,
    resolveStructured: pdfOptions?.resolveStructured ?? resolveViaFallback,
    structuredMarkdown: { ...sharedRendererOptions, ...pdfOptions?.structuredMarkdown }
  };
}
1219
/**
 * Obtain a `Uint8Array` for the given binary data.
 *
 * @param data A Uint8Array (returned unchanged) or any value accepted by the
 *   `Uint8Array` constructor, such as an ArrayBuffer.
 * @returns {Uint8Array}
 */
function toUint8View(data) {
  return data instanceof Uint8Array ? data : new Uint8Array(data);
}
1224
/**
 * Guess whether binary data is a PDF or a DOCX.
 *
 * Declared metadata is trusted first (MIME type, then file extension); magic
 * bytes are consulted only when the metadata is inconclusive. Any ZIP container
 * is reported as "docx", since the leading bytes cannot tell DOCX apart from
 * other ZIP-based formats.
 *
 * The ZIP test matches the complete 4-byte signatures (`PK\x03\x04`,
 * `PK\x05\x06`, `PK\x07\x08`) rather than only their first three bytes.
 *
 * @param data Bytes accepted by `toUint8View`.
 * @param {string} [filename] File name; only the extension is inspected.
 * @param {string} [mimeType] Declared MIME type.
 * @returns {"pdf"|"docx"|"unknown"}
 */
function detectBinaryFormat(data, filename, mimeType) {
  const bytes = toUint8View(data);
  const lowerName = filename?.toLowerCase() ?? "";
  const lowerMime = mimeType?.toLowerCase() ?? "";

  if (lowerMime.includes("pdf") || lowerName.endsWith(".pdf")) return "pdf";
  if (lowerMime.includes("wordprocessingml") || lowerName.endsWith(".docx")) return "docx";

  if (bytes.length >= 4) {
    const b0 = bytes[0];
    const b1 = bytes[1];
    const b2 = bytes[2];
    const b3 = bytes[3];
    // "%PDF"
    if (b0 === 0x25 && b1 === 0x50 && b2 === 0x44 && b3 === 0x46) return "pdf";
    // "PK" followed by one of the three ZIP record markers.
    const isZipMarker = (b2 === 3 && b3 === 4) || (b2 === 5 && b3 === 6) || (b2 === 7 && b3 === 8);
    if (b0 === 0x50 && b1 === 0x4b && isZipMarker) return "docx";
  }
  return "unknown";
}
1240
/**
 * Map a DOCX conversion `source` to the strategy label reported by `extractMarkdown`.
 *
 * @param source Result source from `convertDocxToMarkdown`.
 * @returns {"docx-structured-fallback"|"docx-mammoth"}
 */
function docxStrategyFromSource(source) {
  if (source === "structured-fallback") {
    return "docx-structured-fallback";
  }
  return "docx-mammoth";
}
1243
/**
 * Map a PDF conversion result to the strategy label reported by `extractMarkdown`.
 *
 * @param r Result whose `source` field is inspected.
 * @returns {"pdf-structured-fallback"|"pdf-unsupported-runtime"|"pdf-opendataloader"}
 */
function pdfStrategyFromResult(r) {
  switch (r.source) {
    case "structured-fallback":
      return "pdf-structured-fallback";
    case "unsupported-runtime":
      return "pdf-unsupported-runtime";
    default:
      return "pdf-opendataloader";
  }
}
1248
/**
 * Concatenate warning lists into a new array.
 *
 * @param base First list; copied, never mutated.
 * @param more Further lists; falsy entries (e.g. a missing `warnings` field) are skipped.
 * @returns {Array} `base` followed by the items of each remaining list, in order.
 */
function mergeWarnings(base, ...more) {
  const additions = more.filter(Boolean).flatMap((list) => [...list]);
  return [...base, ...additions];
}
1255
/**
 * Single entry point that turns any supported input into Markdown.
 *
 * Accepted inputs:
 * - a structured document (per `isStructuredDocumentResult`), rendered directly;
 * - `{ path, filename?, mimeType? }`, read from disk (Node.js only);
 * - `{ data, filename?, mimeType? }`, raw bytes.
 *
 * Bytes are classified with `detectBinaryFormat` and routed to the DOCX or PDF
 * converter. Whenever a route is unavailable or the input is unrecognised, a
 * warning is recorded and `options.structuredFallback` is rendered if present;
 * otherwise empty Markdown is returned.
 *
 * @param input Structured document, path input, or byte input.
 * @param [options] `structuredFallback`, `docx`, `pdf`, plus renderer options.
 * @returns {Promise<{markdown: string, warnings: string[], strategy: string}>}
 */
async function extractMarkdown(input, options) {
  const smOpts = pickStructuredMarkdownOptions(options);
  const fb = options?.structuredFallback;

  if (isStructuredDocumentResult(input)) {
    return {
      markdown: convertStructuredToMarkdown(input, smOpts),
      warnings: mergeWarnings([], input.warnings),
      strategy: "structured"
    };
  }

  const warnings = [];

  // Common exit for "cannot convert the bytes": render the structured fallback
  // when one was supplied, otherwise hand back empty Markdown.
  const finishWithoutConversion = (strategyWithFallback, strategyWithoutFallback) => {
    if (!fb) {
      return { markdown: "", warnings, strategy: strategyWithoutFallback };
    }
    return {
      markdown: convertStructuredToMarkdown(fb, smOpts),
      warnings: mergeWarnings(warnings, fb.warnings),
      strategy: strategyWithFallback
    };
  };

  let data;
  let filename;
  let mimeType;
  if (isExtractMarkdownPathInput(input)) {
    if (!isNodeRuntime()) {
      warnings.push(
        "@dragon708/docmind-markdown: `path` input requires Node.js to read the file. Provide `data` bytes or a StructuredDocumentResult instead."
      );
      return finishWithoutConversion("path-requires-node", "path-requires-node");
    }
    const { readFile } = await importEsm("node:fs/promises");
    const { basename } = await importEsm("node:path");
    data = await readFile(input.path);
    filename = input.filename ?? basename(input.path);
    mimeType = input.mimeType;
  } else if (isExtractMarkdownFileInput(input)) {
    data = input.data;
    filename = input.filename;
    mimeType = input.mimeType;
  } else {
    warnings.push(
      "@dragon708/docmind-markdown: extractMarkdown input must be a StructuredDocumentResult, { data, \u2026 }, or { path, \u2026 }."
    );
    return finishWithoutConversion("binary-unidentified-structured-fallback", "binary-unidentified");
  }

  const format = detectBinaryFormat(data, filename, mimeType);

  if (format === "docx") {
    if (!isNodeRuntime()) {
      warnings.push(
        "@dragon708/docmind-markdown: DOCX binary conversion needs Node.js (Mammoth/Turndown). Use structured input or run on the server."
      );
      return finishWithoutConversion("docx-requires-node", "docx-requires-node");
    }
    const docxResult = await convertDocxToMarkdown(data, buildDocxOptions(options));
    const docxWarnings = mergeWarnings(
      warnings,
      docxResult.messages.map((m) => m.message)
    );
    return {
      markdown: docxResult.markdown,
      warnings: docxWarnings,
      strategy: docxStrategyFromSource(docxResult.source)
    };
  }

  if (format === "pdf") {
    const pdfResult = await convertPdfToMarkdown(data, buildPdfOptions(options));
    const strategy = pdfStrategyFromResult(pdfResult);
    const pdfWarnings = mergeWarnings(warnings, pdfResult.warnings);
    const routeUnavailable = strategy === "pdf-unsupported-runtime" && pdfResult.markdown === "";
    if (routeUnavailable && fb) {
      return {
        markdown: convertStructuredToMarkdown(fb, smOpts),
        warnings: mergeWarnings(pdfWarnings, fb.warnings, [
          "extractMarkdown: PDF route unavailable in this runtime; used structuredFallback."
        ]),
        strategy: "pdf-structured-fallback"
      };
    }
    return { markdown: pdfResult.markdown, warnings: pdfWarnings, strategy };
  }

  warnings.push(
    "@dragon708/docmind-markdown: Unidentified binary format (expected PDF magic or ZIP/DOCX). Using structured fallback if provided."
  );
  return finishWithoutConversion("binary-unidentified-structured-fallback", "binary-unidentified");
}
1364
+
1365
+ export { convertDocxBufferToMarkdown, convertDocxToMarkdown, convertPdfBufferToMarkdown, convertPdfPathToMarkdown, convertPdfToMarkdown, convertStructuredToLlmText, convertStructuredToMarkdown, detectBinaryFormat, extractLlmContent, extractMarkdown, extractStructuredChunks, isExtractMarkdownFileInput, renderLlmText, renderMarkdown, renderMarkdownSections, splitStructuredIntoChunks, structuredDocumentToLlmText, structuredDocumentToMarkdown };
646
1366
  //# sourceMappingURL=index.js.map
647
1367
  //# sourceMappingURL=index.js.map