npm - @dragon708/docmind-markdown - Version diff - 1.0.0 → 1.1.1 - Mend

@dragon708/docmind-markdown 1.0.0 → 1.1.1

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (3)

package/dist/index.js (changed)

@@ -1,4 +1,6 @@
-// src/structuredToMarkdown.ts
+import { isStructuredDocumentResult } from '@dragon708/docmind-shared';
+// src/structured-markdown.ts
 function clampHeadingLevel(level) {
   if (level === void 0 || !Number.isFinite(level)) return 2;
   const n = Math.floor(level);
@@ -12,6 +14,9 @@ function escapeTableCell(text) {
 function safeString(s) {
   return typeof s === "string" ? s : "";
 }
+function blockText(s) {
+  return safeString(s).trim();
+}
 function safeArrays(result) {
   return {
     blocks: Array.isArray(result.blocks) ? result.blocks : [],
@@ -41,6 +46,21 @@ function metadataHeaderLines(meta) {
   }
   return lines;
 }
+function expandTableRowForMarkdown(row) {
+  const out = [];
+  for (const cell of row) {
+    const base = escapeTableCell(cell.text);
+    const rs = cell.rowSpan !== void 0 && Number.isFinite(cell.rowSpan) ? Math.max(1, Math.floor(cell.rowSpan)) : 1;
+    const cs = cell.colSpan !== void 0 && Number.isFinite(cell.colSpan) ? Math.max(1, Math.floor(cell.colSpan)) : 1;
+    const note = rs > 1 ? `${base} *(rows: ${rs})*` : base;
+    out.push({ text: note });
+    for (let i = 1; i < cs; i++) out.push({ text: "" });
+  }
+  return out;
+}
+function tableRowWidth(row) {
+  return row.length;
+}
 function tableToMarkdown(table) {
   const rows = table.rows;
   if (rows.length === 0) {
@@ -49,10 +69,12 @@ function tableToMarkdown(table) {
 *(empty table)*
 ` : "*(empty table)*\n";
   }
-  const width = Math.max(...rows.map((r) => r.length));
-  const header = rows[0] ?? [];
+  const expanded = rows.map((r) => expandTableRowForMarkdown(r));
+  const width = Math.max(1, ...expanded.map(tableRowWidth));
+  const padRow = (cells) => Array.from({ length: width }, (_, i) => escapeTableCell(cells[i]?.text ?? ""));
+  const line = (cells) => `| ${padRow(cells).join(" | ")} |`;
+  const header = expanded[0] ?? [];
   const sep = Array.from({ length: width }, () => "---");
-  const line = (cells) => `| ${Array.from({ length: width }, (_, i) => escapeTableCell(cells[i]?.text ?? "")).join(" | ")} |`;
   const out = [];
   if (table.caption) {
     out.push(`**${escapeTableCell(table.caption)}**`);
@@ -60,8 +82,8 @@ function tableToMarkdown(table) {
   }
   out.push(line(header));
   out.push(`| ${sep.join(" | ")} |`);
-  for (let r = 1; r < rows.length; r++) {
-    out.push(line(rows[r]));
+  for (let r = 1; r < expanded.length; r++) {
+    out.push(line(expanded[r]));
   }
   return `${out.join("\n")}
 `;
@@ -88,10 +110,12 @@ function referencedImageIds(blocks) {
 }
 function convertStructuredToMarkdown(result, options) {
   const imagePlaceholder = options?.imagePlaceholder ?? "<!-- image: no src -->";
+  const imageMissingSrcMode = options?.imageMissingSrcMode ?? "placeholder";
   const pageSep = (options?.pageSeparator ?? "---").trimEnd();
   const pageTransitions = options?.pageTransitionMarkers !== false;
   const appendOrphanTables = options?.appendUnreferencedTables !== false;
   const appendOrphanImages = options?.appendUnreferencedImages === true;
+  const appendWarningsSection = options?.appendWarningsSection === true;
   const { blocks, tables, pages, images } = safeArrays(result);
   const hasPageModel = pages.length > 0;
   const parts = [];
@@ -139,11 +163,12 @@ function convertStructuredToMarkdown(result, options) {
       tables,
       images,
       imagePlaceholder,
+      imageMissingSrcMode,
       orderedDepthCounters,
       resetListState,
       pageSep
     );
-    parts.push(chunk);
+    if (chunk.length > 0) parts.push(chunk);
   }
   if (appendOrphanTables) {
     const used = referencedTableIds(blocks);
@@ -176,6 +201,18 @@ function convertStructuredToMarkdown(result, options) {
       }
     }
   }
+  if (appendWarningsSection) {
+    const warns = Array.isArray(result.warnings) ? result.warnings : [];
+    if (warns.length > 0) {
+      parts.push("");
+      parts.push("### Extraction warnings");
+      parts.push("");
+      for (const w of warns) {
+        const line = String(w).replace(/\r?\n/g, " ").trim();
+        if (line.length > 0) parts.push(`- ${line}`);
+      }
+    }
+  }
   let body = parts.join("\n\n").replace(/\n{3,}/g, "\n\n").trimEnd();
   if (body.length === 0) {
     body = safeString(result.text).trim();
@@ -188,49 +225,93 @@ function escapeCommentText(s) {
 function listItemLine(block, orderedDepthCounters) {
   const depth = Math.max(0, block.depth ?? 0);
   const indent = "  ".repeat(depth);
-  const style = block.listStyle ?? "unordered";
+  const style = block.listStyle === "ordered" ? "ordered" : "unordered";
   if (style === "ordered") {
     while (orderedDepthCounters.length <= depth) orderedDepthCounters.push(0);
     orderedDepthCounters.length = depth + 1;
     orderedDepthCounters[depth] = (orderedDepthCounters[depth] ?? 0) + 1;
     const n = orderedDepthCounters[depth];
-    return `${indent}${n}. ${block.text.trim()}`;
+    return `${indent}${n}. ${blockText(block.text)}`;
   }
   orderedDepthCounters.length = depth;
-  return `${indent}- ${block.text.trim()}`;
+  return `${indent}- ${blockText(block.text)}`;
+}
+function imageRefWithoutSrcMarkdown(imageId, altDisplay, imagePlaceholder, imageMissingSrcMode, kind) {
+  if (imageMissingSrcMode === "llm-label") {
+    const altPart = altDisplay.length > 0 ? ` \u2014 _${altDisplay.replace(/_/g, "\\_")}_` : "";
+    return `*[Image: \`${escapeBackticks(imageId)}\`${altPart}]*`;
+  }
+  const hint = kind === "placeholder" ? " (placeholder)" : kind === "embedded" ? " (embedded)" : kind === "external" ? " (external, no URL)" : "";
+  return `${imagePlaceholder}${hint}`;
+}
+function escapeBackticks(s) {
+  return s.replace(/`/g, "\\`");
 }
-function blockToMarkdown(block, tables, images, imagePlaceholder, orderedDepthCounters, resetListState, pageSep) {
+function quoteMarkdownLines(text, prefix) {
+  return text.split(/\r?\n/).map((ln) => `${prefix}${ln}`).join("\n");
+}
+function unknownBlockToMarkdown(block) {
+  const hint = block.hint?.trim();
+  const raw = block.raw?.trim();
+  if (raw && hint) {
+    return `> _Unrecognized block:_ ${hint}
+>
+${quoteMarkdownLines(raw, "> ")}`;
+  }
+  if (raw) {
+    return raw.includes("\n") ? quoteMarkdownLines(raw, "> ") : raw;
+  }
+  if (hint) {
+    return `> _Unrecognized:_ ${hint}`;
+  }
+  return "<!-- unknown block -->";
+}
+function blockToMarkdown(block, tables, images, imagePlaceholder, imageMissingSrcMode, orderedDepthCounters, resetListState, pageSep) {
   switch (block.type) {
     case "heading": {
       resetListState();
       const level = clampHeadingLevel(block.level);
       const hashes = "#".repeat(level);
-      return `${hashes} ${block.text.trim()}`;
+      const t = blockText(block.text);
+      if (t.length === 0) return "";
+      return `${hashes} ${t}`;
     }
     case "paragraph": {
       resetListState();
-      return block.text.trim();
+      return blockText(block.text);
     }
     case "list-item":
-      return listItemLine(block, orderedDepthCounters);
+      return blockText(block.text).length === 0 ? "" : listItemLine(block, orderedDepthCounters);
     case "table": {
       resetListState();
-      const t = resolveTable(tables, block.tableId);
+      const tid = safeString(block.tableId);
+      if (!tid) {
+        return `<!-- table block: missing tableId -->`;
+      }
+      const t = resolveTable(tables, tid);
       if (!t) {
-        return `<!-- table not found: ${escapeCommentText(block.tableId)} -->`;
+        return `<!-- table not found: ${escapeCommentText(tid)} -->`;
       }
       return tableToMarkdown(t).trimEnd();
     }
     case "image-ref": {
       resetListState();
-      const img = resolveImage(images, block.imageId);
+      const iid = safeString(block.imageId);
+      if (!iid) {
+        return `<!-- image-ref: missing imageId -->`;
+      }
+      const img = resolveImage(images, iid);
       const altRaw = block.alt ?? img?.alt ?? "";
       const alt = altRaw.replace(/]/g, "\\]");
       const src = img?.src;
       if (src) return `![${alt}](${src})`;
-      const kind = img?.kind;
-      const hint = kind === "placeholder" ? " (placeholder)" : kind === "embedded" ? " (embedded)" : "";
-      return `${imagePlaceholder}${hint}`;
+      return imageRefWithoutSrcMarkdown(
+        iid,
+        altRaw.trim(),
+        imagePlaceholder,
+        imageMissingSrcMode,
+        img?.kind
+      );
     }
     case "page-break": {
       resetListState();
@@ -238,10 +319,7 @@ function blockToMarkdown(block, tables, images, imagePlaceholder, orderedDepthCo
     }
     case "unknown": {
       resetListState();
-      const raw = block.raw?.trim();
-      if (raw) return raw;
-      if (block.hint) return `<!-- ${escapeCommentText(block.hint)} -->`;
-      return "<!-- unknown block -->";
+      return unknownBlockToMarkdown(block);
     }
     default: {
       const _exhaustive = block;
@@ -253,7 +331,7 @@ function structuredDocumentToMarkdown(structured, options) {
   return convertStructuredToMarkdown(structured, options);
 }
-// src/structuredToLlmText.ts
+// src/llm-text.ts
 function clampHeadingLevel2(level) {
   if (level === void 0 || !Number.isFinite(level)) return 2;
   const n = Math.floor(level);
@@ -261,6 +339,13 @@ function clampHeadingLevel2(level) {
   if (n > 6) return 6;
   return n;
 }
+function sanitizeNoiseChars(s) {
+  return s.replace(/[\u200B-\u200D\uFEFF\u2060]/g, "").replace(/\u00A0/g, " ").replace(/[ \t\f\v]+/g, " ").trim();
+}
+function sanitizeLineOriented(text, enabled) {
+  if (!enabled) return text;
+  return text.split("\n").map((line) => sanitizeNoiseChars(line)).join("\n");
+}
 function safeArrays2(result) {
   return {
     blocks: Array.isArray(result.blocks) ? result.blocks : [],
@@ -296,17 +381,45 @@ function metadataDocBlock(meta, extraMax) {
   return `[DOC]
 ${lines.join("\n")}`;
 }
-function tableToLlmBlock(table, tag) {
+function cellToLlmSegment(text, sanitize) {
+  const one = text.replace(/\r?\n/g, " ").replace(/\|/g, "\xB7").replace(/\s+/g, " ").trim();
+  return sanitize ? sanitizeNoiseChars(one) : one;
+}
+function expandTableRowForLlm(row) {
+  const out = [];
+  for (const cell of row) {
+    const base = cell.text;
+    const rs = cell.rowSpan !== void 0 && Number.isFinite(cell.rowSpan) ? Math.max(1, Math.floor(cell.rowSpan)) : 1;
+    const cs = cell.colSpan !== void 0 && Number.isFinite(cell.colSpan) ? Math.max(1, Math.floor(cell.colSpan)) : 1;
+    const note = rs > 1 ? `${base} (rows\xD7${rs})` : base;
+    out.push(note);
+    for (let i = 1; i < cs; i++) out.push("");
+  }
+  return out;
+}
+function tableHeaderRuleLine(nCols, glue) {
+  const unit = "---";
+  return Array.from({ length: Math.max(1, nCols) }, () => unit).join(glue);
+}
+function tableToLlmBlock(table, tag, glue, headerSep, sanitize) {
   const lines = [];
   lines.push(`${tag} id=${table.id}`);
-  if (table.caption) lines.push(`Caption: ${table.caption}`);
+  if (table.caption) {
+    lines.push(`Caption: ${sanitize ? sanitizeNoiseChars(table.caption) : table.caption}`);
+  }
   const rows = table.rows;
   if (rows.length === 0) {
     lines.push("(empty table)");
     return lines.join("\n");
   }
-  for (const row of rows) {
-    lines.push(row.map((c) => c.text.replace(/\r?\n/g, " ").trim()).join(" | "));
+  const expanded = rows.map((r) => expandTableRowForLlm(r));
+  for (let ri = 0; ri < expanded.length; ri++) {
+    const row = expanded[ri];
+    const rendered = row.map((c) => cellToLlmSegment(c, sanitize)).join(glue);
+    lines.push(rendered);
+    if (headerSep && ri === 0 && expanded.length > 1) {
+      lines.push(tableHeaderRuleLine(row.length, glue));
+    }
   }
   return lines.join("\n");
 }
@@ -320,11 +433,12 @@ function referencedTableIds2(blocks) {
   }
   return ids;
 }
-function listItemLine2(block, orderedDepthCounters) {
+function listItemLine2(block, orderedDepthCounters, sanitize) {
   const depth = Math.max(0, block.depth ?? 0);
   const indent = "  ".repeat(depth);
-  const style = block.listStyle ?? "unordered";
-  const text = block.text.replace(/\r?\n/g, " ").trim();
+  const raw = block.text.replace(/\r?\n/g, " ").trim();
+  const text = sanitize ? sanitizeNoiseChars(raw) : raw.replace(/\s+/g, " ").trim();
+  const style = block.listStyle === "ordered" ? "ordered" : "unordered";
   if (style === "ordered") {
     while (orderedDepthCounters.length <= depth) orderedDepthCounters.push(0);
     orderedDepthCounters.length = depth + 1;
@@ -335,31 +449,56 @@ function listItemLine2(block, orderedDepthCounters) {
   orderedDepthCounters.length = depth;
   return `${indent}\u2022 ${text}`;
 }
-function blockToLlm(block, tables, images, tableTag, imageTag, pageMarker, orderedDepthCounters, resetListState, skipEmptyParagraphs) {
+function unknownToLlm(block, sanitize) {
+  const hint = block.hint?.trim();
+  const raw = block.raw?.trim();
+  if (raw && hint) {
+    const h = sanitize ? sanitizeNoiseChars(hint) : hint;
+    const body = raw.split(/\r?\n/).map((ln) => sanitize ? sanitizeNoiseChars(ln) : ln.trimEnd()).join("\n");
+    return `[UNKNOWN] ${h}
+${body.split("\n").map((l) => `  ${l}`).join("\n")}`;
+  }
+  if (raw) {
+    return raw.split(/\r?\n/).map((ln) => sanitize ? sanitizeNoiseChars(ln) : ln.trimEnd()).join("\n");
+  }
+  if (hint) {
+    return `[UNKNOWN] ${sanitize ? sanitizeNoiseChars(hint) : hint}`;
+  }
+  return "[UNKNOWN]";
+}
+function blockToLlm(block, tables, images, tableTag, imageTag, pageMarker, orderedDepthCounters, resetListState, skipEmptyParagraphs, glue, headerSep, sanitize) {
   switch (block.type) {
     case "heading": {
       resetListState();
       const lv = clampHeadingLevel2(block.level);
-      return `[H${lv}] ${block.text.replace(/\r?\n/g, " ").trim()}`;
+      const t = block.text.replace(/\r?\n/g, " ").trim();
+      if (t.length === 0) return void 0;
+      const text = sanitize ? sanitizeNoiseChars(t) : t.replace(/\s+/g, " ").trim();
+      return `[H${lv}] ${text}`;
     }
     case "paragraph": {
       resetListState();
       const t = block.text.trim();
       if (t.length === 0 && skipEmptyParagraphs) return void 0;
-      return t.replace(/\r?\n/g, " ").replace(/\s+/g, " ").trim();
+      const flat = t.replace(/\r?\n/g, " ").replace(/\s+/g, " ").trim();
+      return sanitize ? sanitizeNoiseChars(flat) : flat;
+    }
+    case "list-item": {
+      const t = block.text.trim();
+      if (t.length === 0) return void 0;
+      return listItemLine2(block, orderedDepthCounters, sanitize);
     }
-    case "list-item":
-      return listItemLine2(block, orderedDepthCounters);
     case "table": {
       resetListState();
       const t = resolveTable2(tables, block.tableId);
       if (!t) return `${tableTag} MISSING id=${block.tableId}`;
-      return tableToLlmBlock(t, tableTag);
+      return tableToLlmBlock(t, tableTag, glue, headerSep, sanitize);
     }
     case "image-ref": {
       resetListState();
       const img = images.find((i) => i.id === block.imageId);
-      const alt = (block.alt ?? img?.alt ?? "").replace(/\r?\n/g, " ").trim();
+      const altRaw = (block.alt ?? img?.alt ?? "").replace(/\r?\n/g, " ").trim();
+      const alt = sanitize ? sanitizeNoiseChars(altRaw) : altRaw;
       if (img?.src) {
         return `${imageTag} alt=${JSON.stringify(alt)} url=${JSON.stringify(img.src)}`;
       }
@@ -371,9 +510,7 @@ function blockToLlm(block, tables, images, tableTag, imageTag, pageMarker, order
     }
     case "unknown": {
       resetListState();
-      const raw = block.raw?.trim();
-      if (raw) return raw.replace(/\r?\n/g, "\n");
-      return block.hint ? `[UNKNOWN: ${block.hint}]` : "[UNKNOWN]";
+      return unknownToLlm(block, sanitize);
     }
     default: {
       const _exhaustive = block;
@@ -393,6 +530,9 @@ function convertStructuredToLlmText(result, options) {
   const appendOrphanTables = options?.appendUnreferencedTables !== false;
   const compact = options?.compact === true;
   const skipEmptyParagraphs = options?.skipEmptyParagraphs !== false;
+  const sanitize = options?.sanitizeNoise !== false;
+  const headerSep = options?.tableHeaderSeparator !== false;
+  const glue = options?.tableColumnSeparator ?? " | ";
   const sep = compact ? "\n" : "\n\n";
   const { blocks, tables, pages, images, warnings } = safeArrays2(result);
   const hasPageModel = pages.length > 0;
@@ -437,7 +577,10 @@ function convertStructuredToLlmText(result, options) {
       pageMarker,
       orderedDepthCounters,
       resetListState,
-      skipEmptyParagraphs
+      skipEmptyParagraphs,
+      glue,
+      headerSep,
+      sanitize
     );
     if (chunk !== void 0 && chunk.length > 0) parts.push(chunk);
   }
@@ -447,16 +590,18 @@ function convertStructuredToLlmText(result, options) {
     if (orphans.length > 0) {
       parts.push(`[MORE_TABLES]`);
       for (const t of orphans) {
-        parts.push(tableToLlmBlock(t, tableTag));
+        parts.push(tableToLlmBlock(t, tableTag, glue, headerSep, sanitize));
       }
     }
   }
   let out = parts.join(sep).replace(/\n{3,}/g, "\n\n").trim();
+  out = sanitizeLineOriented(out, sanitize);
   if (out.length === 0 && fallback) {
     out = typeof result.text === "string" ? result.text.trim() : "";
+    out = sanitizeLineOriented(out, sanitize);
   }
   if (includeWarnings && warnings.length > 0) {
-    const warnLines = warnings.map((w) => `- ${String(w).replace(/\r?\n/g, " ")}`).join("\n");
+    const warnLines = warnings.map((w) => `- ${sanitizeNoiseChars(String(w).replace(/\r?\n/g, " "))}`).join("\n");
     const block = `[WARNINGS]
 ${warnLines}`;
     out = out ? `${out}${sep}${block}` : block;
@@ -467,7 +612,7 @@ function structuredDocumentToLlmText(structured, options) {
   return convertStructuredToLlmText(structured, options);
 }
-// src/splitStructuredIntoChunks.ts
+// src/chunking.ts
 var SLICE_MARKDOWN_OPTS = {
   includeMetadataHeader: false,
   pageTransitionMarkers: false,
@@ -479,7 +624,9 @@ var SLICE_LLM_OPTS = {
   includeDocumentMetadata: false,
   includeWarnings: false,
   pageTransitionMarkers: false,
-  appendUnreferencedTables: false
+  appendUnreferencedTables: false,
+  tableHeaderSeparator: true,
+  sanitizeNoise: true
 };
 function clampHeadingLevel3(level) {
   if (level === void 0 || !Number.isFinite(level)) return 2;
@@ -504,15 +651,83 @@ function renderSlice(result, block, includeMarkdown) {
 function joinChunkParts(parts) {
   return parts.map((p) => p.trim()).filter((p) => p.length > 0).join("\n\n");
 }
+function pageSpanLabelFromRange(minP, maxP) {
+  if (minP === void 0) return void 0;
+  const a = minP + 1;
+  if (maxP === void 0 || maxP === minP) return String(a);
+  const b = maxP + 1;
+  return `${a}\u2013${b}`;
+}
 function safeBlocks(result) {
   return Array.isArray(result.blocks) ? result.blocks : [];
 }
+function packUnitsIntoChunks(units, options) {
+  const {
+    maxChars,
+    overlapChars,
+    preferHeadings,
+    preserveTables,
+    includeMarkdown,
+    includePageSpanLabel
+  } = options;
+  const chunks = [];
+  let current = [];
+  let pendingTextPrefix = "";
+  function projectedTextLength(next) {
+    const body = joinChunkParts(current.map((u) => u.text).concat(next.text));
+    const full = pendingTextPrefix ? `${pendingTextPrefix}${body.length > 0 ? "\n\n" + body : ""}` : body;
+    return full.length;
+  }
+  function flush() {
+    if (current.length === 0) return;
+    const body = joinChunkParts(current.map((u) => u.text));
+    const text = pendingTextPrefix ? `${pendingTextPrefix}${body.length > 0 ? "\n\n" + body : ""}`.trim() : body.trim();
+    const markdown = includeMarkdown && current.length > 0 ? joinChunkParts(current.map((u) => u.md)).trim() : void 0;
+    const pages = current.map((u) => u.pageIndex).filter((n) => n !== void 0);
+    const pageIndex = pages.length > 0 ? Math.min(...pages) : void 0;
+    const pageEndIndex = pages.length > 0 ? Math.max(...pages) : void 0;
+    const headingPath = current.length > 0 ? current[current.length - 1].headingPath : void 0;
+    const pageSpanLabel = includePageSpanLabel && pageIndex !== void 0 ? pageSpanLabelFromRange(pageIndex, pageEndIndex) : void 0;
+    if (text.length > 0 || markdown && markdown.length > 0) {
+      chunks.push({
+        index: chunks.length,
+        text,
+        markdown: markdown && markdown.length > 0 ? markdown : void 0,
+        headingPath: headingPath && headingPath.length > 0 ? [...headingPath] : void 0,
+        pageIndex,
+        pageEndIndex,
+        pageSpanLabel
+      });
+    }
+    pendingTextPrefix = overlapChars > 0 && text.length > 0 ? text.slice(Math.max(0, text.length - overlapChars)).trimStart() : "";
+    current = [];
+  }
+  for (let i = 0; i < units.length; i++) {
+    const unit = units[i];
+    if (preferHeadings && unit.isHeading && current.length > 0) {
+      flush();
+    }
+    if (preserveTables && unit.isTable && unit.text.length > maxChars) {
+      if (current.length > 0) flush();
+      current = [unit];
+      flush();
+      continue;
+    }
+    if (current.length > 0 && projectedTextLength(unit) > maxChars) {
+      flush();
+    }
+    current.push(unit);
+  }
+  flush();
+  return chunks;
+}
 function splitStructuredIntoChunks(result, options) {
   const maxChars = Math.max(1, options?.maxChars ?? 4e3);
   const overlapChars = Math.max(0, options?.overlapChars ?? 0);
   const preferHeadings = options?.preferHeadings !== false;
   const preserveTables = options?.preserveTables !== false;
   const includeMarkdown = options?.includeMarkdown !== false;
+  const includePageSpanLabel = options?.includePageSpanLabel !== false;
   const blocks = safeBlocks(result);
   if (blocks.length === 0) {
     const text = convertStructuredToLlmText(result, {
@@ -532,7 +747,8 @@ function splitStructuredIntoChunks(result, options) {
         markdown: md && md.length > 0 ? md : void 0,
         headingPath: void 0,
         pageIndex: void 0,
-        pageEndIndex: void 0
+        pageEndIndex: void 0,
+        pageSpanLabel: void 0
       }
     ];
   }
@@ -558,54 +774,15 @@ function splitStructuredIntoChunks(result, options) {
       headingPath
     });
   }
-  const chunks = [];
-  let current = [];
-  let pendingTextPrefix = "";
-  function projectedTextLength(next) {
-    const body = joinChunkParts(current.map((u) => u.text).concat(next.text));
-    const full = pendingTextPrefix ? `${pendingTextPrefix}${body.length > 0 ? "\n\n" + body : ""}` : body;
-    return full.length;
-  }
-  function flush() {
-    if (current.length === 0) return;
-    const body = joinChunkParts(current.map((u) => u.text));
-    const text = pendingTextPrefix ? `${pendingTextPrefix}${body.length > 0 ? "\n\n" + body : ""}`.trim() : body.trim();
-    const markdown = includeMarkdown && current.length > 0 ? joinChunkParts(current.map((u) => u.md)).trim() : void 0;
-    const pages = current.map((u) => u.pageIndex).filter((n) => n !== void 0);
-    const pageIndex = pages.length > 0 ? Math.min(...pages) : void 0;
-    const pageEndIndex = pages.length > 0 ? Math.max(...pages) : void 0;
-    const headingPath = current.length > 0 ? current[current.length - 1].headingPath : void 0;
-    if (text.length > 0 || markdown && markdown.length > 0) {
-      chunks.push({
-        index: chunks.length,
-        text,
-        markdown: markdown && markdown.length > 0 ? markdown : void 0,
-        headingPath: headingPath && headingPath.length > 0 ? [...headingPath] : void 0,
-        pageIndex,
-        pageEndIndex
-      });
-    }
-    pendingTextPrefix = overlapChars > 0 && text.length > 0 ? text.slice(Math.max(0, text.length - overlapChars)).trimStart() : "";
-    current = [];
-  }
-  for (let i = 0; i < units.length; i++) {
-    const unit = units[i];
-    if (preferHeadings && unit.isHeading && current.length > 0) {
-      flush();
-    }
-    if (preserveTables && unit.isTable && unit.text.length > maxChars) {
-      if (current.length > 0) flush();
-      current = [unit];
-      flush();
-      continue;
-    }
-    if (current.length > 0 && projectedTextLength(unit) > maxChars) {
-      flush();
-    }
-    current.push(unit);
-  }
-  flush();
-  if (chunks.length === 0) {
+  const packed = packUnitsIntoChunks(units, {
+    maxChars,
+    overlapChars,
+    preferHeadings,
+    preserveTables,
+    includeMarkdown,
+    includePageSpanLabel
+  });
+  if (packed.length === 0) {
     return [
       {
         index: 0,
@@ -613,12 +790,14 @@ function splitStructuredIntoChunks(result, options) {
         markdown: void 0,
         headingPath: void 0,
         pageIndex: void 0,
-        pageEndIndex: void 0
+        pageEndIndex: void 0,
+        pageSpanLabel: void 0
       }
     ];
   }
-  return chunks.map((c, i) => ({ ...c, index: i }));
+  return packed.map((c, i) => ({ ...c, index: i }));
 }
+var extractStructuredChunks = splitStructuredIntoChunks;
 // src/render.ts
 function renderMarkdown(result, options) {
@@ -627,6 +806,9 @@ function renderMarkdown(result, options) {
 function renderLlmText(result, options) {
   return convertStructuredToLlmText(result, options);
 }
+function extractLlmContent(result, options) {
+  return renderLlmText(result, options);
+}
 function renderMarkdownSections(result, options) {
   const chunks = splitStructuredIntoChunks(result, {
     ...options,
@@ -638,10 +820,548 @@ function renderMarkdownSections(result, options) {
     headingPath: c.headingPath,
     pageIndex: c.pageIndex,
     pageEndIndex: c.pageEndIndex,
+    pageSpanLabel: c.pageSpanLabel,
     text: c.text.trim().length > 0 ? c.text.trim() : void 0
   }));
 }
-export { convertStructuredToLlmText, convertStructuredToMarkdown, renderLlmText, renderMarkdown, renderMarkdownSections, splitStructuredIntoChunks, structuredDocumentToLlmText, structuredDocumentToMarkdown };
+// src/dynamic-import-runtime.ts
+function isNodeJsRuntime() {
+  return typeof process !== "undefined" && process.versions != null && typeof process.versions.node === "string";
+}
+function importEsm(moduleId) {
+  if (isNodeJsRuntime()) {
+    return import(moduleId);
+  }
+  const run = new Function(
+    "id",
+    "return import(id)"
+  );
+  return run(moduleId);
+}
+// src/node-runtime.ts
+function isNodeRuntime() {
+  return typeof process !== "undefined" && typeof process.versions?.node === "string";
+}
+function assertNodeRuntime(capability) {
+  if (!isNodeRuntime()) {
+    throw new Error(
+      `@dragon708/docmind-markdown: ${capability} is only available in Node.js.`
+    );
+  }
+}
+// src/docx-markdown.ts
+var TABLE_OMITTED_HTML = "\n<p><em>(Table omitted)</em></p>\n";
+function normalizeMammothMessages(messages) {
+  return messages.map((m) => ({ type: m.type, message: m.message }));
+}
+async function toNodeBuffer(input) {
+  const { Buffer: Buffer2 } = await importEsm("node:buffer");
+  if (Buffer2.isBuffer(input)) return input;
+  if (input instanceof ArrayBuffer) return Buffer2.from(input);
+  return Buffer2.from(input);
+}
+function stripTablesFromHtml(html) {
+  return html.replace(/<table\b[^>]*>[\s\S]*?<\/table>/gi, TABLE_OMITTED_HTML);
+}
+function stripImagesFromHtml(html) {
+  return html.replace(/<img\b[^>]*\/?>/gi, "");
+}
+function stripPageBreakHrsFromHtml(html) {
+  return html.replace(
+    /<hr\b[^>]*>/gi,
+    (tag) => /\bpage-break\b/i.test(tag) ? "" : tag
+  );
+}
+function mergeStyleMaps(includePageBreaks, user) {
+  const parts = [];
+  if (includePageBreaks) {
+    parts.push("br[type=page] => hr.page-break");
+  }
+  if (typeof user === "string") parts.push(user);
+  else if (Array.isArray(user)) parts.push(...user);
+  if (parts.length === 0) return void 0;
+  return parts;
+}
+function applyCompactMarkdown(markdown) {
+  return markdown.split("\n").map((line) => line.replace(/[ \t]+$/g, "")).join("\n").replace(/\n{3,}/g, "\n\n").trim();
+}
+function shouldTryStructuredFallback(markdown, minLen) {
+  const t = markdown.trim();
+  if (t.length === 0) return "empty";
+  if (minLen !== void 0 && minLen > 0 && t.length < minLen) return "short";
+  return null;
+}
+function buildTurndownBaseOptions() {
+  return {
+    headingStyle: "atx",
+    codeBlockStyle: "fenced",
+    bulletListMarker: "-"
+  };
+}
+async function convertDocxToMarkdown(input, options) {
+  assertNodeRuntime("DOCX \u2192 Markdown (Mammoth \u2192 Turndown)");
+  const includeTables = options?.includeTables !== false;
+  const includeImages = options?.includeImages !== false;
+  const includePageBreaks = options?.includePageBreaks !== false;
+  const compactMode = options?.compactMode === true;
+  const minMarkdownLength = options?.minMarkdownLength;
+  const resolveStructured = options?.resolveStructured;
+  const structuredMdOpts = options?.structuredMarkdown;
+  const [{ default: mammoth }, { default: TurndownService }, { gfm }, buffer] = await Promise.all([
+    importEsm("mammoth"),
+    importEsm("turndown"),
+    includeTables ? importEsm("turndown-plugin-gfm") : Promise.resolve({ gfm: null }),
+    toNodeBuffer(input)
+  ]);
+  const styleMap = mergeStyleMaps(includePageBreaks, options?.mammoth?.styleMap);
+  const mammothOpts = {
+    ...options?.mammoth,
+    ...styleMap !== void 0 ? { styleMap } : {}
+  };
+  if (includeImages && mammothOpts.convertImage === void 0) {
+    mammothOpts.convertImage = mammoth.images.dataUri;
+  }
+  const runDirect = async () => {
+    const htmlResult = await mammoth.convertToHtml(
+      { buffer },
+      mammothOpts
+    );
+    let html = htmlResult.value;
+    if (!includeTables) html = stripTablesFromHtml(html);
+    if (!includeImages) html = stripImagesFromHtml(html);
+    if (!includePageBreaks) html = stripPageBreakHrsFromHtml(html);
+    const tdBase = {
+      ...buildTurndownBaseOptions(),
+      ...options?.turndown
+    };
+    const service = new TurndownService(
+      tdBase
+    );
+    if (includeTables && gfm) {
+      gfm(service);
+    }
+    let markdown = service.turndown(html).trim();
+    if (compactMode) markdown = applyCompactMarkdown(markdown);
+    return {
+      markdown,
+      messages: normalizeMammothMessages(htmlResult.messages)
+    };
+  };
+  const runFallback = async (reason, priorMessages, err) => {
+    if (!resolveStructured) {
+      if (reason === "error" && err !== void 0) throw err;
+      return {
+        markdown: "",
+        source: "mammoth-turndown",
+        messages: priorMessages,
+        fallbackReason: reason
+      };
+    }
+    const structured = await resolveStructured();
+    const md = convertStructuredToMarkdown(structured, structuredMdOpts);
+    const extra = [];
+    if (reason === "error" && err !== void 0) {
+      extra.push({
+        type: "warning",
+        message: `DOCX direct conversion failed; used structured fallback: ${String(err)}`
+      });
+    }
+    return {
+      markdown: compactMode ? applyCompactMarkdown(md) : md.trim(),
+      source: "structured-fallback",
+      messages: [...priorMessages, ...extra],
+      fallbackReason: reason
+    };
+  };
+  try {
+    const { markdown, messages } = await runDirect();
+    const insuff = shouldTryStructuredFallback(markdown, minMarkdownLength);
+    if (insuff && resolveStructured) {
+      return await runFallback(insuff, messages);
+    }
+    return { markdown, source: "mammoth-turndown", messages };
+  } catch (err) {
+    return await runFallback("error", [], err);
+  }
+}
+async function convertDocxBufferToMarkdown(input, options) {
+  const r = await convertDocxToMarkdown(input, options);
+  return { markdown: r.markdown, messages: r.messages };
+}
+// src/pdf-markdown.ts
+// Warning text returned by convertPdfToMarkdown when isNodeRuntime() reports a non-Node runtime.
+var BROWSER_WARNING = "@dragon708/docmind-markdown: PDF \u2192 Markdown via @opendataloader/pdf requires Node.js. In the browser, use a server-side conversion or supply structured text/Markdown from your backend.";
+function normalizePdfMarkdown(markdown, clean) {
+  const t = markdown.trim();
+  if (!clean) return t;
+  return t.replace(/\n{3,}/g, "\n\n");
+}
+function engineOptions(options) {
+  if (!options) return {};
+  const {
+    resolveStructured: _r,
+    structuredMarkdown: _s,
+    cleanMarkdown: _c,
+    ...rest
+  } = options;
+  return rest;
+}
+async function toNodeBuffer2(input) {
+  const { Buffer: Buffer2 } = await importEsm("node:buffer");
+  if (Buffer2.isBuffer(input)) return input;
+  if (input instanceof ArrayBuffer) return Buffer2.from(input);
+  return Buffer2.from(input);
+}
+async function convertPdfToMarkdown(input, options) {
+  const clean = options?.cleanMarkdown !== false;
+  const resolveStructured = options?.resolveStructured;
+  const structuredMdOpts = options?.structuredMarkdown;
+  const eng = engineOptions(options);
+  if (!isNodeRuntime()) {
+    return {
+      markdown: "",
+      warnings: [BROWSER_WARNING],
+      source: "unsupported-runtime",
+      fallbackReason: "unsupported-runtime"
+    };
+  }
+  const warnings = [];
+  let cleanup;
+  try {
+    let inputPath;
+    if (typeof input === "string") {
+      inputPath = input;
+    } else {
+      const [{ mkdtemp, writeFile, rm }, { join }, { tmpdir }, buffer] = await Promise.all([
+        importEsm("node:fs/promises"),
+        importEsm("node:path"),
+        importEsm("node:os"),
+        toNodeBuffer2(input)
+      ]);
+      const dir = await mkdtemp(join(tmpdir(), "docmind-markdown-pdf-"));
+      inputPath = join(dir, "document.pdf");
+      await writeFile(inputPath, buffer);
+      cleanup = async () => rm(dir, { recursive: true, force: true });
+    }
+    let convert;
+    try {
+      ({ convert } = await importEsm(
+        "@opendataloader/pdf"
+      ));
+    } catch (e) {
+      const hint = e instanceof Error && /Cannot find module|MODULE_NOT_FOUND/i.test(e.message) ? " Install `@opendataloader/pdf` in your project." : "";
+      warnings.push(
+        `@opendataloader/pdf could not be loaded (${e instanceof Error ? e.message : String(e)}).${hint}`
+      );
+      if (resolveStructured) {
+        try {
+          const structured = await resolveStructured();
+          const md = normalizePdfMarkdown(
+            convertStructuredToMarkdown(structured, structuredMdOpts),
+            clean
+          );
+          return {
+            markdown: md,
+            warnings,
+            source: "structured-fallback",
+            fallbackReason: "module-not-found"
+          };
+        } catch (e2) {
+          warnings.push(
+            `Structured fallback failed: ${e2 instanceof Error ? e2.message : String(e2)}`
+          );
+        }
+      }
+      return {
+        markdown: "",
+        warnings,
+        source: "opendataloader",
+        fallbackReason: "module-not-found"
+      };
+    }
+    let rawMarkdown;
+    try {
+      rawMarkdown = await convert(inputPath, {
+        ...eng,
+        format: "markdown",
+        toStdout: true,
+        quiet: eng.quiet !== false
+      }).then((s) => String(s));
+    } catch (e) {
+      warnings.push(`PDF conversion failed: ${e instanceof Error ? e.message : String(e)}`);
+      if (resolveStructured) {
+        try {
+          const structured = await resolveStructured();
+          const md = normalizePdfMarkdown(
+            convertStructuredToMarkdown(structured, structuredMdOpts),
+            clean
+          );
+          return {
+            markdown: md,
+            warnings,
+            source: "structured-fallback",
+            fallbackReason: "error"
+          };
+        } catch (e2) {
+          warnings.push(
+            `Structured fallback failed: ${e2 instanceof Error ? e2.message : String(e2)}`
+          );
+        }
+      }
+      return {
+        markdown: "",
+        warnings,
+        source: "opendataloader",
+        fallbackReason: "error"
+      };
+    }
+    let markdown = normalizePdfMarkdown(rawMarkdown, clean);
+    if (markdown.length === 0) {
+      warnings.push("OpenDataLoader returned empty Markdown for this PDF.");
+      if (resolveStructured) {
+        try {
+          const structured = await resolveStructured();
+          markdown = normalizePdfMarkdown(
+            convertStructuredToMarkdown(structured, structuredMdOpts),
+            clean
+          );
+          return {
+            markdown,
+            warnings,
+            source: "structured-fallback",
+            fallbackReason: "empty"
+          };
+        } catch (e2) {
+          warnings.push(
+            `Structured fallback failed: ${e2 instanceof Error ? e2.message : String(e2)}`
+          );
+        }
+      }
+      return {
+        markdown: "",
+        warnings,
+        source: "opendataloader",
+        fallbackReason: "empty"
+      };
+    }
+    return { markdown, warnings, source: "opendataloader" };
+  } finally {
+    if (cleanup) {
+      await cleanup().catch(() => {
+      });
+    }
+  }
+}
+function throwIfLegacyFailure(r) {
+  if (r.source === "unsupported-runtime") {
+    throw new Error(r.warnings[0] ?? "PDF \u2192 Markdown requires Node.js.");
+  }
+  if (r.markdown.trim().length === 0 && r.source !== "structured-fallback") {
+    throw new Error(
+      r.warnings.length > 0 ? r.warnings.join("; ") : "PDF conversion produced no Markdown."
+    );
+  }
+}
+async function convertPdfPathToMarkdown(inputPath, options) {
+  assertNodeRuntime("PDF \u2192 Markdown (@opendataloader/pdf)");
+  const r = await convertPdfToMarkdown(inputPath, options);
+  throwIfLegacyFailure(r);
+  return { markdown: r.markdown };
+}
+async function convertPdfBufferToMarkdown(input, options) {
+  assertNodeRuntime("PDF \u2192 Markdown (@opendataloader/pdf)");
+  const r = await convertPdfToMarkdown(input, options);
+  throwIfLegacyFailure(r);
+  return { markdown: r.markdown };
+}
+function isArrayBufferLike(data) {
+  if (data instanceof ArrayBuffer) return true;
+  if (typeof Uint8Array !== "undefined" && data instanceof Uint8Array) return true;
+  if (typeof Buffer !== "undefined" && Buffer.isBuffer(data)) return true;
+  return false;
+}
+function isExtractMarkdownFileInput(value) {
+  if (value === null || typeof value !== "object" || !("data" in value)) return false;
+  return isArrayBufferLike(value.data);
+}
+function isExtractMarkdownPathInput(value) {
+  if (value === null || typeof value !== "object" || !("path" in value)) return false;
+  return typeof value.path === "string";
+}
+function pickStructuredMarkdownOptions(options) {
+  if (!options) return {};
+  const { structuredFallback: _a, docx: _b, pdf: _c, ...rest } = options;
+  return rest;
+}
+function buildDocxOptions(extract) {
+  const docx = extract?.docx;
+  const fb = extract?.structuredFallback;
+  const sm = pickStructuredMarkdownOptions(extract);
+  return {
+    ...docx,
+    resolveStructured: docx?.resolveStructured ?? (fb ? () => Promise.resolve(fb) : void 0),
+    structuredMarkdown: { ...sm, ...docx?.structuredMarkdown }
+  };
+}
+function buildPdfOptions(extract) {
+  const pdf = extract?.pdf;
+  const fb = extract?.structuredFallback;
+  const sm = pickStructuredMarkdownOptions(extract);
+  return {
+    ...pdf,
+    resolveStructured: pdf?.resolveStructured ?? (fb ? () => Promise.resolve(fb) : void 0),
+    structuredMarkdown: { ...sm, ...pdf?.structuredMarkdown }
+  };
+}
+function toUint8View(data) {
+  if (data instanceof Uint8Array) return data;
+  if (data instanceof ArrayBuffer) return new Uint8Array(data);
+  return new Uint8Array(data);
+}
+function detectBinaryFormat(data, filename, mimeType) {
+  const u = toUint8View(data);
+  const lower = filename?.toLowerCase() ?? "";
+  const mime = mimeType?.toLowerCase() ?? "";
+  if (mime.includes("pdf") || lower.endsWith(".pdf")) return "pdf";
+  if (mime.includes("wordprocessingml") || mime.includes("officedocument.wordprocessingml.document") || lower.endsWith(".docx")) {
+    return "docx";
+  }
+  if (u.length >= 4 && u[0] === 37 && u[1] === 80 && u[2] === 68 && u[3] === 70) {
+    return "pdf";
+  }
+  if (u.length >= 4 && u[0] === 80 && u[1] === 75 && (u[2] === 3 || u[2] === 5 || u[2] === 7)) {
+    return "docx";
+  }
+  return "unknown";
+}
+function docxStrategyFromSource(source) {
+  return source === "structured-fallback" ? "docx-structured-fallback" : "docx-mammoth";
+}
+function pdfStrategyFromResult(r) {
+  if (r.source === "structured-fallback") return "pdf-structured-fallback";
+  if (r.source === "unsupported-runtime") return "pdf-unsupported-runtime";
+  return "pdf-opendataloader";
+}
+function mergeWarnings(base, ...more) {
+  const out = [...base];
+  for (const m of more) {
+    if (m) for (const w of m) out.push(w);
+  }
+  return out;
+}
+/**
+ * Single entry point that turns several input shapes into Markdown.
+ *
+ * Accepted inputs, checked in this order:
+ *  1. a value accepted by `isStructuredDocumentResult` - rendered directly;
+ *  2. `{ path, filename?, mimeType? }` - the file is read from disk (Node.js only);
+ *  3. `{ data, filename?, mimeType? }` - in-memory bytes.
+ * Binary input is classified with `detectBinaryFormat` and routed to the DOCX
+ * or PDF converter. Whenever a route is unavailable and
+ * `options.structuredFallback` is set, that document is rendered instead.
+ *
+ * @param input - Structured document, path input, or data input.
+ * @param options - Structured-Markdown options plus the routing keys
+ *   `structuredFallback`, `docx` and `pdf`.
+ * @returns `{ markdown, warnings, strategy }`; `strategy` names the route taken.
+ */
+async function extractMarkdown(input, options) {
+  const smOpts = pickStructuredMarkdownOptions(options);
+  const fb = options?.structuredFallback;
+  // Route 1: already-structured input is rendered without any binary handling.
+  if (isStructuredDocumentResult(input)) {
+    const markdown = convertStructuredToMarkdown(input, smOpts);
+    return {
+      markdown,
+      warnings: mergeWarnings(
+        [],
+        input.warnings
+      ),
+      strategy: "structured"
+    };
+  }
+  let data;
+  let filename;
+  let mimeType;
+  const warnings = [];
+  // Route 2: path input. It is checked before `data`, so an input carrying
+  // both members is treated as a path input.
+  if (isExtractMarkdownPathInput(input)) {
+    if (!isNodeRuntime()) {
+      warnings.push(
+        "@dragon708/docmind-markdown: `path` input requires Node.js to read the file. Provide `data` bytes or a StructuredDocumentResult instead."
+      );
+      if (fb) {
+        return {
+          markdown: convertStructuredToMarkdown(fb, smOpts),
+          warnings: mergeWarnings(warnings, fb.warnings),
+          strategy: "path-requires-node"
+        };
+      }
+      return { markdown: "", warnings, strategy: "path-requires-node" };
+    }
+    const { readFile } = await importEsm(
+      "node:fs/promises"
+    );
+    const { basename } = await importEsm("node:path");
+    data = await readFile(input.path);
+    // An explicit filename wins; otherwise the path's base name is the format hint.
+    filename = input.filename ?? basename(input.path);
+    mimeType = input.mimeType;
+  } else if (isExtractMarkdownFileInput(input)) {
+    // Route 3: in-memory bytes.
+    data = input.data;
+    filename = input.filename;
+    mimeType = input.mimeType;
+  } else {
+    // None of the supported input shapes matched.
+    warnings.push(
+      "@dragon708/docmind-markdown: extractMarkdown input must be a StructuredDocumentResult, { data, \u2026 }, or { path, \u2026 }."
+    );
+    if (fb) {
+      return {
+        markdown: convertStructuredToMarkdown(fb, smOpts),
+        warnings: mergeWarnings(warnings, fb.warnings),
+        strategy: "binary-unidentified-structured-fallback"
+      };
+    }
+    return { markdown: "", warnings, strategy: "binary-unidentified" };
+  }
+  const fmt = detectBinaryFormat(data, filename, mimeType);
+  if (fmt === "docx") {
+    if (!isNodeRuntime()) {
+      warnings.push(
+        "@dragon708/docmind-markdown: DOCX binary conversion needs Node.js (Mammoth/Turndown). Use structured input or run on the server."
+      );
+      if (fb) {
+        return {
+          markdown: convertStructuredToMarkdown(fb, smOpts),
+          warnings: mergeWarnings(warnings, fb.warnings),
+          strategy: "docx-requires-node"
+        };
+      }
+      return { markdown: "", warnings, strategy: "docx-requires-node" };
+    }
+    const r = await convertDocxToMarkdown(data, buildDocxOptions(options));
+    // DOCX messages are objects; only their `message` text is surfaced as warnings.
+    const w = mergeWarnings(
+      warnings,
+      r.messages.map((m) => m.message)
+    );
+    return {
+      markdown: r.markdown,
+      warnings: w,
+      strategy: docxStrategyFromSource(r.source)
+    };
+  }
+  if (fmt === "pdf") {
+    // No runtime check here: convertPdfToMarkdown reports an unsupported
+    // runtime through its result, which is handled just below.
+    const r = await convertPdfToMarkdown(data, buildPdfOptions(options));
+    const strategy = pdfStrategyFromResult(r);
+    const w = mergeWarnings(warnings, r.warnings);
+    if (strategy === "pdf-unsupported-runtime" && r.markdown === "" && fb) {
+      return {
+        markdown: convertStructuredToMarkdown(fb, smOpts),
+        warnings: mergeWarnings(w, fb.warnings, [
+          "extractMarkdown: PDF route unavailable in this runtime; used structuredFallback."
+        ]),
+        strategy: "pdf-structured-fallback"
+      };
+    }
+    return { markdown: r.markdown, warnings: w, strategy };
+  }
+  // Format could not be identified from hints or leading bytes.
+  warnings.push(
+    "@dragon708/docmind-markdown: Unidentified binary format (expected PDF magic or ZIP/DOCX). Using structured fallback if provided."
+  );
+  if (fb) {
+    return {
+      markdown: convertStructuredToMarkdown(fb, smOpts),
+      warnings: mergeWarnings(warnings, fb.warnings),
+      strategy: "binary-unidentified-structured-fallback"
+    };
+  }
+  return { markdown: "", warnings, strategy: "binary-unidentified" };
+}
+export { convertDocxBufferToMarkdown, convertDocxToMarkdown, convertPdfBufferToMarkdown, convertPdfPathToMarkdown, convertPdfToMarkdown, convertStructuredToLlmText, convertStructuredToMarkdown, detectBinaryFormat, extractLlmContent, extractMarkdown, extractStructuredChunks, isExtractMarkdownFileInput, renderLlmText, renderMarkdown, renderMarkdownSections, splitStructuredIntoChunks, structuredDocumentToLlmText, structuredDocumentToMarkdown };
 //# sourceMappingURL=index.js.map
 //# sourceMappingURL=index.js.map