npm - @dragon708/docmind-node - Versions diffs - 1.11.0 → 1.12.1 - Mend

@dragon708/docmind-node 1.11.0 → 1.12.1

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (3)

package/dist/index.d.ts CHANGED Viewed

@@ -6,7 +6,7 @@ import { AnalyzeDocxIncludeFlags, DocxToHtmlOptions, ExtractStructuredDataFromDo
 export { AnalyzeDocxIncludeFlags, DocxEmbeddedImage, DocxEmbeddedImageConversionOptions, DocxEmbeddedImageConversionResult, DocxEmbeddedImageWebSlice, DocxImageExtractionMode, ExtractImagesFromDocxOptions, ExtractImagesFromDocxResult, ExtractStructuredDataFromDocxOptions, convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, extractStructuredDataFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
 import { PdfAnalyzeOptions, ExtractStructuredDataFromPdfOptions } from '@dragon708/docmind-pdf';
 export { ExtractStructuredDataFromPdfIncludeFlags, ExtractStructuredDataFromPdfOptions, extractStructuredDataFromPdf } from '@dragon708/docmind-pdf';
-import { RenderLlmTextOptions, RenderMarkdownOptions, RenderMarkdownSectionsOptions, MarkdownSection } from '@dragon708/docmind-markdown';
+import { RenderLlmTextOptions, RenderMarkdownOptions, ConvertDocxToMarkdownOptions, ConvertPdfToMarkdownOptions, RenderMarkdownSectionsOptions, MarkdownSection } from '@dragon708/docmind-markdown';
 export { MarkdownSection } from '@dragon708/docmind-markdown';
 /**
@@ -53,10 +53,16 @@ interface NodeExtractStructuredDataOptions extends DocMindAnalyzeOptions {
     readonly normalize?: NormalizeStructuredOptions;
 }
 /**
- * {@link extractMarkdown}: inherits {@link NodeExtractStructuredDataOptions}; set `markdown` for `renderMarkdown` knobs (`@dragon708/docmind-markdown`).
+ * {@link extractMarkdown}: inherits {@link NodeExtractStructuredDataOptions}; `markdown` maps to structured-serializer
+ * options passed through `extractMarkdown` in `@dragon708/docmind-markdown`. `markdownDocx` / `markdownPdf`
+ * configure Mammoth→Turndown and `@opendataloader/pdf` respectively — separate from `docx` / `pdf` used only by {@link extractStructuredData}.
  */
 interface NodeExtractMarkdownOptions extends NodeExtractStructuredDataOptions {
     readonly markdown?: RenderMarkdownOptions;
+    /** Forwarded to `extractMarkdown` → `convertDocxToMarkdown` when bytes are DOCX (Node). */
+    readonly markdownDocx?: ConvertDocxToMarkdownOptions;
+    /** Forwarded to `extractMarkdown` → `convertPdfToMarkdown` when bytes are PDF. */
+    readonly markdownPdf?: ConvertPdfToMarkdownOptions;
 }
 /**
  * {@link extractLlmContent}: same structured fields; `llm` forwards to `renderLlmText`.
@@ -125,20 +131,26 @@ declare function runOcr(input: NodeAnalyzeInput, options?: NodeAnalyzeOptions):
 declare function extractStructuredData(input: NodeAnalyzeInput, options?: NodeExtractStructuredDataOptions): Promise<StructuredDocumentResult>;
 /**
- * End-to-end: resolve bytes → {@link extractStructuredData} → `renderMarkdown` (`@dragon708/docmind-markdown`).
+ * End-to-end: {@link extractStructuredData} (for fallback + option parity) plus
+ * `extractMarkdown` from `@dragon708/docmind-markdown` on `{ data, filename, mimeType }`.
+ * On Node, PDF/DOCX bytes use specialized routes (`@opendataloader/pdf`, Mammoth+Turndown) when detection matches;
+ * the structured envelope is always passed as `structuredFallback`.
  *
  * @param input - Path, buffer, or {@link NodeAnalyzeInput} accepted by the Node facade.
- * @param options - Structured routing (`pdf` / `docx` / `ocr` / `normalize`) plus optional `markdown` render options.
+ * @param options - Structured routing (`pdf` / `docx` / `ocr` / `normalize`), optional `markdown` serializer knobs,
+ *   and optional `markdownDocx` / `markdownPdf` for the binary Markdown pipelines (distinct from structured-only `docx` / `pdf`).
  */
 declare function extractMarkdown(input: NodeAnalyzeInput, options?: NodeExtractMarkdownOptions): Promise<string>;
 /**
- * Same routing as {@link extractMarkdown}, then `renderLlmText` for prompts / RAG.
+ * {@link extractStructuredData} then `renderLlmText` in `@dragon708/docmind-markdown` (tagged plain text for LLMs).
+ * That package's `extractLlmContent` is the same transform on an in-memory structured result only.
  *
- * @param options - Optional `llm` slice forwarded to `@dragon708/docmind-markdown`.
+ * @param options - Structured routing plus optional `llm` passed through to `renderLlmText`.
  */
 declare function extractLlmContent(input: NodeAnalyzeInput, options?: NodeExtractLlmContentOptions): Promise<string>;
 /**
- * Structured extract → `renderMarkdownSections` (Markdown + optional parallel `text` per slice).
+ * Structured extract → {@link renderMarkdownSections} (`splitStructuredIntoChunks` with `includeMarkdown: true`;
+ * same layer as `extractStructuredChunks` / `splitStructuredIntoChunks` in `@dragon708/docmind-markdown`).
  *
  * @param options - Optional `chunks` (e.g. `maxChars`, `preferHeadings`) from `@dragon708/docmind-markdown`.
  */
@@ -146,7 +158,7 @@ declare function extractStructuredChunks(input: NodeAnalyzeInput, options?: Node
 /** High-level features the user can ask DocMind for (per input kind and runtime). */
 type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages" | "structured-output"
-/** Node: {@link extractMarkdown} after {@link extractStructuredData} via `@dragon708/docmind-markdown`. */
+/** Node: {@link extractMarkdown} — hybrid `extractMarkdown` in `@dragon708/docmind-markdown` (binary PDF/DOCX routes + structured fallback). */
  | "markdown"
 /** Node: {@link extractLlmContent} (LLM-oriented plain text). */
  | "llm-text"
@@ -248,7 +260,7 @@ type NodeExplainAnalysisPlanOptions = Omit<ExplainAnalysisPlanOptions, "intent">
 /**
  * Epic 1 — **Capabilities:** after resolving {@link NodeAnalyzeInput}, lists which of
- * `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output` | `markdown` | `llm-text` | `structured-chunks`
+ * `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output` | `markdown` (hybrid package extract on Node) | `llm-text` | `structured-chunks` (split + Markdown sections)
  * (and image-specific ids such as `ocr-multipage`, `image-normalization`, `tiff`, `heic-node-only`, `bmp`, `gif-first-frame`) apply for that kind in Node (for PDF, `text` / `metadata` /
  * `pages` / `ocr` describe the v2 pdf-parse + PDF.js + OCR stack; see {@link buildNodeCapabilityReport}).
  * For **DOCX**, `docxEmbeddedImages` and `docxStructure` describe ZIP media and optional OOXML v2 extractors (`options.docx.include`).

package/dist/index.js CHANGED Viewed

@@ -1,4 +1,4 @@
-import { assertValidAnalyzeFileInput, detectFileKind, normalizeToStructuredResult, UNKNOWN_FORMAT_WARNING, analyzeText, notImplementedResult, analyzeFileRequestsStructured, toUint8Array, isNamedInput, isBinaryInput, isBlob, isFile, getMimeType } from '@dragon708/docmind-shared';
+import { assertValidAnalyzeFileInput, detectFileKind, normalizeToStructuredResult, UNKNOWN_FORMAT_WARNING, analyzeText, notImplementedResult, analyzeFileRequestsStructured, isNamedInput, toUint8Array, isBinaryInput, isBlob, isFile, getMimeType } from '@dragon708/docmind-shared';
 export { analyzeFileRequestsStructured, detectFileKind, isStructuredDocumentResult } from '@dragon708/docmind-shared';
 import { extractStructuredDataFromDocx, analyzeDocx } from '@dragon708/docmind-docx';
 export { convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, extractStructuredDataFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
@@ -9,7 +9,7 @@ export { extractStructuredDataFromPdf } from '@dragon708/docmind-pdf';
 import { readFile } from 'fs/promises';
 import { basename } from 'path';
 import { fileURLToPath } from 'url';
-import { renderMarkdown, renderLlmText, renderMarkdownSections } from '@dragon708/docmind-markdown';
+import { extractMarkdown as extractMarkdown$1, renderLlmText, renderMarkdownSections } from '@dragon708/docmind-markdown';
 // src/analyze.ts
@@ -620,9 +620,26 @@ async function runOcr(input, options) {
 }
 async function extractMarkdown(input, options) {
   throwIfAborted(options?.signal);
-  const { markdown: markdownOpts, ...structuredOpts } = options ?? {};
-  const structured = await extractStructuredData(input, structuredOpts);
-  return renderMarkdown(structured, markdownOpts);
+  const { markdown: markdownOpts, markdownDocx, markdownPdf, ...structuredOpts } = options ?? {};
+  const resolved = await resolveNodeAnalyzeInput(input);
+  const structured = await extractStructuredData(resolved, structuredOpts);
+  const data = await bytesFromDetectInput(resolved);
+  let filename;
+  let mimeType;
+  if (isNamedInput(resolved)) {
+    filename = resolved.name;
+    mimeType = resolved.mimeType;
+  }
+  const r = await extractMarkdown$1(
+    { data, filename, mimeType },
+    {
+      ...markdownOpts ?? {},
+      ...markdownDocx !== void 0 ? { docx: markdownDocx } : {},
+      ...markdownPdf !== void 0 ? { pdf: markdownPdf } : {},
+      structuredFallback: structured
+    }
+  );
+  return r.markdown;
 }
 async function extractLlmContent(input, options) {
   throwIfAborted(options?.signal);
@@ -700,13 +717,13 @@ function buildNodeCapabilityReport(kind) {
           "extractStructuredData: analyzePdf + PDF.js per-page text, outline, links, annotations, merged via normalizeToStructuredResult; respects pdf.ocr like analyzeFile."
         ]),
         slot("markdown", true, [
-          "extractMarkdown: same structured pipeline, then `@dragon708/docmind-markdown` renderMarkdown (GFM-style)."
+          "extractMarkdown: `@dragon708/docmind-markdown` hybrid \u2014 PDF bytes prefer `@opendataloader/pdf` \u2192 Markdown; structured PDF (`extractStructuredData`, respects pdf.ocr) is always built as fallback and for non-binary inputs."
         ]),
         slot("llm-text", true, [
-          "extractLlmContent: structured \u2192 compact plain text (renderLlmText) for prompts / embeddings."
+          "extractLlmContent: structured envelope \u2192 LLM-oriented plain text (`renderLlmText` in `@dragon708/docmind-markdown`)."
         ]),
         slot("structured-chunks", true, [
-          "extractStructuredChunks: structured \u2192 Markdown sections via split/render (heading-aware chunking)."
+          "extractStructuredChunks: structured \u2192 `renderMarkdownSections` (splitStructuredIntoChunks + Markdown per slice; heading-aware chunking)."
         ])
       ];
       break;
@@ -729,7 +746,7 @@ function buildNodeCapabilityReport(kind) {
           "extractStructuredData runs analyzeDocx with merged OOXML includes (blocks, tables, headings, embedded images, etc.) into StructuredDocumentResult."
         ]),
         slot("markdown", true, [
-          "extractMarkdown: structured DOCX envelope \u2192 Markdown (`@dragon708/docmind-markdown`)."
+          "extractMarkdown: hybrid \u2014 DOCX bytes use Mammoth\u2192Turndown on Node; structured DOCX (`extractStructuredData`, options.docx.include) is always built as fallback."
         ]),
         slot("llm-text", true, ["extractLlmContent: structured \u2192 LLM-oriented plain text."]),
         slot("structured-chunks", true, [
@@ -1023,7 +1040,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude, ocrSlic
       if (kind === "pdf") {
         nativeExtraction = {
           willAttempt: true,
-          description: intent === "extractStructuredData" ? "extractStructuredData: analyzePdf plus PDF.js page rows, outline, links, and annotations, normalized to StructuredDocumentResult." : `${intent}: same structured PDF pipeline as extractStructuredData, then \`@dragon708/docmind-markdown\`.`
+          description: intent === "extractStructuredData" ? "extractStructuredData: analyzePdf plus PDF.js page rows, outline, links, and annotations, normalized to StructuredDocumentResult." : intent === "extractMarkdown" ? "extractMarkdown: structured PDF extract (same as extractStructuredData) for fallback; primary Markdown from `@dragon708/docmind-markdown` tries `@opendataloader/pdf` on PDF bytes when possible." : `${intent}: same structured PDF pipeline as extractStructuredData, then \`@dragon708/docmind-markdown\` (renderLlmText or chunk sections).`
         };
         ocr = {
           mayUse: pdfOcr !== "off",
@@ -1032,7 +1049,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude, ocrSlic
       } else if (kind === "docx") {
         nativeExtraction = {
           willAttempt: true,
-          description: intent === "extractStructuredData" ? "extractStructuredData: Mammoth plus merged OOXML extractors (structure, headings, tables, blocks, pagesApprox, embeddedImages) in one envelope." : `${intent}: same structured DOCX pipeline as extractStructuredData, then \`@dragon708/docmind-markdown\`.`
+          description: intent === "extractStructuredData" ? "extractStructuredData: Mammoth plus merged OOXML extractors (structure, headings, tables, blocks, pagesApprox, embeddedImages) in one envelope." : intent === "extractMarkdown" ? "extractMarkdown: structured DOCX envelope for fallback; primary Markdown from Mammoth\u2192Turndown on DOCX bytes when possible (`@dragon708/docmind-markdown`)." : `${intent}: same structured DOCX pipeline as extractStructuredData, then \`@dragon708/docmind-markdown\`.`
         };
         ocr = { mayUse: false, description: "DOCX does not use OCR." };
       } else if (kind === "image") {
@@ -1055,7 +1072,9 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude, ocrSlic
       if (intent === "extractMarkdown") {
         limitations = [
           ...limitations,
-          ...lim("Output: Markdown string via renderMarkdown (tables, headings, lists).")
+          ...lim(
+            "Output: Markdown string from `@dragon708/docmind-markdown` extractMarkdown (PDF/DOCX binary routes on Node when applicable; structured serializer as fallback)."
+          )
         ];
       } else if (intent === "extractLlmContent") {
         limitations = [
@@ -1066,7 +1085,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude, ocrSlic
         limitations = [
           ...limitations,
           ...lim(
-            "Output: MarkdownSection[] via renderMarkdownSections (splitStructuredIntoChunks under the hood)."
+            "Output: MarkdownSection[] via renderMarkdownSections (splitStructuredIntoChunks / extractStructuredChunks alias in `@dragon708/docmind-markdown`)."
           )
         ];
       }
@@ -1337,7 +1356,27 @@ function planForIntent(intentOpt, kind, pdfOcrForAnalyze, docxInclude, ocr, anal
         };
     }
   }
-  if (intent === "extractMarkdown" || intent === "extractLlmContent" || intent === "extractStructuredChunks") {
+  if (intent === "extractMarkdown") {
+    const sub = planForIntent(
+      "extractStructuredData",
+      kind,
+      pdfOcrForAnalyze,
+      docxInclude,
+      ocr,
+      analyzeFileOutput
+    );
+    return {
+      intent,
+      steps: [
+        ...sub.steps ?? [],
+        {
+          id: "markdown_hybrid_package",
+          status: "planned"
+        }
+      ]
+    };
+  }
+  if (intent === "extractLlmContent" || intent === "extractStructuredChunks") {
     const sub = planForIntent(
       "extractStructuredData",
       kind,
@@ -1350,7 +1389,10 @@ function planForIntent(intentOpt, kind, pdfOcrForAnalyze, docxInclude, ocr, anal
       intent,
       steps: [
         ...sub.steps ?? [],
-        { id: "docmind_markdown_render", status: "planned" }
+        {
+          id: intent === "extractLlmContent" ? "docmind_markdown_llm_text" : "docmind_markdown_chunk_sections",
+          status: "planned"
+        }
       ]
     };
   }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@dragon708/docmind-node",
-  "version": "1.11.0",
+  "version": "1.12.1",
   "description": "Official DocMind Node facade: analyzeFile, intent APIs, PDF/DOCX/OCR, and fs helpers.",
   "type": "module",
   "main": "./dist/index.js",
@@ -33,7 +33,7 @@
   "license": "MIT",
   "dependencies": {
     "@dragon708/docmind-docx": "^1.8.0",
-    "@dragon708/docmind-markdown": "^1.0.0",
+    "@dragon708/docmind-markdown": "^1.1.1",
     "@dragon708/docmind-ocr": "^1.1.4",
     "@dragon708/docmind-pdf": "^2.2.0",
     "@dragon708/docmind-shared": "^1.2.0"