npm - @dragon708/docmind-node - Versions diffs - 1.12.3 → 1.13.1 - Mend

@dragon708/docmind-node 1.12.3 → 1.13.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3)

package/dist/index.d.ts CHANGED

@@ -6,7 +6,7 @@ import { AnalyzeDocxIncludeFlags, DocxToHtmlOptions, ExtractStructuredDataFromDo
 export { AnalyzeDocxIncludeFlags, DocxEmbeddedImage, DocxEmbeddedImageConversionOptions, DocxEmbeddedImageConversionResult, DocxEmbeddedImageWebSlice, DocxImageExtractionMode, ExtractImagesFromDocxOptions, ExtractImagesFromDocxResult, ExtractStructuredDataFromDocxOptions, convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, extractStructuredDataFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
 import { PdfAnalyzeOptions, ExtractStructuredDataFromPdfOptions } from '@dragon708/docmind-pdf';
 export { ExtractStructuredDataFromPdfIncludeFlags, ExtractStructuredDataFromPdfOptions, extractStructuredDataFromPdf } from '@dragon708/docmind-pdf';
-import { RenderLlmTextOptions, RenderMarkdownOptions, ConvertDocxToMarkdownOptions, ConvertPdfToMarkdownOptions, RenderMarkdownSectionsOptions, MarkdownSection } from '@dragon708/docmind-markdown';
+import { RenderLlmTextOptions, RenderMarkdownOptions, ConvertDocxToMarkdownOptions, ConvertPdfToMarkdownOptions, ExtractMarkdownStrategy, RenderMarkdownSectionsOptions, MarkdownSection } from '@dragon708/docmind-markdown';
 export { MarkdownSection } from '@dragon708/docmind-markdown';
 /**
@@ -55,14 +55,22 @@ interface NodeExtractStructuredDataOptions extends DocMindAnalyzeOptions {
 /**
  * {@link extractMarkdown}: inherits {@link NodeExtractStructuredDataOptions}; `markdown` maps to structured-serializer
  * options passed through `extractMarkdown` in `@dragon708/docmind-markdown`. `markdownDocx` / `markdownPdf`
- * configure Mammoth→Turndown and `@opendataloader/pdf` respectively — separate from `docx` / `pdf` used only by {@link extractStructuredData}.
+ * configure Mammoth→Turndown and `@cognipeer/to-markdown` (`convertPdfToMarkdown`) respectively — separate from `docx` / `pdf` used only by {@link extractStructuredData}.
  */
 interface NodeExtractMarkdownOptions extends NodeExtractStructuredDataOptions {
     readonly markdown?: RenderMarkdownOptions;
     /** Forwarded to `extractMarkdown` → `convertDocxToMarkdown` when bytes are DOCX (Node). */
     readonly markdownDocx?: ConvertDocxToMarkdownOptions;
-    /** Forwarded to `extractMarkdown` → `convertPdfToMarkdown` when bytes are PDF. */
+    /** Forwarded to `extractMarkdown` → `convertPdfToMarkdown` when bytes are PDF (Node; `@cognipeer/to-markdown`). */
     readonly markdownPdf?: ConvertPdfToMarkdownOptions;
+    /**
+     * Optional: receive `strategy` and merged `warnings` from `@dragon708/docmind-markdown` `extractMarkdown`
+     * (e.g. `pdf-cognipeer-specialized` vs `pdf-structured-fallback`, `docx-mammoth` vs `docx-structured-fallback`) without changing the `Promise<string>` return type.
+     */
+    readonly onMarkdownExtract?: (info: {
+        strategy: ExtractMarkdownStrategy;
+        warnings: readonly string[];
+    }) => void;
 }
 /**
  * {@link extractLlmContent}: same structured fields; `llm` forwards to `renderLlmText`.
@@ -133,8 +141,14 @@ declare function extractStructuredData(input: NodeAnalyzeInput, options?: NodeEx
 /**
  * End-to-end: {@link extractStructuredData} (for fallback + option parity) plus
  * `extractMarkdown` from `@dragon708/docmind-markdown` on `{ data, filename, mimeType }`.
- * On Node, PDF/DOCX bytes use specialized routes (`@opendataloader/pdf`, Mammoth+Turndown) when detection matches;
- * the structured envelope is always passed as `structuredFallback`.
+ *
+ * On Node, **DOCX** bytes use **Mammoth → Turndown**; **PDF** bytes use **`convertPdfToMarkdown`**
+ * (`@cognipeer/to-markdown`, Node-first, no JVM). The structured envelope is always passed as `structuredFallback`
+ * so specialized routes can fall back to structured → Markdown serialization with clear warnings when needed.
+ *
+ * {@link NodeExtractMarkdownOptions.onMarkdownExtract} receives `strategy` and merged `warnings` from the markdown
+ * package (for example `pdf-cognipeer-specialized`, `pdf-structured-fallback`, `docx-structured-fallback`, and
+ * `[docmind-markdown:extractMarkdown]` trace lines).
  *
  * @param input - Path, buffer, or {@link NodeAnalyzeInput} accepted by the Node facade.
  * @param options - Structured routing (`pdf` / `docx` / `ocr` / `normalize`), optional `markdown` serializer knobs,
@@ -143,7 +157,8 @@ declare function extractStructuredData(input: NodeAnalyzeInput, options?: NodeEx
 declare function extractMarkdown(input: NodeAnalyzeInput, options?: NodeExtractMarkdownOptions): Promise<string>;
 /**
  * {@link extractStructuredData} then `renderLlmText` in `@dragon708/docmind-markdown` (tagged plain text for LLMs).
- * That package's `extractLlmContent` is the same transform on an in-memory structured result only.
+ * This path **does not** run the binary PDF/DOCX Markdown pipelines (no `@cognipeer/to-markdown` / Mammoth→Turndown here);
+ * it linearizes the structured envelope only. Use {@link extractMarkdown} when you need specialized PDF or DOCX Markdown layout.
  *
  * @param options - Structured routing plus optional `llm` passed through to `renderLlmText`.
  */

package/dist/index.js CHANGED

@@ -620,7 +620,13 @@ async function runOcr(input, options) {
 }
 async function extractMarkdown(input, options) {
   throwIfAborted(options?.signal);
-  const { markdown: markdownOpts, markdownDocx, markdownPdf, ...structuredOpts } = options ?? {};
+  const {
+    markdown: markdownOpts,
+    markdownDocx,
+    markdownPdf,
+    onMarkdownExtract,
+    ...structuredOpts
+  } = options ?? {};
   const resolved = await resolveNodeAnalyzeInput(input);
   const structured = await extractStructuredData(resolved, structuredOpts);
   const data = await bytesFromDetectInput(resolved);
@@ -639,6 +645,7 @@ async function extractMarkdown(input, options) {
       structuredFallback: structured
     }
   );
+  onMarkdownExtract?.({ strategy: r.strategy, warnings: r.warnings });
   return r.markdown;
 }
 async function extractLlmContent(input, options) {
@@ -717,7 +724,7 @@ function buildNodeCapabilityReport(kind) {
           "extractStructuredData: analyzePdf + PDF.js per-page text, outline, links, annotations, merged via normalizeToStructuredResult; respects pdf.ocr like analyzeFile."
         ]),
         slot("markdown", true, [
-          "extractMarkdown: `@dragon708/docmind-markdown` hybrid \u2014 PDF bytes prefer `@opendataloader/pdf` \u2192 Markdown; structured PDF (`extractStructuredData`, respects pdf.ocr) is always built as fallback and for non-binary inputs."
+          "extractMarkdown: `@dragon708/docmind-markdown` hybrid \u2014 PDF bytes use `@cognipeer/to-markdown` (Node-first, no Java); structured PDF (`extractStructuredData`, respects pdf.ocr) is always built as fallback."
         ]),
         slot("llm-text", true, [
           "extractLlmContent: structured envelope \u2192 LLM-oriented plain text (`renderLlmText` in `@dragon708/docmind-markdown`)."
@@ -1040,7 +1047,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude, ocrSlic
       if (kind === "pdf") {
         nativeExtraction = {
           willAttempt: true,
-          description: intent === "extractStructuredData" ? "extractStructuredData: analyzePdf plus PDF.js page rows, outline, links, and annotations, normalized to StructuredDocumentResult." : intent === "extractMarkdown" ? "extractMarkdown: structured PDF extract (same as extractStructuredData) for fallback; primary Markdown from `@dragon708/docmind-markdown` tries `@opendataloader/pdf` on PDF bytes when possible." : `${intent}: same structured PDF pipeline as extractStructuredData, then \`@dragon708/docmind-markdown\` (renderLlmText or chunk sections).`
+          description: intent === "extractStructuredData" ? "extractStructuredData: analyzePdf plus PDF.js page rows, outline, links, and annotations, normalized to StructuredDocumentResult." : intent === "extractMarkdown" ? "extractMarkdown: structured PDF extract (same as extractStructuredData) for fallback; primary Markdown on PDF bytes uses `@cognipeer/to-markdown` via `@dragon708/docmind-markdown` (Node-first, no Java)." : `${intent}: same structured PDF pipeline as extractStructuredData, then \`@dragon708/docmind-markdown\` (renderLlmText or chunk sections).`
         };
         ocr = {
           mayUse: pdfOcr !== "off",

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@dragon708/docmind-node",
-  "version": "1.12.3",
+  "version": "1.13.1",
   "description": "Official DocMind Node facade: analyzeFile, intent APIs, PDF/DOCX/OCR, and fs helpers.",
   "type": "module",
   "main": "./dist/index.js",
@@ -33,7 +33,7 @@
   "license": "MIT",
   "dependencies": {
     "@dragon708/docmind-docx": "^1.8.0",
-    "@dragon708/docmind-markdown": "^1.1.3",
+    "@dragon708/docmind-markdown": "^1.2.1",
     "@dragon708/docmind-ocr": "^1.1.4",
     "@dragon708/docmind-pdf": "^2.2.0",
     "@dragon708/docmind-shared": "^1.2.0"