@dragon708/docmind-node 1.12.3 → 1.13.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -6,7 +6,7 @@ import { AnalyzeDocxIncludeFlags, DocxToHtmlOptions, ExtractStructuredDataFromDo
6
6
  export { AnalyzeDocxIncludeFlags, DocxEmbeddedImage, DocxEmbeddedImageConversionOptions, DocxEmbeddedImageConversionResult, DocxEmbeddedImageWebSlice, DocxImageExtractionMode, ExtractImagesFromDocxOptions, ExtractImagesFromDocxResult, ExtractStructuredDataFromDocxOptions, convertDocxEmbeddedImageToWeb, convertDocxImagesForWeb, docxImageIsBrowserRenderable, docxImageRequiresConversion, docxImageSuggestedTargetFormat, docxImageToDataUri, extractImagesFromDocx, extractStructuredDataFromDocx, isNodeJsRuntime } from '@dragon708/docmind-docx';
7
7
  import { PdfAnalyzeOptions, ExtractStructuredDataFromPdfOptions } from '@dragon708/docmind-pdf';
8
8
  export { ExtractStructuredDataFromPdfIncludeFlags, ExtractStructuredDataFromPdfOptions, extractStructuredDataFromPdf } from '@dragon708/docmind-pdf';
9
- import { RenderLlmTextOptions, RenderMarkdownOptions, ConvertDocxToMarkdownOptions, ConvertPdfToMarkdownOptions, RenderMarkdownSectionsOptions, MarkdownSection } from '@dragon708/docmind-markdown';
9
+ import { RenderLlmTextOptions, RenderMarkdownOptions, ConvertDocxToMarkdownOptions, ConvertPdfToMarkdownOptions, ExtractMarkdownStrategy, RenderMarkdownSectionsOptions, MarkdownSection } from '@dragon708/docmind-markdown';
10
10
  export { MarkdownSection } from '@dragon708/docmind-markdown';
11
11
 
12
12
  /**
@@ -55,14 +55,22 @@ interface NodeExtractStructuredDataOptions extends DocMindAnalyzeOptions {
55
55
  /**
56
56
  * {@link extractMarkdown}: inherits {@link NodeExtractStructuredDataOptions}; `markdown` maps to structured-serializer
57
57
  * options passed through `extractMarkdown` in `@dragon708/docmind-markdown`. `markdownDocx` / `markdownPdf`
58
- * configure Mammoth→Turndown and `@opendataloader/pdf` respectively — separate from `docx` / `pdf` used only by {@link extractStructuredData}.
58
+ * configure Mammoth→Turndown and `@cognipeer/to-markdown` (`convertPdfToMarkdown`) respectively — separate from `docx` / `pdf` used only by {@link extractStructuredData}.
59
59
  */
60
60
  interface NodeExtractMarkdownOptions extends NodeExtractStructuredDataOptions {
61
61
  readonly markdown?: RenderMarkdownOptions;
62
62
  /** Forwarded to `extractMarkdown` → `convertDocxToMarkdown` when bytes are DOCX (Node). */
63
63
  readonly markdownDocx?: ConvertDocxToMarkdownOptions;
64
- /** Forwarded to `extractMarkdown` → `convertPdfToMarkdown` when bytes are PDF. */
64
+ /** Forwarded to `extractMarkdown` → `convertPdfToMarkdown` when bytes are PDF (Node; `@cognipeer/to-markdown`). */
65
65
  readonly markdownPdf?: ConvertPdfToMarkdownOptions;
66
+ /**
67
+ * Optional: receive `strategy` and merged `warnings` from `@dragon708/docmind-markdown` `extractMarkdown`
68
+ * (e.g. `pdf-cognipeer-specialized` vs `pdf-structured-fallback`, `docx-mammoth` vs `docx-structured-fallback`) without changing the `Promise<string>` return type.
69
+ */
70
+ readonly onMarkdownExtract?: (info: {
71
+ strategy: ExtractMarkdownStrategy;
72
+ warnings: readonly string[];
73
+ }) => void;
66
74
  }
67
75
  /**
68
76
  * {@link extractLlmContent}: same structured fields; `llm` forwards to `renderLlmText`.
@@ -133,8 +141,14 @@ declare function extractStructuredData(input: NodeAnalyzeInput, options?: NodeEx
133
141
  /**
134
142
  * End-to-end: {@link extractStructuredData} (for fallback + option parity) plus
135
143
  * `extractMarkdown` from `@dragon708/docmind-markdown` on `{ data, filename, mimeType }`.
136
- * On Node, PDF/DOCX bytes use specialized routes (`@opendataloader/pdf`, Mammoth+Turndown) when detection matches;
137
- * the structured envelope is always passed as `structuredFallback`.
144
+ *
145
+ * On Node, **DOCX** bytes use **Mammoth→Turndown**; **PDF** bytes use **`convertPdfToMarkdown`**
146
+ * (`@cognipeer/to-markdown`, Node-first, no JVM). The structured envelope is always passed as `structuredFallback`
147
+ * so specialized routes can fall back to structured → Markdown serialization with clear warnings when needed.
148
+ *
149
+ * {@link NodeExtractMarkdownOptions.onMarkdownExtract} receives `strategy` and merged `warnings` from the markdown
150
+ * package (for example `pdf-cognipeer-specialized`, `pdf-structured-fallback`, `docx-structured-fallback`, and
151
+ * `[docmind-markdown:extractMarkdown]` trace lines).
138
152
  *
139
153
  * @param input - Path, buffer, or {@link NodeAnalyzeInput} accepted by the Node facade.
140
154
  * @param options - Structured routing (`pdf` / `docx` / `ocr` / `normalize`), optional `markdown` serializer knobs,
@@ -143,7 +157,8 @@ declare function extractStructuredData(input: NodeAnalyzeInput, options?: NodeEx
143
157
  declare function extractMarkdown(input: NodeAnalyzeInput, options?: NodeExtractMarkdownOptions): Promise<string>;
144
158
  /**
145
159
  * {@link extractStructuredData} then `renderLlmText` in `@dragon708/docmind-markdown` (tagged plain text for LLMs).
146
- * That package's `extractLlmContent` is the same transform on an in-memory structured result only.
160
+ * This path **does not** run the binary PDF/DOCX Markdown pipelines (no `@cognipeer/to-markdown` / Mammoth→Turndown here);
161
+ * it linearizes the structured envelope only. Use {@link extractMarkdown} when you need specialized PDF or DOCX Markdown layout.
147
162
  *
148
163
  * @param options - Structured routing plus optional `llm` passed through to `renderLlmText`.
149
164
  */
package/dist/index.js CHANGED
@@ -620,7 +620,13 @@ async function runOcr(input, options) {
620
620
  }
621
621
  async function extractMarkdown(input, options) {
622
622
  throwIfAborted(options?.signal);
623
- const { markdown: markdownOpts, markdownDocx, markdownPdf, ...structuredOpts } = options ?? {};
623
+ const {
624
+ markdown: markdownOpts,
625
+ markdownDocx,
626
+ markdownPdf,
627
+ onMarkdownExtract,
628
+ ...structuredOpts
629
+ } = options ?? {};
624
630
  const resolved = await resolveNodeAnalyzeInput(input);
625
631
  const structured = await extractStructuredData(resolved, structuredOpts);
626
632
  const data = await bytesFromDetectInput(resolved);
@@ -639,6 +645,7 @@ async function extractMarkdown(input, options) {
639
645
  structuredFallback: structured
640
646
  }
641
647
  );
648
+ onMarkdownExtract?.({ strategy: r.strategy, warnings: r.warnings });
642
649
  return r.markdown;
643
650
  }
644
651
  async function extractLlmContent(input, options) {
@@ -717,7 +724,7 @@ function buildNodeCapabilityReport(kind) {
717
724
  "extractStructuredData: analyzePdf + PDF.js per-page text, outline, links, annotations, merged via normalizeToStructuredResult; respects pdf.ocr like analyzeFile."
718
725
  ]),
719
726
  slot("markdown", true, [
720
- "extractMarkdown: `@dragon708/docmind-markdown` hybrid \u2014 PDF bytes prefer `@opendataloader/pdf` \u2192 Markdown; structured PDF (`extractStructuredData`, respects pdf.ocr) is always built as fallback and for non-binary inputs."
727
+ "extractMarkdown: `@dragon708/docmind-markdown` hybrid \u2014 PDF bytes use `@cognipeer/to-markdown` (Node-first, no Java); structured PDF (`extractStructuredData`, respects pdf.ocr) is always built as fallback."
721
728
  ]),
722
729
  slot("llm-text", true, [
723
730
  "extractLlmContent: structured envelope \u2192 LLM-oriented plain text (`renderLlmText` in `@dragon708/docmind-markdown`)."
@@ -1040,7 +1047,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude, ocrSlic
1040
1047
  if (kind === "pdf") {
1041
1048
  nativeExtraction = {
1042
1049
  willAttempt: true,
1043
- description: intent === "extractStructuredData" ? "extractStructuredData: analyzePdf plus PDF.js page rows, outline, links, and annotations, normalized to StructuredDocumentResult." : intent === "extractMarkdown" ? "extractMarkdown: structured PDF extract (same as extractStructuredData) for fallback; primary Markdown from `@dragon708/docmind-markdown` tries `@opendataloader/pdf` on PDF bytes when possible." : `${intent}: same structured PDF pipeline as extractStructuredData, then \`@dragon708/docmind-markdown\` (renderLlmText or chunk sections).`
1050
+ description: intent === "extractStructuredData" ? "extractStructuredData: analyzePdf plus PDF.js page rows, outline, links, and annotations, normalized to StructuredDocumentResult." : intent === "extractMarkdown" ? "extractMarkdown: structured PDF extract (same as extractStructuredData) for fallback; primary Markdown on PDF bytes uses `@cognipeer/to-markdown` via `@dragon708/docmind-markdown` (Node-first, no Java)." : `${intent}: same structured PDF pipeline as extractStructuredData, then \`@dragon708/docmind-markdown\` (renderLlmText or chunk sections).`
1044
1051
  };
1045
1052
  ocr = {
1046
1053
  mayUse: pdfOcr !== "off",
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@dragon708/docmind-node",
3
- "version": "1.12.3",
3
+ "version": "1.13.1",
4
4
  "description": "Official DocMind Node facade: analyzeFile, intent APIs, PDF/DOCX/OCR, and fs helpers.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
@@ -33,7 +33,7 @@
33
33
  "license": "MIT",
34
34
  "dependencies": {
35
35
  "@dragon708/docmind-docx": "^1.8.0",
36
- "@dragon708/docmind-markdown": "^1.1.3",
36
+ "@dragon708/docmind-markdown": "^1.2.1",
37
37
  "@dragon708/docmind-ocr": "^1.1.4",
38
38
  "@dragon708/docmind-pdf": "^2.2.0",
39
39
  "@dragon708/docmind-shared": "^1.2.0"