@dragon708/docmind-node 1.13.0 → 1.13.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -55,17 +55,17 @@ interface NodeExtractStructuredDataOptions extends DocMindAnalyzeOptions {
55
55
  /**
56
56
  * {@link extractMarkdown}: inherits {@link NodeExtractStructuredDataOptions}; `markdown` maps to structured-serializer
57
57
  * options passed through `extractMarkdown` in `@dragon708/docmind-markdown`. `markdownDocx` / `markdownPdf`
58
- * configure Mammoth→Turndown and `@opendataloader/pdf` respectively — separate from `docx` / `pdf` used only by {@link extractStructuredData}.
58
+ * configure Mammoth→Turndown and `@cognipeer/to-markdown` (`convertPdfToMarkdown`) respectively — separate from `docx` / `pdf` used only by {@link extractStructuredData}.
59
59
  */
60
60
  interface NodeExtractMarkdownOptions extends NodeExtractStructuredDataOptions {
61
61
  readonly markdown?: RenderMarkdownOptions;
62
62
  /** Forwarded to `extractMarkdown` → `convertDocxToMarkdown` when bytes are DOCX (Node). */
63
63
  readonly markdownDocx?: ConvertDocxToMarkdownOptions;
64
- /** Forwarded to `extractMarkdown` → `convertPdfToMarkdown` when bytes are PDF. */
64
+ /** Forwarded to `extractMarkdown` → `convertPdfToMarkdown` when bytes are PDF (Node; `@cognipeer/to-markdown`). */
65
65
  readonly markdownPdf?: ConvertPdfToMarkdownOptions;
66
66
  /**
67
67
  * Optional: receive `strategy` and merged `warnings` from `@dragon708/docmind-markdown` `extractMarkdown`
68
- * (e.g. `pdf-opendataloader` vs `pdf-structured-fallback`) without changing the `Promise<string>` return type.
68
+ * (e.g. `pdf-cognipeer-specialized` vs `pdf-structured-fallback`, `docx-mammoth` vs `docx-structured-fallback`) without changing the `Promise<string>` return type.
69
69
  */
70
70
  readonly onMarkdownExtract?: (info: {
71
71
  strategy: ExtractMarkdownStrategy;
@@ -141,8 +141,14 @@ declare function extractStructuredData(input: NodeAnalyzeInput, options?: NodeEx
141
141
  /**
142
142
  * End-to-end: {@link extractStructuredData} (for fallback + option parity) plus
143
143
  * `extractMarkdown` from `@dragon708/docmind-markdown` on `{ data, filename, mimeType }`.
144
- * On Node, PDF/DOCX bytes use specialized routes (`@opendataloader/pdf`, Mammoth+Turndown) when detection matches;
145
- * the structured envelope is always passed as `structuredFallback`.
144
+ *
145
+ * On Node, **DOCX** bytes use **Mammoth→Turndown**; **PDF** bytes use **`convertPdfToMarkdown`**
146
+ * (`@cognipeer/to-markdown`, Node-first, no JVM). The structured envelope is always passed as `structuredFallback`
147
+ * so specialized routes can fall back to structured → Markdown serialization with clear warnings when needed.
148
+ *
149
+ * {@link NodeExtractMarkdownOptions.onMarkdownExtract} receives `strategy` and merged `warnings` from the markdown
150
+ * package (for example `pdf-cognipeer-specialized`, `pdf-structured-fallback`, `docx-structured-fallback`, and
151
+ * `[docmind-markdown:extractMarkdown]` trace lines).
146
152
  *
147
153
  * @param input - Path, buffer, or {@link NodeAnalyzeInput} accepted by the Node facade.
148
154
  * @param options - Structured routing (`pdf` / `docx` / `ocr` / `normalize`), optional `markdown` serializer knobs,
@@ -151,7 +157,8 @@ declare function extractStructuredData(input: NodeAnalyzeInput, options?: NodeEx
151
157
  declare function extractMarkdown(input: NodeAnalyzeInput, options?: NodeExtractMarkdownOptions): Promise<string>;
152
158
  /**
153
159
  * {@link extractStructuredData} then `renderLlmText` in `@dragon708/docmind-markdown` (tagged plain text for LLMs).
154
- * That package's `extractLlmContent` is the same transform on an in-memory structured result only.
160
+ * This path **does not** run the binary PDF/DOCX Markdown pipelines (no `@cognipeer/to-markdown` / Mammoth→Turndown here);
161
+ * it linearizes the structured envelope only. Use {@link extractMarkdown} when you need specialized PDF or DOCX Markdown layout.
155
162
  *
156
163
  * @param options - Structured routing plus optional `llm` passed through to `renderLlmText`.
157
164
  */
package/dist/index.js CHANGED
@@ -724,7 +724,7 @@ function buildNodeCapabilityReport(kind) {
724
724
  "extractStructuredData: analyzePdf + PDF.js per-page text, outline, links, annotations, merged via normalizeToStructuredResult; respects pdf.ocr like analyzeFile."
725
725
  ]),
726
726
  slot("markdown", true, [
727
- "extractMarkdown: `@dragon708/docmind-markdown` hybrid \u2014 PDF bytes prefer `@opendataloader/pdf` \u2192 Markdown; structured PDF (`extractStructuredData`, respects pdf.ocr) is always built as fallback and for non-binary inputs."
727
+ "extractMarkdown: `@dragon708/docmind-markdown` hybrid \u2014 PDF bytes use `@cognipeer/to-markdown` (Node-first, no Java); structured PDF (`extractStructuredData`, respects pdf.ocr) is always built as fallback."
728
728
  ]),
729
729
  slot("llm-text", true, [
730
730
  "extractLlmContent: structured envelope \u2192 LLM-oriented plain text (`renderLlmText` in `@dragon708/docmind-markdown`)."
@@ -1047,7 +1047,7 @@ function buildNodeExplainReport(kind, intent, pdfOcr, plan, docxInclude, ocrSlic
1047
1047
  if (kind === "pdf") {
1048
1048
  nativeExtraction = {
1049
1049
  willAttempt: true,
1050
- description: intent === "extractStructuredData" ? "extractStructuredData: analyzePdf plus PDF.js page rows, outline, links, and annotations, normalized to StructuredDocumentResult." : intent === "extractMarkdown" ? "extractMarkdown: structured PDF extract (same as extractStructuredData) for fallback; primary Markdown from `@dragon708/docmind-markdown` tries `@opendataloader/pdf` on PDF bytes when possible." : `${intent}: same structured PDF pipeline as extractStructuredData, then \`@dragon708/docmind-markdown\` (renderLlmText or chunk sections).`
1050
+ description: intent === "extractStructuredData" ? "extractStructuredData: analyzePdf plus PDF.js page rows, outline, links, and annotations, normalized to StructuredDocumentResult." : intent === "extractMarkdown" ? "extractMarkdown: structured PDF extract (same as extractStructuredData) for fallback; primary Markdown on PDF bytes uses `@cognipeer/to-markdown` via `@dragon708/docmind-markdown` (Node-first, no Java)." : `${intent}: same structured PDF pipeline as extractStructuredData, then \`@dragon708/docmind-markdown\` (renderLlmText or chunk sections).`
1051
1051
  };
1052
1052
  ocr = {
1053
1053
  mayUse: pdfOcr !== "off",
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@dragon708/docmind-node",
3
- "version": "1.13.0",
3
+ "version": "1.13.2",
4
4
  "description": "Official DocMind Node facade: analyzeFile, intent APIs, PDF/DOCX/OCR, and fs helpers.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
@@ -33,7 +33,7 @@
33
33
  "license": "MIT",
34
34
  "dependencies": {
35
35
  "@dragon708/docmind-docx": "^1.8.0",
36
- "@dragon708/docmind-markdown": "^1.2.0",
36
+ "@dragon708/docmind-markdown": "^1.2.6",
37
37
  "@dragon708/docmind-ocr": "^1.1.4",
38
38
  "@dragon708/docmind-pdf": "^2.2.0",
39
39
  "@dragon708/docmind-shared": "^1.2.0"