npm - @dragon708/docmind-browser - Versions diffs - 1.7.0 → 1.8.0 - Mend

@dragon708/docmind-browser 1.7.0 → 1.8.0

This diff shows the changes between two publicly available versions of the package as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (3)

package/dist/index.d.ts CHANGED Viewed

@@ -44,7 +44,10 @@ type BrowserExtractStructuredDataOptions = BrowserAnalyzeOptions & {
     readonly normalize?: NormalizeStructuredOptions;
 };
 /**
- * {@link extractMarkdown}: structured options plus `markdown` for `renderMarkdown` (`@dragon708/docmind-markdown`).
+ * {@link extractMarkdown}: structured options plus `markdown` (passed through to `extractMarkdown` in
+ * `@dragon708/docmind-markdown`, including structured-serializer knobs). Binary PDF/DOCX converters inside that
+ * package are not used for PDF in-browser and the DOCX bytes→Turndown path is Node-only; the browser still
+ * gets correct DOCX Markdown via structured fallback from {@link extractStructuredData}.
  */
 interface BrowserExtractMarkdownOptions extends BrowserExtractStructuredDataOptions {
     readonly markdown?: RenderMarkdownOptions;
@@ -120,24 +123,33 @@ declare function runOcr(input: BrowserAnalyzeInput, options?: BrowserAnalyzeOpti
 declare function extractStructuredData(input: BrowserAnalyzeInput, options?: BrowserExtractStructuredDataOptions): Promise<StructuredDocumentResult>;
 /**
- * {@link extractStructuredData} → `renderMarkdown` (browser-safe; no Node-only APIs).
- * PDF: empty structured stub → usually empty output; see {@link getCapabilities} capability `markdown`.
+ * {@link extractStructuredData} for a full structured envelope, then `extractMarkdown` from
+ * `@dragon708/docmind-markdown` on `{ data, filename?, mimeType? }` with that result as `structuredFallback`.
  *
- * @param input - `File` / `Blob` / bytes as accepted by the browser facade.
- * @param options - `markdown` render options plus structured/`ocr`/`docx` routing (same as {@link extractStructuredData}).
+ * - **PDF:** the markdown package does not load `@opendataloader/pdf` here; output comes from the structured
+ *   fallback (empty in-browser stub — see {@link getCapabilities}).
+ * - **DOCX:** the package’s direct bytes → Mammoth → Turndown path is **Node-only**; in-browser, Markdown is
+ *   produced via `convertStructuredToMarkdown` on the structured envelope (still Mammoth/OOXML-backed via
+ *   `@dragon708/docmind-docx`), with an explanatory warning from the markdown package.
+ * - **Text / image:** unidentified or non-binary bytes use the same structured serializer.
+ *
+ * @param options - `markdown` options plus the same routing as {@link extractStructuredData} (`ocr`, `docx`, `normalize`).
  */
 declare function extractMarkdown(input: BrowserAnalyzeInput, options?: BrowserExtractMarkdownOptions): Promise<string>;
 /**
- * Same as {@link extractMarkdown} but `renderLlmText` (compact plain text for LLM prompts).
+ * {@link extractStructuredData} then `renderLlmText` (`@dragon708/docmind-markdown`). For a structured value you already have, that package's `extractLlmContent` matches `renderLlmText` (no file I/O).
  */
 declare function extractLlmContent(input: BrowserAnalyzeInput, options?: BrowserExtractLlmContentOptions): Promise<string>;
 /**
- * Structured extract → `renderMarkdownSections` for sectioned Markdown in the browser.
+ * Structured extract → `renderMarkdownSections` (`splitStructuredIntoChunks` with Markdown; same as
+ * `extractStructuredChunks` alias in `@dragon708/docmind-markdown`).
  */
 declare function extractStructuredChunks(input: BrowserAnalyzeInput, options?: BrowserExtractStructuredChunksOptions): Promise<readonly MarkdownSection[]>;
 /** High-level features the user can ask DocMind for (per input kind and runtime). */
-type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages" | "structured-output" | "markdown" | "llm-text" | "structured-chunks" | "image-normalization" | "gif-first-frame" | "bmp" | "heic" | "tiff";
+type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages" | "structured-output"
+/** Browser: {@link extractMarkdown} via `@dragon708/docmind-markdown` `extractMarkdown` + structured fallback (PDF empty; DOCX structured path when binary converter is Node-only). */
+ | "markdown" | "llm-text" | "structured-chunks" | "image-normalization" | "gif-first-frame" | "bmp" | "heic" | "tiff";
 declare function docxIncludeRequested(flags?: AnalyzeDocxIncludeFlags): boolean;
 /** DOCX `word/media` en runtime browser (JSZip; sin pipeline Node). */
 interface DocxEmbeddedImageCapabilities {
@@ -218,7 +230,7 @@ type BrowserExplainAnalysisPlanOptions = Omit<ExplainAnalysisPlanOptions, "inten
 /**
  * Epic 1 — **Capabilities:** detects kind from the same hints as `analyzeFile`, then lists which of
- * `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output` | `markdown` | `llm-text` | `structured-chunks`
+ * `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output` | `markdown` (package `extractMarkdown` + structured fallback; PDF empty here) | `llm-text` | `structured-chunks` (split + Markdown sections)
  * and image-specific ids (`image-normalization`, `bmp`, `gif-first-frame`, `heic`, `tiff`) apply in the browser (PDF always unsupported for meaningful content).
  * No Mammoth/Tesseract/PDF parsing. For DOCX, {@link GetCapabilitiesReport.docxStructure} / `docxEmbeddedImages` describe v2 opt-in features.
  */

package/dist/index.js CHANGED Viewed

@@ -4,7 +4,7 @@ import { extractStructuredDataFromDocx, analyzeDocx } from '@dragon708/docmind-d
 export { extractStructuredDataFromDocx } from '@dragon708/docmind-docx';
 import { extractStructuredDataFromImage, preprocessHasEffect, resolveImageFormat, normalizeImageForOcr, ocrTiff, ocrImageDetailed } from '@dragon708/docmind-ocr';
 export { extractStructuredDataFromImage } from '@dragon708/docmind-ocr';
-import { renderMarkdown, renderLlmText, renderMarkdownSections } from '@dragon708/docmind-markdown';
+import { extractMarkdown as extractMarkdown$1, renderLlmText, renderMarkdownSections } from '@dragon708/docmind-markdown';
 // src/analyzeFile.ts
 function assertBrowserInput(input) {
@@ -520,11 +520,26 @@ async function runOcr(input, options) {
       return notImplementedResult(kind, "none", [UNKNOWN_FORMAT_WARNING]);
   }
 }
+function browserFileHints(input) {
+  if (input instanceof File) {
+    return {
+      filename: input.name,
+      mimeType: input.type ? input.type : void 0
+    };
+  }
+  return {};
+}
 async function extractMarkdown(input, options) {
   throwIfAborted(options?.signal);
   const { markdown: markdownOpts, ...structuredOpts } = options ?? {};
   const structured = await extractStructuredData(input, structuredOpts);
-  return renderMarkdown(structured, markdownOpts);
+  const data = await toUint8Array(input);
+  const hints = browserFileHints(input);
+  const r = await extractMarkdown$1(
+    { data, filename: hints.filename, mimeType: hints.mimeType },
+    { ...markdownOpts ?? {}, structuredFallback: structured }
+  );
+  return r.markdown;
 }
 async function extractLlmContent(input, options) {
   throwIfAborted(options?.signal);
@@ -576,7 +591,7 @@ var TEXT_META_NOTE = "Plain text has no structured document metadata; extractMet
 var OCR_OFF_NOTE = 'Image OCR may be skipped when `ocr.mode` is "off" in analyze options.';
 var STRUCTURED_OCR_OFF = 'Structured image output uses OCR; when `ocr.mode` is "off", `extractStructuredData` returns an empty envelope with a warning.';
 var UNKNOWN_KIND = "Could not determine file kind from name, MIME, or bytes; all features are reported as unsupported until the kind is known.";
-var MARKDOWN_PDF_BROWSER = "PDF: no parser in-browser \u2014 extractMarkdown / extractLlmContent / extractStructuredChunks only render an empty structured stub; use @dragon708/docmind-node for real PDF \u2192 Markdown / LLM text / chunks.";
+var MARKDOWN_PDF_BROWSER = "PDF: no parser in-browser \u2014 `@opendataloader/pdf` is not loaded here. extractMarkdown still calls `extractMarkdown` in `@dragon708/docmind-markdown`, which falls back to the empty structured stub (same empty Markdown). extractLlmContent / extractStructuredChunks use the structured envelope only. Use @dragon708/docmind-node for real PDF \u2192 Markdown / LLM text / chunks.";
 var MARKDOWN_IMAGE_OCR_OFF = 'Same as structured-output: when ocr.mode is "off", structured (and thus Markdown/LLM/chunk exports) are empty aside from warnings.';
 function slot(id, supported, warnings) {
   return warnings?.length ? { id, supported, warnings } : { id, supported };
@@ -619,13 +634,13 @@ function buildBrowserCapabilityReport(kind) {
           "`extractStructuredData` uses `@dragon708/docmind-docx` (Mammoth + OOXML) and returns `StructuredDocumentResult`; optional `options.docx` slices are forwarded."
         ]),
         slot("markdown", true, [
-          "extractMarkdown: structured DOCX \u2192 Markdown via `@dragon708/docmind-markdown` (browser-safe, no Node APIs)."
+          "extractMarkdown: `@dragon708/docmind-markdown` `extractMarkdown` on bytes + structured fallback. The package\u2019s DOCX-bytes Mammoth\u2192Turndown path is Node-only; in-browser, Markdown is produced from `extractStructuredData` (Mammoth/OOXML in `@dragon708/docmind-docx`) via structured serialization, with a clear package warning that the binary shortcut is skipped."
         ]),
         slot("llm-text", true, [
-          "extractLlmContent: structured \u2192 compact plain text for prompts (same pipeline as Markdown export)."
+          "extractLlmContent: structured envelope \u2192 `renderLlmText` (LLM-ready plain text; no binary PDF/DOCX Markdown routes)."
         ]),
         slot("structured-chunks", true, [
-          "extractStructuredChunks: structured \u2192 Markdown sections (`splitStructuredIntoChunks` in markdown package)."
+          "extractStructuredChunks: structured \u2192 `renderMarkdownSections` / `splitStructuredIntoChunks` (heading-aware chunking + optional parallel `text`)."
         ])
       ];
       break;
@@ -664,7 +679,7 @@ function buildBrowserCapabilityReport(kind) {
           "HEIC/HEIF and TIFF limitations match `getCapabilities` (`heic`, `tiff`) and OCR warnings."
         ]),
         slot("markdown", true, [
-          "extractMarkdown: OCR structured layout \u2192 Markdown when OCR runs; HEIC unsupported; TIFF best-effort.",
+          "extractMarkdown: same bytes + structured fallback through package `extractMarkdown` when applicable; OCR structured layout \u2192 Markdown when OCR runs. HEIC unsupported; TIFF best-effort.",
           MARKDOWN_IMAGE_OCR_OFF
         ]),
         slot("llm-text", true, [
@@ -688,11 +703,13 @@ function buildBrowserCapabilityReport(kind) {
           "`extractStructuredData` decodes UTF-8 (via `analyzeText`) and normalizes to `StructuredDocumentResult` (paragraph block rollup)."
         ]),
         slot("markdown", true, [
-          "extractMarkdown: UTF-8 structured rollup \u2192 Markdown (`@dragon708/docmind-markdown`)."
+          "extractMarkdown: bytes + structured fallback through `@dragon708/docmind-markdown` `extractMarkdown` (typically structured serializer for UTF-8 text)."
+        ]),
+        slot("llm-text", true, [
+          "extractLlmContent: UTF-8 structured rollup \u2192 `renderLlmText` in `@dragon708/docmind-markdown`."
         ]),
-        slot("llm-text", true, ["extractLlmContent: UTF-8 structured \u2192 LLM-oriented plain text."]),
         slot("structured-chunks", true, [
-          "extractStructuredChunks: typically a single Markdown section when only paragraph rollup exists."
+          "extractStructuredChunks: typically one Markdown section when only paragraph rollup exists."
         ])
       ];
       break;
@@ -897,10 +914,22 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInclude, ocr
     case "extractLlmContent":
     case "extractStructuredChunks":
       if (kind === "docx") {
-        nativeExtraction = {
-          willAttempt: true,
-          description: intent === "extractStructuredData" ? "`extractStructuredDataFromDocx`: Mammoth plus required OOXML extractors (structure, headings, tables, blocks, pagesApprox, embeddedImages unless disabled), then `normalizeToStructuredResult`. Optional `options.docx` is forwarded." : `${String(intent)}: same structured DOCX pipeline as extractStructuredData, then \`@dragon708/docmind-markdown\` (browser-safe).`
-        };
+        if (intent === "extractStructuredData") {
+          nativeExtraction = {
+            willAttempt: true,
+            description: "`extractStructuredDataFromDocx`: Mammoth plus required OOXML extractors (structure, headings, tables, blocks, pagesApprox, embeddedImages unless disabled), then `normalizeToStructuredResult`. Optional `options.docx` is forwarded."
+          };
+        } else if (intent === "extractMarkdown") {
+          nativeExtraction = {
+            willAttempt: true,
+            description: "extractMarkdown: `extractStructuredData` (Mammoth/OOXML) for a full structured envelope, then `extractMarkdown` in `@dragon708/docmind-markdown`. The package\u2019s DOCX-bytes Mammoth\u2192Turndown shortcut is Node-only; in-browser Markdown uses structured serialization on that envelope (with a package warning)."
+          };
+        } else {
+          nativeExtraction = {
+            willAttempt: true,
+            description: `${String(intent)}: same structured DOCX pipeline as extractStructuredData, then \`@dragon708/docmind-markdown\` (\`renderLlmText\` or \`renderMarkdownSections\`).`
+          };
+        }
         ocr = { mayUse: false, description: "DOCX structured path does not use OCR." };
         limitations = lim(DOCX_ZIP_NOTE_BROWSER);
       } else if (kind === "image") {
@@ -923,17 +952,21 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInclude, ocr
       if (intent === "extractMarkdown") {
         limitations = [
           ...limitations,
-          ...lim("Output: Markdown string via renderMarkdown.")
+          ...lim(
+            "Output: Markdown string from `extractMarkdown` in `@dragon708/docmind-markdown`. PDF in-browser: empty (no `@opendataloader/pdf`). DOCX: structured Markdown path when the binary converter is Node-only."
+          )
         ];
       } else if (intent === "extractLlmContent") {
         limitations = [
           ...limitations,
-          ...lim("Output: compact plain text via renderLlmText.")
+          ...lim("Output: compact plain text via `renderLlmText` (structured input only in this runtime).")
         ];
       } else if (intent === "extractStructuredChunks") {
         limitations = [
           ...limitations,
-          ...lim("Output: MarkdownSection[] via renderMarkdownSections.")
+          ...lim(
+            "Output: MarkdownSection[] via `renderMarkdownSections` (`splitStructuredIntoChunks` / `extractStructuredChunks` alias)."
+          )
         ];
       }
       break;
@@ -1088,7 +1121,21 @@ function planForIntent(intentOpt, kind, ocrMode, docxInclude, ocr, analyzeFileOu
         };
     }
   }
-  if (intent === "extractMarkdown" || intent === "extractLlmContent" || intent === "extractStructuredChunks") {
+  if (intent === "extractMarkdown") {
+    const sub = planForIntent(
+      "extractStructuredData",
+      kind,
+      ocrMode,
+      docxInclude,
+      ocr,
+      analyzeFileOutput
+    );
+    return {
+      intent,
+      steps: [...sub.steps ?? [], { id: "markdown_hybrid_package", status: "planned" }]
+    };
+  }
+  if (intent === "extractLlmContent" || intent === "extractStructuredChunks") {
     const sub = planForIntent(
       "extractStructuredData",
       kind,
@@ -1101,7 +1148,10 @@ function planForIntent(intentOpt, kind, ocrMode, docxInclude, ocr, analyzeFileOu
       intent,
       steps: [
         ...sub.steps ?? [],
-        { id: "docmind_markdown_render", status: "planned" }
+        {
+          id: intent === "extractLlmContent" ? "docmind_markdown_llm_text" : "docmind_markdown_chunk_sections",
+          status: "planned"
+        }
       ]
     };
   }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@dragon708/docmind-browser",
-  "version": "1.7.0",
+  "version": "1.8.0",
   "description": "Official DocMind browser facade: analyzeFile and intent APIs (DOCX, image OCR, text). PDF and fs paths use @dragon708/docmind-node.",
   "type": "module",
   "sideEffects": false,
@@ -34,7 +34,7 @@
   "license": "MIT",
   "dependencies": {
     "@dragon708/docmind-docx": "^1.8.0",
-    "@dragon708/docmind-markdown": "^1.0.0",
+    "@dragon708/docmind-markdown": "^1.1.0",
     "@dragon708/docmind-ocr": "^1.1.4",
     "@dragon708/docmind-shared": "^1.2.0"
   },