@dragon708/docmind-browser 1.7.0 → 1.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -44,7 +44,10 @@ type BrowserExtractStructuredDataOptions = BrowserAnalyzeOptions & {
44
44
  readonly normalize?: NormalizeStructuredOptions;
45
45
  };
46
46
  /**
47
- * {@link extractMarkdown}: structured options plus `markdown` for `renderMarkdown` (`@dragon708/docmind-markdown`).
47
+ * {@link extractMarkdown}: structured options plus `markdown` (passed through to `extractMarkdown` in
48
+ * `@dragon708/docmind-markdown`, including structured-serializer knobs). Binary PDF/DOCX converters inside that
49
+ * package are not used for PDF in-browser and the DOCX bytes→Turndown path is Node-only; the browser still
50
+ * gets correct DOCX Markdown via structured fallback from {@link extractStructuredData}.
48
51
  */
49
52
  interface BrowserExtractMarkdownOptions extends BrowserExtractStructuredDataOptions {
50
53
  readonly markdown?: RenderMarkdownOptions;
@@ -120,24 +123,33 @@ declare function runOcr(input: BrowserAnalyzeInput, options?: BrowserAnalyzeOpti
120
123
  declare function extractStructuredData(input: BrowserAnalyzeInput, options?: BrowserExtractStructuredDataOptions): Promise<StructuredDocumentResult>;
121
124
 
122
125
  /**
123
- * {@link extractStructuredData} `renderMarkdown` (browser-safe; no Node-only APIs).
124
- * PDF: empty structured stub usually empty output; see {@link getCapabilities} capability `markdown`.
126
+ * {@link extractStructuredData} for a full structured envelope, then `extractMarkdown` from
127
+ * `@dragon708/docmind-markdown` on `{ data, filename?, mimeType? }` with that result as `structuredFallback`.
125
128
  *
126
- * @param input - `File` / `Blob` / bytes as accepted by the browser facade.
127
- * @param options - `markdown` render options plus structured/`ocr`/`docx` routing (same as {@link extractStructuredData}).
129
+ * - **PDF:** the markdown package does not load `@opendataloader/pdf` here; output comes from the structured
130
+ * fallback (empty in-browser stub; see {@link getCapabilities}).
131
+ * - **DOCX:** the package’s direct bytes → Mammoth → Turndown path is **Node-only**; in-browser, Markdown is
132
+ * produced via `convertStructuredToMarkdown` on the structured envelope (still Mammoth/OOXML-backed via
133
+ * `@dragon708/docmind-docx`), with an explanatory warning from the markdown package.
134
+ * - **Text / image:** unidentified or non-binary bytes use the same structured serializer.
135
+ *
136
+ * @param options - `markdown` options plus the same routing as {@link extractStructuredData} (`ocr`, `docx`, `normalize`).
128
137
  */
129
138
  declare function extractMarkdown(input: BrowserAnalyzeInput, options?: BrowserExtractMarkdownOptions): Promise<string>;
130
139
  /**
131
- * Same as {@link extractMarkdown} but `renderLlmText` (compact plain text for LLM prompts).
140
+ * {@link extractStructuredData} then `renderLlmText` (`@dragon708/docmind-markdown`). For a structured value you already have, that package's `extractLlmContent` matches `renderLlmText` (no file I/O).
132
141
  */
133
142
  declare function extractLlmContent(input: BrowserAnalyzeInput, options?: BrowserExtractLlmContentOptions): Promise<string>;
134
143
  /**
135
- * Structured extract → `renderMarkdownSections` for sectioned Markdown in the browser.
144
+ * Structured extract → `renderMarkdownSections` (`splitStructuredIntoChunks` with Markdown; same as
145
+ * `extractStructuredChunks` alias in `@dragon708/docmind-markdown`).
136
146
  */
137
147
  declare function extractStructuredChunks(input: BrowserAnalyzeInput, options?: BrowserExtractStructuredChunksOptions): Promise<readonly MarkdownSection[]>;
138
148
 
139
149
  /** High-level features the user can ask DocMind for (per input kind and runtime). */
140
- type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages" | "structured-output" | "markdown" | "llm-text" | "structured-chunks" | "image-normalization" | "gif-first-frame" | "bmp" | "heic" | "tiff";
150
+ type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages" | "structured-output"
151
+ /** Browser: {@link extractMarkdown} via `@dragon708/docmind-markdown` `extractMarkdown` + structured fallback (PDF empty; DOCX structured path when binary converter is Node-only). */
152
+ | "markdown" | "llm-text" | "structured-chunks" | "image-normalization" | "gif-first-frame" | "bmp" | "heic" | "tiff";
141
153
  declare function docxIncludeRequested(flags?: AnalyzeDocxIncludeFlags): boolean;
142
154
  /** DOCX `word/media` in the browser runtime (JSZip; no Node pipeline). */
143
155
  interface DocxEmbeddedImageCapabilities {
@@ -218,7 +230,7 @@ type BrowserExplainAnalysisPlanOptions = Omit<ExplainAnalysisPlanOptions, "inten
218
230
 
219
231
  /**
220
232
  * Epic 1 — **Capabilities:** detects kind from the same hints as `analyzeFile`, then lists which of
221
- * `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output` | `markdown` | `llm-text` | `structured-chunks`
233
+ * `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output` | `markdown` (package `extractMarkdown` + structured fallback; PDF empty here) | `llm-text` | `structured-chunks` (split + Markdown sections)
222
234
  * and image-specific ids (`image-normalization`, `bmp`, `gif-first-frame`, `heic`, `tiff`) apply in the browser (PDF always unsupported for meaningful content).
223
235
  * No Mammoth/Tesseract/PDF parsing. For DOCX, {@link GetCapabilitiesReport.docxStructure} / `docxEmbeddedImages` describe v2 opt-in features.
224
236
  */
package/dist/index.js CHANGED
@@ -4,7 +4,7 @@ import { extractStructuredDataFromDocx, analyzeDocx } from '@dragon708/docmind-d
4
4
  export { extractStructuredDataFromDocx } from '@dragon708/docmind-docx';
5
5
  import { extractStructuredDataFromImage, preprocessHasEffect, resolveImageFormat, normalizeImageForOcr, ocrTiff, ocrImageDetailed } from '@dragon708/docmind-ocr';
6
6
  export { extractStructuredDataFromImage } from '@dragon708/docmind-ocr';
7
- import { renderMarkdown, renderLlmText, renderMarkdownSections } from '@dragon708/docmind-markdown';
7
+ import { extractMarkdown as extractMarkdown$1, renderLlmText, renderMarkdownSections } from '@dragon708/docmind-markdown';
8
8
 
9
9
  // src/analyzeFile.ts
10
10
  function assertBrowserInput(input) {
@@ -520,11 +520,26 @@ async function runOcr(input, options) {
520
520
  return notImplementedResult(kind, "none", [UNKNOWN_FORMAT_WARNING]);
521
521
  }
522
522
  }
523
+ function browserFileHints(input) {
524
+ if (input instanceof File) {
525
+ return {
526
+ filename: input.name,
527
+ mimeType: input.type ? input.type : void 0
528
+ };
529
+ }
530
+ return {};
531
+ }
523
532
  async function extractMarkdown(input, options) {
524
533
  throwIfAborted(options?.signal);
525
534
  const { markdown: markdownOpts, ...structuredOpts } = options ?? {};
526
535
  const structured = await extractStructuredData(input, structuredOpts);
527
- return renderMarkdown(structured, markdownOpts);
536
+ const data = await toUint8Array(input);
537
+ const hints = browserFileHints(input);
538
+ const r = await extractMarkdown$1(
539
+ { data, filename: hints.filename, mimeType: hints.mimeType },
540
+ { ...markdownOpts ?? {}, structuredFallback: structured }
541
+ );
542
+ return r.markdown;
528
543
  }
529
544
  async function extractLlmContent(input, options) {
530
545
  throwIfAborted(options?.signal);
@@ -576,7 +591,7 @@ var TEXT_META_NOTE = "Plain text has no structured document metadata; extractMet
576
591
  var OCR_OFF_NOTE = 'Image OCR may be skipped when `ocr.mode` is "off" in analyze options.';
577
592
  var STRUCTURED_OCR_OFF = 'Structured image output uses OCR; when `ocr.mode` is "off", `extractStructuredData` returns an empty envelope with a warning.';
578
593
  var UNKNOWN_KIND = "Could not determine file kind from name, MIME, or bytes; all features are reported as unsupported until the kind is known.";
579
- var MARKDOWN_PDF_BROWSER = "PDF: no parser in-browser \u2014 extractMarkdown / extractLlmContent / extractStructuredChunks only render an empty structured stub; use @dragon708/docmind-node for real PDF \u2192 Markdown / LLM text / chunks.";
594
+ var MARKDOWN_PDF_BROWSER = "PDF: no parser in-browser \u2014 `@opendataloader/pdf` is not loaded here. extractMarkdown still calls `extractMarkdown` in `@dragon708/docmind-markdown`, which falls back to the empty structured stub (same empty Markdown). extractLlmContent / extractStructuredChunks use the structured envelope only. Use @dragon708/docmind-node for real PDF \u2192 Markdown / LLM text / chunks.";
580
595
  var MARKDOWN_IMAGE_OCR_OFF = 'Same as structured-output: when ocr.mode is "off", structured (and thus Markdown/LLM/chunk exports) are empty aside from warnings.';
581
596
  function slot(id, supported, warnings) {
582
597
  return warnings?.length ? { id, supported, warnings } : { id, supported };
@@ -619,13 +634,13 @@ function buildBrowserCapabilityReport(kind) {
619
634
  "`extractStructuredData` uses `@dragon708/docmind-docx` (Mammoth + OOXML) and returns `StructuredDocumentResult`; optional `options.docx` slices are forwarded."
620
635
  ]),
621
636
  slot("markdown", true, [
622
- "extractMarkdown: structured DOCX \u2192 Markdown via `@dragon708/docmind-markdown` (browser-safe, no Node APIs)."
637
+ "extractMarkdown: `@dragon708/docmind-markdown` `extractMarkdown` on bytes + structured fallback. The package\u2019s DOCX-bytes Mammoth\u2192Turndown path is Node-only; in-browser, Markdown is produced from `extractStructuredData` (Mammoth/OOXML in `@dragon708/docmind-docx`) via structured serialization, with a clear package warning that the binary shortcut is skipped."
623
638
  ]),
624
639
  slot("llm-text", true, [
625
- "extractLlmContent: structured \u2192 compact plain text for prompts (same pipeline as Markdown export)."
640
+ "extractLlmContent: structured envelope \u2192 `renderLlmText` (LLM-ready plain text; no binary PDF/DOCX Markdown routes)."
626
641
  ]),
627
642
  slot("structured-chunks", true, [
628
- "extractStructuredChunks: structured \u2192 Markdown sections (`splitStructuredIntoChunks` in markdown package)."
643
+ "extractStructuredChunks: structured \u2192 `renderMarkdownSections` / `splitStructuredIntoChunks` (heading-aware chunking + optional parallel `text`)."
629
644
  ])
630
645
  ];
631
646
  break;
@@ -664,7 +679,7 @@ function buildBrowserCapabilityReport(kind) {
664
679
  "HEIC/HEIF and TIFF limitations match `getCapabilities` (`heic`, `tiff`) and OCR warnings."
665
680
  ]),
666
681
  slot("markdown", true, [
667
- "extractMarkdown: OCR structured layout \u2192 Markdown when OCR runs; HEIC unsupported; TIFF best-effort.",
682
+ "extractMarkdown: same bytes + structured fallback through package `extractMarkdown` when applicable; OCR structured layout \u2192 Markdown when OCR runs. HEIC unsupported; TIFF best-effort.",
668
683
  MARKDOWN_IMAGE_OCR_OFF
669
684
  ]),
670
685
  slot("llm-text", true, [
@@ -688,11 +703,13 @@ function buildBrowserCapabilityReport(kind) {
688
703
  "`extractStructuredData` decodes UTF-8 (via `analyzeText`) and normalizes to `StructuredDocumentResult` (paragraph block rollup)."
689
704
  ]),
690
705
  slot("markdown", true, [
691
- "extractMarkdown: UTF-8 structured rollup \u2192 Markdown (`@dragon708/docmind-markdown`)."
706
+ "extractMarkdown: bytes + structured fallback through `@dragon708/docmind-markdown` `extractMarkdown` (typically structured serializer for UTF-8 text)."
707
+ ]),
708
+ slot("llm-text", true, [
709
+ "extractLlmContent: UTF-8 structured rollup \u2192 `renderLlmText` in `@dragon708/docmind-markdown`."
692
710
  ]),
693
- slot("llm-text", true, ["extractLlmContent: UTF-8 structured \u2192 LLM-oriented plain text."]),
694
711
  slot("structured-chunks", true, [
695
- "extractStructuredChunks: typically a single Markdown section when only paragraph rollup exists."
712
+ "extractStructuredChunks: typically one Markdown section when only paragraph rollup exists."
696
713
  ])
697
714
  ];
698
715
  break;
@@ -897,10 +914,22 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInclude, ocr
897
914
  case "extractLlmContent":
898
915
  case "extractStructuredChunks":
899
916
  if (kind === "docx") {
900
- nativeExtraction = {
901
- willAttempt: true,
902
- description: intent === "extractStructuredData" ? "`extractStructuredDataFromDocx`: Mammoth plus required OOXML extractors (structure, headings, tables, blocks, pagesApprox, embeddedImages unless disabled), then `normalizeToStructuredResult`. Optional `options.docx` is forwarded." : `${String(intent)}: same structured DOCX pipeline as extractStructuredData, then \`@dragon708/docmind-markdown\` (browser-safe).`
903
- };
917
+ if (intent === "extractStructuredData") {
918
+ nativeExtraction = {
919
+ willAttempt: true,
920
+ description: "`extractStructuredDataFromDocx`: Mammoth plus required OOXML extractors (structure, headings, tables, blocks, pagesApprox, embeddedImages unless disabled), then `normalizeToStructuredResult`. Optional `options.docx` is forwarded."
921
+ };
922
+ } else if (intent === "extractMarkdown") {
923
+ nativeExtraction = {
924
+ willAttempt: true,
925
+ description: "extractMarkdown: `extractStructuredData` (Mammoth/OOXML) for a full structured envelope, then `extractMarkdown` in `@dragon708/docmind-markdown`. The package\u2019s DOCX-bytes Mammoth\u2192Turndown shortcut is Node-only; in-browser Markdown uses structured serialization on that envelope (with a package warning)."
926
+ };
927
+ } else {
928
+ nativeExtraction = {
929
+ willAttempt: true,
930
+ description: `${String(intent)}: same structured DOCX pipeline as extractStructuredData, then \`@dragon708/docmind-markdown\` (\`renderLlmText\` or \`renderMarkdownSections\`).`
931
+ };
932
+ }
904
933
  ocr = { mayUse: false, description: "DOCX structured path does not use OCR." };
905
934
  limitations = lim(DOCX_ZIP_NOTE_BROWSER);
906
935
  } else if (kind === "image") {
@@ -923,17 +952,21 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInclude, ocr
923
952
  if (intent === "extractMarkdown") {
924
953
  limitations = [
925
954
  ...limitations,
926
- ...lim("Output: Markdown string via renderMarkdown.")
955
+ ...lim(
956
+ "Output: Markdown string from `extractMarkdown` in `@dragon708/docmind-markdown`. PDF in-browser: empty (no `@opendataloader/pdf`). DOCX: structured Markdown path when the binary converter is Node-only."
957
+ )
927
958
  ];
928
959
  } else if (intent === "extractLlmContent") {
929
960
  limitations = [
930
961
  ...limitations,
931
- ...lim("Output: compact plain text via renderLlmText.")
962
+ ...lim("Output: compact plain text via `renderLlmText` (structured input only in this runtime).")
932
963
  ];
933
964
  } else if (intent === "extractStructuredChunks") {
934
965
  limitations = [
935
966
  ...limitations,
936
- ...lim("Output: MarkdownSection[] via renderMarkdownSections.")
967
+ ...lim(
968
+ "Output: MarkdownSection[] via `renderMarkdownSections` (`splitStructuredIntoChunks` / `extractStructuredChunks` alias)."
969
+ )
937
970
  ];
938
971
  }
939
972
  break;
@@ -1088,7 +1121,21 @@ function planForIntent(intentOpt, kind, ocrMode, docxInclude, ocr, analyzeFileOu
1088
1121
  };
1089
1122
  }
1090
1123
  }
1091
- if (intent === "extractMarkdown" || intent === "extractLlmContent" || intent === "extractStructuredChunks") {
1124
+ if (intent === "extractMarkdown") {
1125
+ const sub = planForIntent(
1126
+ "extractStructuredData",
1127
+ kind,
1128
+ ocrMode,
1129
+ docxInclude,
1130
+ ocr,
1131
+ analyzeFileOutput
1132
+ );
1133
+ return {
1134
+ intent,
1135
+ steps: [...sub.steps ?? [], { id: "markdown_hybrid_package", status: "planned" }]
1136
+ };
1137
+ }
1138
+ if (intent === "extractLlmContent" || intent === "extractStructuredChunks") {
1092
1139
  const sub = planForIntent(
1093
1140
  "extractStructuredData",
1094
1141
  kind,
@@ -1101,7 +1148,10 @@ function planForIntent(intentOpt, kind, ocrMode, docxInclude, ocr, analyzeFileOu
1101
1148
  intent,
1102
1149
  steps: [
1103
1150
  ...sub.steps ?? [],
1104
- { id: "docmind_markdown_render", status: "planned" }
1151
+ {
1152
+ id: intent === "extractLlmContent" ? "docmind_markdown_llm_text" : "docmind_markdown_chunk_sections",
1153
+ status: "planned"
1154
+ }
1105
1155
  ]
1106
1156
  };
1107
1157
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@dragon708/docmind-browser",
3
- "version": "1.7.0",
3
+ "version": "1.8.0",
4
4
  "description": "Official DocMind browser facade: analyzeFile and intent APIs (DOCX, image OCR, text). PDF and fs paths use @dragon708/docmind-node.",
5
5
  "type": "module",
6
6
  "sideEffects": false,
@@ -34,7 +34,7 @@
34
34
  "license": "MIT",
35
35
  "dependencies": {
36
36
  "@dragon708/docmind-docx": "^1.8.0",
37
- "@dragon708/docmind-markdown": "^1.0.0",
37
+ "@dragon708/docmind-markdown": "^1.1.0",
38
38
  "@dragon708/docmind-ocr": "^1.1.4",
39
39
  "@dragon708/docmind-shared": "^1.2.0"
40
40
  },