@dragon708/docmind-browser 1.6.0 → 1.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -1,9 +1,11 @@
1
- import { DocMindAnalyzeOptions, AnalyzeFileOutputOptions, AnalysisResult, NormalizeStructuredOptions, StructuredDocumentResult, FileKind, RuntimeDescriptor, DocMindPublicIntent, AnalysisAnalyzer, ProcessingPlanDescriptor, ExplainAnalysisPlanOptions, GetCapabilitiesOptions } from '@dragon708/docmind-shared';
1
+ import { DocMindAnalyzeOptions, AnalyzeFileOutputOptions, NormalizeStructuredOptions, AnalysisResult, StructuredDocumentResult, FileKind, RuntimeDescriptor, DocMindPublicIntent, AnalysisAnalyzer, ProcessingPlanDescriptor, ExplainAnalysisPlanOptions, GetCapabilitiesOptions } from '@dragon708/docmind-shared';
2
2
  export { AnalysisAnalyzer, AnalysisResult, CapabilityDescriptor, DetectFileKindInput, DocMindPublicIntent, DocumentBlock, DocumentImageRef, DocumentPage, DocumentTable, DocxAnalysisCoreResult, ExplainAnalysisPlanOptions, ExplainAnalysisPlanResult, FileKind, FileKindMetadata, GenericAnalysisResult, GetCapabilitiesOptions, GetCapabilitiesResult, ImageAnalysisCoreResult, PdfAnalysisCoreResult, StructuredDocumentResult, TextAnalysisResult, analyzeFileRequestsStructured, detectFileKind, isStructuredDocumentResult } from '@dragon708/docmind-shared';
3
- import { OcrOptions, OcrTiffOptions, PreprocessImageOptions } from '@dragon708/docmind-ocr';
4
- export { ExtractStructuredDataFromImageOptions, extractStructuredDataFromImage } from '@dragon708/docmind-ocr';
3
+ import { RenderLlmTextOptions, RenderMarkdownOptions, RenderMarkdownSectionsOptions, MarkdownSection } from '@dragon708/docmind-markdown';
4
+ export { MarkdownSection } from '@dragon708/docmind-markdown';
5
5
  import { AnalyzeDocxIncludeFlags, DocxToHtmlOptions } from '@dragon708/docmind-docx';
6
6
  export { AnalyzeDocxIncludeFlags, ExtractStructuredDataFromDocxOptions, extractStructuredDataFromDocx } from '@dragon708/docmind-docx';
7
+ import { OcrOptions, OcrTiffOptions, PreprocessImageOptions } from '@dragon708/docmind-ocr';
8
+ export { ExtractStructuredDataFromImageOptions, extractStructuredDataFromImage } from '@dragon708/docmind-ocr';
7
9
 
8
10
  /**
9
11
  * Opciones DOCX para el facade browser (Mammoth + inclusiones v2 de `@dragon708/docmind-docx`; sin APIs Node-only).
@@ -37,6 +39,27 @@ interface BrowserAnalyzeOptions extends DocMindAnalyzeOptions, AnalyzeFileOutput
37
39
  /** Solo DOCX: ver {@link BrowserAnalyzeDocxOptionsSlice}. */
38
40
  readonly docx?: BrowserAnalyzeDocxOptionsSlice;
39
41
  }
42
+ /** Options for {@link extractStructuredData}: same as {@link BrowserAnalyzeOptions} plus shared normalize knobs. */
43
+ type BrowserExtractStructuredDataOptions = BrowserAnalyzeOptions & {
44
+ readonly normalize?: NormalizeStructuredOptions;
45
+ };
46
+ /**
47
+ * {@link extractMarkdown}: structured options plus `markdown` (passed through to `extractMarkdown` in
48
+ * `@dragon708/docmind-markdown`, including structured-serializer knobs). Binary PDF/DOCX converters inside that
49
+ * package are not used for PDF in-browser and the DOCX bytes→Turndown path is Node-only; the browser still
50
+ * gets correct DOCX Markdown via structured fallback from {@link extractStructuredData}.
51
+ */
52
+ interface BrowserExtractMarkdownOptions extends BrowserExtractStructuredDataOptions {
53
+ readonly markdown?: RenderMarkdownOptions;
54
+ }
55
+ /** {@link extractLlmContent}: optional `llm` passed to `renderLlmText`. */
56
+ interface BrowserExtractLlmContentOptions extends BrowserExtractStructuredDataOptions {
57
+ readonly llm?: RenderLlmTextOptions;
58
+ }
59
+ /** {@link extractStructuredChunks}: optional `chunks` (split / section sizing). */
60
+ interface BrowserExtractStructuredChunksOptions extends BrowserExtractStructuredDataOptions {
61
+ readonly chunks?: RenderMarkdownSectionsOptions;
62
+ }
40
63
 
41
64
  /**
42
65
  * Inputs supported by the browser entry (DOM types only — no `fs`, no Node `Buffer` in the public surface).
@@ -90,10 +113,6 @@ declare function runOcr(input: BrowserAnalyzeInput, options?: BrowserAnalyzeOpti
90
113
  * PDF is not supported in this runtime (clear warnings, no PDF package import).
91
114
  */
92
115
 
93
- /** Options for {@link extractStructuredData}: same as {@link BrowserAnalyzeOptions} plus shared normalize knobs. */
94
- type BrowserExtractStructuredDataOptions = BrowserAnalyzeOptions & {
95
- readonly normalize?: NormalizeStructuredOptions;
96
- };
97
116
  /**
98
117
  * Returns a {@link StructuredDocumentResult} for inputs the browser runtime actually supports:
99
118
  * **DOCX** (`extractStructuredDataFromDocx`), **images** (`extractStructuredDataFromImage` when OCR is not off),
@@ -103,8 +122,34 @@ type BrowserExtractStructuredDataOptions = BrowserAnalyzeOptions & {
103
122
  */
104
123
  declare function extractStructuredData(input: BrowserAnalyzeInput, options?: BrowserExtractStructuredDataOptions): Promise<StructuredDocumentResult>;
105
124
 
125
+ /**
126
+ * {@link extractStructuredData} for a full structured envelope, then `extractMarkdown` from
127
+ * `@dragon708/docmind-markdown` on `{ data, filename?, mimeType? }` with that result as `structuredFallback`.
128
+ *
129
+ * - **PDF:** the markdown package does not load `@opendataloader/pdf` here; output comes from the structured
130
+ * fallback (empty in-browser stub — see {@link getCapabilities}).
131
+ * - **DOCX:** the package’s direct bytes → Mammoth → Turndown path is **Node-only**; in-browser, Markdown is
132
+ * produced via `convertStructuredToMarkdown` on the structured envelope (still Mammoth/OOXML-backed via
133
+ * `@dragon708/docmind-docx`), with an explanatory warning from the markdown package.
134
+ * - **Text / image:** unidentified or non-binary bytes use the same structured serializer.
135
+ *
136
+ * @param options - `markdown` options plus the same routing as {@link extractStructuredData} (`ocr`, `docx`, `normalize`).
137
+ */
138
+ declare function extractMarkdown(input: BrowserAnalyzeInput, options?: BrowserExtractMarkdownOptions): Promise<string>;
139
+ /**
140
+ * {@link extractStructuredData} then `renderLlmText` (`@dragon708/docmind-markdown`). For a structured value you already have, that package's `extractLlmContent` matches `renderLlmText` (no file I/O).
141
+ */
142
+ declare function extractLlmContent(input: BrowserAnalyzeInput, options?: BrowserExtractLlmContentOptions): Promise<string>;
143
+ /**
144
+ * Structured extract → `renderMarkdownSections` (`splitStructuredIntoChunks` with Markdown; same as
145
+ * `extractStructuredChunks` alias in `@dragon708/docmind-markdown`).
146
+ */
147
+ declare function extractStructuredChunks(input: BrowserAnalyzeInput, options?: BrowserExtractStructuredChunksOptions): Promise<readonly MarkdownSection[]>;
148
+
106
149
  /** High-level features the user can ask DocMind for (per input kind and runtime). */
107
- type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages" | "structured-output" | "image-normalization" | "gif-first-frame" | "bmp" | "heic" | "tiff";
150
+ type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages" | "structured-output"
151
+ /** Browser: {@link extractMarkdown} via `@dragon708/docmind-markdown` `extractMarkdown` + structured fallback (PDF empty; DOCX structured path when binary converter is Node-only). */
152
+ | "markdown" | "llm-text" | "structured-chunks" | "image-normalization" | "gif-first-frame" | "bmp" | "heic" | "tiff";
108
153
  declare function docxIncludeRequested(flags?: AnalyzeDocxIncludeFlags): boolean;
109
154
  /** DOCX `word/media` en runtime browser (JSZip; sin pipeline Node). */
110
155
  interface DocxEmbeddedImageCapabilities {
@@ -176,12 +221,17 @@ interface ExplainAnalysisPlanReport {
176
221
  readonly warnings?: readonly string[];
177
222
  }
178
223
 
224
+ /** Browser facade intents that run `@dragon708/docmind-markdown` after structured extraction. */
225
+ type BrowserMarkdownFacadeIntent = "extractMarkdown" | "extractLlmContent" | "extractStructuredChunks";
179
226
  /** Options for {@link explainAnalysisPlan}: shared fields plus optional `ocr` / `docx` for accurate step preview. */
180
- type BrowserExplainAnalysisPlanOptions = ExplainAnalysisPlanOptions & Pick<BrowserAnalyzeOptions, "ocr" | "docx" | "structuredOutput" | "output">;
227
+ type BrowserExplainAnalysisPlanOptions = Omit<ExplainAnalysisPlanOptions, "intent"> & Pick<BrowserAnalyzeOptions, "ocr" | "docx" | "structuredOutput" | "output"> & {
228
+ readonly intent?: DocMindPublicIntent | BrowserMarkdownFacadeIntent;
229
+ };
181
230
 
182
231
  /**
183
232
  * Epic 1 — **Capabilities:** detects kind from the same hints as `analyzeFile`, then lists which of
184
- * `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output` and image-specific ids (`image-normalization`, `bmp`, `gif-first-frame`, `heic`, `tiff`) apply in the browser (PDF always unsupported).
233
+ * `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output` | `markdown` (package `extractMarkdown` + structured fallback; PDF empty here) | `llm-text` | `structured-chunks` (split + Markdown sections)
234
+ * and image-specific ids (`image-normalization`, `bmp`, `gif-first-frame`, `heic`, `tiff`) apply in the browser (PDF always unsupported for meaningful content).
185
235
  * No Mammoth/Tesseract/PDF parsing. For DOCX, {@link GetCapabilitiesReport.docxStructure} / `docxEmbeddedImages` describe v2 opt-in features.
186
236
  */
187
237
  declare function getCapabilities(input: BrowserAnalyzeInput, options?: GetCapabilitiesOptions): Promise<GetCapabilitiesReport>;
@@ -191,4 +241,4 @@ declare function getCapabilities(input: BrowserAnalyzeInput, options?: GetCapabi
191
241
  */
192
242
  declare function explainAnalysisPlan(input: BrowserAnalyzeInput, options?: BrowserExplainAnalysisPlanOptions): Promise<ExplainAnalysisPlanReport>;
193
243
 
194
- export { BROWSER_PDF_STRUCTURED_UNSUPPORTED_WARNING, BROWSER_PDF_UNSUPPORTED_WARNING, type BrowserAnalyzeDocxOptionsSlice, type BrowserAnalyzeInput, type BrowserAnalyzeOptions, type BrowserExplainAnalysisPlanOptions, type BrowserExtractStructuredDataOptions, type BrowserOcrMode, type BrowserOcrOptions, DOCX_EMBEDDED_IMAGE_CAPABILITIES_BROWSER, DOCX_STRUCTURE_CAPABILITIES_BROWSER, type DocxEmbeddedImageCapabilities, type DocxStructuralCapabilities, type ExplainAnalysisPlanReport, type GetCapabilitiesReport, type NativeExtractionPlan, type OcrPlan, type PublicCapabilityId, type PublicCapabilitySupport, analyzeFile, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractStructuredData, extractText, getCapabilities, runOcr };
244
+ export { BROWSER_PDF_STRUCTURED_UNSUPPORTED_WARNING, BROWSER_PDF_UNSUPPORTED_WARNING, type BrowserAnalyzeDocxOptionsSlice, type BrowserAnalyzeInput, type BrowserAnalyzeOptions, type BrowserExplainAnalysisPlanOptions, type BrowserExtractLlmContentOptions, type BrowserExtractMarkdownOptions, type BrowserExtractStructuredChunksOptions, type BrowserExtractStructuredDataOptions, type BrowserMarkdownFacadeIntent, type BrowserOcrMode, type BrowserOcrOptions, DOCX_EMBEDDED_IMAGE_CAPABILITIES_BROWSER, DOCX_STRUCTURE_CAPABILITIES_BROWSER, type DocxEmbeddedImageCapabilities, type DocxStructuralCapabilities, type ExplainAnalysisPlanReport, type GetCapabilitiesReport, type NativeExtractionPlan, type OcrPlan, type PublicCapabilityId, type PublicCapabilitySupport, analyzeFile, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractLlmContent, extractMarkdown, extractMetadata, extractStructuredChunks, extractStructuredData, extractText, getCapabilities, runOcr };
package/dist/index.js CHANGED
@@ -4,6 +4,7 @@ import { extractStructuredDataFromDocx, analyzeDocx } from '@dragon708/docmind-d
4
4
  export { extractStructuredDataFromDocx } from '@dragon708/docmind-docx';
5
5
  import { extractStructuredDataFromImage, preprocessHasEffect, resolveImageFormat, normalizeImageForOcr, ocrTiff, ocrImageDetailed } from '@dragon708/docmind-ocr';
6
6
  export { extractStructuredDataFromImage } from '@dragon708/docmind-ocr';
7
+ import { extractMarkdown as extractMarkdown$1, renderLlmText, renderMarkdownSections } from '@dragon708/docmind-markdown';
7
8
 
8
9
  // src/analyzeFile.ts
9
10
  function assertBrowserInput(input) {
@@ -519,6 +520,39 @@ async function runOcr(input, options) {
519
520
  return notImplementedResult(kind, "none", [UNKNOWN_FORMAT_WARNING]);
520
521
  }
521
522
  }
523
+ function browserFileHints(input) {
524
+ if (input instanceof File) {
525
+ return {
526
+ filename: input.name,
527
+ mimeType: input.type ? input.type : void 0
528
+ };
529
+ }
530
+ return {};
531
+ }
532
/**
 * Browser Markdown export.
 *
 * Runs `extractStructuredData` first, then hands the raw bytes plus any
 * filename / MIME hints to the markdown package's `extractMarkdown`, passing
 * the structured result as `structuredFallback`.
 *
 * @param input - Browser input (same value accepted by `extractStructuredData`).
 * @param options - Structured-extraction options; the `markdown` slice is split
 *   off and forwarded to the markdown package instead.
 * @returns The `markdown` string of the package result.
 */
async function extractMarkdown(input, options) {
  throwIfAborted(options?.signal);

  // Everything except `markdown` belongs to the structured extraction step.
  const { markdown: rendererOptions, ...extractionOptions } = options ?? {};

  const structuredFallback = await extractStructuredData(input, extractionOptions);
  const bytes = await toUint8Array(input);
  const { filename, mimeType } = browserFileHints(input);

  const source = { data: bytes, filename, mimeType };
  // Caller-supplied renderer options first; the fallback is always set last.
  const packageOptions = { ...rendererOptions, structuredFallback };

  const result = await extractMarkdown$1(source, packageOptions);
  return result.markdown;
}
544
/**
 * Browser LLM-text export: structured extraction followed by `renderLlmText`.
 *
 * @param input - Browser input (same value accepted by `extractStructuredData`).
 * @param options - Structured-extraction options; the `llm` slice is split off
 *   and passed to `renderLlmText`.
 * @returns Whatever `renderLlmText` produces for the structured result.
 */
async function extractLlmContent(input, options) {
  throwIfAborted(options?.signal);

  // `llm` is renderer-only; the remainder drives the structured extraction.
  const { llm: rendererOptions, ...extractionOptions } = options ?? {};
  const envelope = await extractStructuredData(input, extractionOptions);

  return renderLlmText(envelope, rendererOptions);
}
550
/**
 * Browser chunk export: structured extraction followed by
 * `renderMarkdownSections`.
 *
 * @param input - Browser input (same value accepted by `extractStructuredData`).
 * @param options - Structured-extraction options; the `chunks` slice is split
 *   off and passed to `renderMarkdownSections`.
 * @returns Whatever `renderMarkdownSections` produces for the structured result.
 */
async function extractStructuredChunks(input, options) {
  throwIfAborted(options?.signal);

  // `chunks` is renderer-only; the remainder drives the structured extraction.
  const { chunks: sectionOptions, ...extractionOptions } = options ?? {};
  const envelope = await extractStructuredData(input, extractionOptions);

  return renderMarkdownSections(envelope, sectionOptions);
}
522
556
 
523
557
  // src/capabilityReport.ts
524
558
  function docxIncludeRequested(flags) {
@@ -557,6 +591,8 @@ var TEXT_META_NOTE = "Plain text has no structured document metadata; extractMet
557
591
  var OCR_OFF_NOTE = 'Image OCR may be skipped when `ocr.mode` is "off" in analyze options.';
558
592
  var STRUCTURED_OCR_OFF = 'Structured image output uses OCR; when `ocr.mode` is "off", `extractStructuredData` returns an empty envelope with a warning.';
559
593
  var UNKNOWN_KIND = "Could not determine file kind from name, MIME, or bytes; all features are reported as unsupported until the kind is known.";
594
+ var MARKDOWN_PDF_BROWSER = "PDF: no parser in-browser \u2014 `@opendataloader/pdf` is not loaded here. extractMarkdown still calls `extractMarkdown` in `@dragon708/docmind-markdown`, which falls back to the empty structured stub (same empty Markdown). extractLlmContent / extractStructuredChunks use the structured envelope only. Use @dragon708/docmind-node for real PDF \u2192 Markdown / LLM text / chunks.";
595
+ var MARKDOWN_IMAGE_OCR_OFF = 'Same as structured-output: when ocr.mode is "off", structured (and thus Markdown/LLM/chunk exports) are empty aside from warnings.';
560
596
  function slot(id, supported, warnings) {
561
597
  return warnings?.length ? { id, supported, warnings } : { id, supported };
562
598
  }
@@ -573,7 +609,10 @@ function buildBrowserCapabilityReport(kind) {
573
609
  slot("html", false, [pdf]),
574
610
  slot("ocr", false, [pdf]),
575
611
  slot("pages", false, [pdf]),
576
- slot("structured-output", false, [BROWSER_PDF_STRUCTURED_UNSUPPORTED_WARNING])
612
+ slot("structured-output", false, [BROWSER_PDF_STRUCTURED_UNSUPPORTED_WARNING]),
613
+ slot("markdown", false, [MARKDOWN_PDF_BROWSER]),
614
+ slot("llm-text", false, [MARKDOWN_PDF_BROWSER]),
615
+ slot("structured-chunks", false, [MARKDOWN_PDF_BROWSER])
577
616
  ];
578
617
  break;
579
618
  case "docx":
@@ -593,6 +632,15 @@ function buildBrowserCapabilityReport(kind) {
593
632
  ]),
594
633
  slot("structured-output", true, [
595
634
  "`extractStructuredData` uses `@dragon708/docmind-docx` (Mammoth + OOXML) and returns `StructuredDocumentResult`; optional `options.docx` slices are forwarded."
635
+ ]),
636
+ slot("markdown", true, [
637
+ "extractMarkdown: `@dragon708/docmind-markdown` `extractMarkdown` on bytes + structured fallback. The package\u2019s DOCX-bytes Mammoth\u2192Turndown path is Node-only; in-browser, Markdown is produced from `extractStructuredData` (Mammoth/OOXML in `@dragon708/docmind-docx`) via structured serialization, with a clear package warning that the binary shortcut is skipped."
638
+ ]),
639
+ slot("llm-text", true, [
640
+ "extractLlmContent: structured envelope \u2192 `renderLlmText` (LLM-ready plain text; no binary PDF/DOCX Markdown routes)."
641
+ ]),
642
+ slot("structured-chunks", true, [
643
+ "extractStructuredChunks: structured \u2192 `renderMarkdownSections` / `splitStructuredIntoChunks` (heading-aware chunking + optional parallel `text`)."
596
644
  ])
597
645
  ];
598
646
  break;
@@ -629,6 +677,18 @@ function buildBrowserCapabilityReport(kind) {
629
677
  "`extractStructuredData` uses `extractStructuredDataFromImage` (same OCR path as analyzeFile when `ocr.mode` is not off).",
630
678
  STRUCTURED_OCR_OFF,
631
679
  "HEIC/HEIF and TIFF limitations match `getCapabilities` (`heic`, `tiff`) and OCR warnings."
680
+ ]),
681
+ slot("markdown", true, [
682
+ "extractMarkdown: same bytes + structured fallback through package `extractMarkdown` when applicable; OCR structured layout \u2192 Markdown when OCR runs. HEIC unsupported; TIFF best-effort.",
683
+ MARKDOWN_IMAGE_OCR_OFF
684
+ ]),
685
+ slot("llm-text", true, [
686
+ "extractLlmContent: OCR structured \u2192 LLM plain text under the same OCR and format limits.",
687
+ MARKDOWN_IMAGE_OCR_OFF
688
+ ]),
689
+ slot("structured-chunks", true, [
690
+ "extractStructuredChunks: OCR structured \u2192 sectioned Markdown; empty when OCR is off or HEIC.",
691
+ MARKDOWN_IMAGE_OCR_OFF
632
692
  ])
633
693
  ];
634
694
  break;
@@ -641,6 +701,15 @@ function buildBrowserCapabilityReport(kind) {
641
701
  slot("pages", false),
642
702
  slot("structured-output", true, [
643
703
  "`extractStructuredData` decodes UTF-8 (via `analyzeText`) and normalizes to `StructuredDocumentResult` (paragraph block rollup)."
704
+ ]),
705
+ slot("markdown", true, [
706
+ "extractMarkdown: bytes + structured fallback through `@dragon708/docmind-markdown` `extractMarkdown` (typically structured serializer for UTF-8 text)."
707
+ ]),
708
+ slot("llm-text", true, [
709
+ "extractLlmContent: UTF-8 structured rollup \u2192 `renderLlmText` in `@dragon708/docmind-markdown`."
710
+ ]),
711
+ slot("structured-chunks", true, [
712
+ "extractStructuredChunks: typically one Markdown section when only paragraph rollup exists."
644
713
  ])
645
714
  ];
646
715
  break;
@@ -652,7 +721,10 @@ function buildBrowserCapabilityReport(kind) {
652
721
  slot("html", false),
653
722
  slot("ocr", false),
654
723
  slot("pages", false),
655
- slot("structured-output", false, [UNKNOWN_KIND])
724
+ slot("structured-output", false, [UNKNOWN_KIND]),
725
+ slot("markdown", false, [UNKNOWN_KIND]),
726
+ slot("llm-text", false, [UNKNOWN_KIND]),
727
+ slot("structured-chunks", false, [UNKNOWN_KIND])
656
728
  ];
657
729
  }
658
730
  return {
@@ -695,9 +767,10 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInclude, ocr
695
767
  let limitations = [];
696
768
  const ocrOffNote = ocrMode === "off" ? 'Image OCR is skipped when ocr.mode is "off".' : "";
697
769
  if (kind === "pdf") {
770
+ const structuredLikeIntent = intent === "extractStructuredData" || intent === "extractMarkdown" || intent === "extractLlmContent" || intent === "extractStructuredChunks";
698
771
  limitations = lim(
699
772
  BROWSER_PDF_UNSUPPORTED_WARNING,
700
- intent === "extractStructuredData" ? "`extractStructuredData` only returns an empty `StructuredDocumentResult` with warnings for PDF in-browser; use @dragon708/docmind-node for real PDF structured extraction." : ""
773
+ structuredLikeIntent ? "`extractStructuredData` / extractMarkdown / extractLlmContent / extractStructuredChunks only see an empty structured envelope in-browser for PDF; use @dragon708/docmind-node for real PDF extraction and Markdown/LLM/chunk exports." : ""
701
774
  );
702
775
  nativeExtraction = {
703
776
  willAttempt: false,
@@ -721,9 +794,10 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInclude, ocr
721
794
  });
722
795
  }
723
796
  if (kind === "unknown") {
797
+ const structuredLikeIntent = intent === "extractStructuredData" || intent === "extractMarkdown" || intent === "extractLlmContent" || intent === "extractStructuredChunks";
724
798
  limitations = lim(
725
799
  "Could not classify the file from name, MIME, or bytes; analysis will return not_implemented until hints improve.",
726
- intent === "extractStructuredData" ? "`extractStructuredData` needs a known kind (text, DOCX, or image) to produce structured output." : ""
800
+ structuredLikeIntent ? "Structured and Markdown/LLM/chunk exports need a known kind (text, DOCX, or image) in the browser runtime." : ""
727
801
  );
728
802
  nativeExtraction = { willAttempt: false, description: "No analyzer selected without a known file kind." };
729
803
  ocr = { mayUse: false, description: "OCR is not used for unknown kinds." };
@@ -836,11 +910,26 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInclude, ocr
836
910
  }
837
911
  break;
838
912
  case "extractStructuredData":
913
+ case "extractMarkdown":
914
+ case "extractLlmContent":
915
+ case "extractStructuredChunks":
839
916
  if (kind === "docx") {
840
- nativeExtraction = {
841
- willAttempt: true,
842
- description: "`extractStructuredDataFromDocx`: Mammoth plus required OOXML extractors (structure, headings, tables, blocks, pagesApprox, embeddedImages unless disabled), then `normalizeToStructuredResult`. Optional `options.docx` is forwarded."
843
- };
917
+ if (intent === "extractStructuredData") {
918
+ nativeExtraction = {
919
+ willAttempt: true,
920
+ description: "`extractStructuredDataFromDocx`: Mammoth plus required OOXML extractors (structure, headings, tables, blocks, pagesApprox, embeddedImages unless disabled), then `normalizeToStructuredResult`. Optional `options.docx` is forwarded."
921
+ };
922
+ } else if (intent === "extractMarkdown") {
923
+ nativeExtraction = {
924
+ willAttempt: true,
925
+ description: "extractMarkdown: `extractStructuredData` (Mammoth/OOXML) for a full structured envelope, then `extractMarkdown` in `@dragon708/docmind-markdown`. The package\u2019s DOCX-bytes Mammoth\u2192Turndown shortcut is Node-only; in-browser Markdown uses structured serialization on that envelope (with a package warning)."
926
+ };
927
+ } else {
928
+ nativeExtraction = {
929
+ willAttempt: true,
930
+ description: `${String(intent)}: same structured DOCX pipeline as extractStructuredData, then \`@dragon708/docmind-markdown\` (\`renderLlmText\` or \`renderMarkdownSections\`).`
931
+ };
932
+ }
844
933
  ocr = { mayUse: false, description: "DOCX structured path does not use OCR." };
845
934
  limitations = lim(DOCX_ZIP_NOTE_BROWSER);
846
935
  } else if (kind === "image") {
@@ -856,10 +945,30 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInclude, ocr
856
945
  } else {
857
946
  nativeExtraction = {
858
947
  willAttempt: true,
859
- description: "UTF-8 decode via `analyzeText`, then `normalizeToStructuredResult` with a paragraph block rollup."
948
+ description: intent === "extractStructuredData" ? "UTF-8 decode via `analyzeText`, then `normalizeToStructuredResult` with a paragraph block rollup." : `${String(intent)}: UTF-8 structured envelope, then \`@dragon708/docmind-markdown\`.`
860
949
  };
861
950
  ocr = { mayUse: false, description: "OCR does not apply to text files." };
862
951
  }
952
+ if (intent === "extractMarkdown") {
953
+ limitations = [
954
+ ...limitations,
955
+ ...lim(
956
+ "Output: Markdown string from `extractMarkdown` in `@dragon708/docmind-markdown`. PDF in-browser: empty (no `@opendataloader/pdf`). DOCX: structured Markdown path when the binary converter is Node-only."
957
+ )
958
+ ];
959
+ } else if (intent === "extractLlmContent") {
960
+ limitations = [
961
+ ...limitations,
962
+ ...lim("Output: compact plain text via `renderLlmText` (structured input only in this runtime).")
963
+ ];
964
+ } else if (intent === "extractStructuredChunks") {
965
+ limitations = [
966
+ ...limitations,
967
+ ...lim(
968
+ "Output: MarkdownSection[] via `renderMarkdownSections` (`splitStructuredIntoChunks` / `extractStructuredChunks` alias)."
969
+ )
970
+ ];
971
+ }
863
972
  break;
864
973
  default:
865
974
  nativeExtraction = { willAttempt: false, description: "Intent not specialized in this runtime." };
@@ -1012,6 +1121,40 @@ function planForIntent(intentOpt, kind, ocrMode, docxInclude, ocr, analyzeFileOu
1012
1121
  };
1013
1122
  }
1014
1123
  }
1124
+ if (intent === "extractMarkdown") {
1125
+ const sub = planForIntent(
1126
+ "extractStructuredData",
1127
+ kind,
1128
+ ocrMode,
1129
+ docxInclude,
1130
+ ocr,
1131
+ analyzeFileOutput
1132
+ );
1133
+ return {
1134
+ intent,
1135
+ steps: [...sub.steps ?? [], { id: "markdown_hybrid_package", status: "planned" }]
1136
+ };
1137
+ }
1138
+ if (intent === "extractLlmContent" || intent === "extractStructuredChunks") {
1139
+ const sub = planForIntent(
1140
+ "extractStructuredData",
1141
+ kind,
1142
+ ocrMode,
1143
+ docxInclude,
1144
+ ocr,
1145
+ analyzeFileOutput
1146
+ );
1147
+ return {
1148
+ intent,
1149
+ steps: [
1150
+ ...sub.steps ?? [],
1151
+ {
1152
+ id: intent === "extractLlmContent" ? "docmind_markdown_llm_text" : "docmind_markdown_chunk_sections",
1153
+ status: "planned"
1154
+ }
1155
+ ]
1156
+ };
1157
+ }
1015
1158
  if (intent === "analyzeFile") {
1016
1159
  const base = planForAnalyzeFile(kind, ocrMode, docxInclude, ocr);
1017
1160
  if (!analyzeFileRequestsStructured(analyzeFileOutput)) return base;
@@ -1121,6 +1264,6 @@ async function explainAnalysisPlan(input, options) {
1121
1264
  return buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInc, ocrSlice);
1122
1265
  }
1123
1266
 
1124
- export { BROWSER_PDF_STRUCTURED_UNSUPPORTED_WARNING, BROWSER_PDF_UNSUPPORTED_WARNING, DOCX_EMBEDDED_IMAGE_CAPABILITIES_BROWSER, DOCX_STRUCTURE_CAPABILITIES_BROWSER, analyzeFile, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractStructuredData, extractText, getCapabilities, runOcr };
1267
+ export { BROWSER_PDF_STRUCTURED_UNSUPPORTED_WARNING, BROWSER_PDF_UNSUPPORTED_WARNING, DOCX_EMBEDDED_IMAGE_CAPABILITIES_BROWSER, DOCX_STRUCTURE_CAPABILITIES_BROWSER, analyzeFile, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractLlmContent, extractMarkdown, extractMetadata, extractStructuredChunks, extractStructuredData, extractText, getCapabilities, runOcr };
1125
1268
  //# sourceMappingURL=index.js.map
1126
1269
  //# sourceMappingURL=index.js.map
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@dragon708/docmind-browser",
3
- "version": "1.6.0",
3
+ "version": "1.8.0",
4
4
  "description": "Official DocMind browser facade: analyzeFile and intent APIs (DOCX, image OCR, text). PDF and fs paths use @dragon708/docmind-node.",
5
5
  "type": "module",
6
6
  "sideEffects": false,
@@ -34,6 +34,7 @@
34
34
  "license": "MIT",
35
35
  "dependencies": {
36
36
  "@dragon708/docmind-docx": "^1.8.0",
37
+ "@dragon708/docmind-markdown": "^1.1.0",
37
38
  "@dragon708/docmind-ocr": "^1.1.4",
38
39
  "@dragon708/docmind-shared": "^1.2.0"
39
40
  },