@dragon708/docmind-browser 1.6.0 → 1.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +61 -11
- package/dist/index.js +153 -10
- package/package.json +2 -1
package/dist/index.d.ts
CHANGED
|
@@ -1,9 +1,11 @@
|
|
|
1
|
-
import { DocMindAnalyzeOptions, AnalyzeFileOutputOptions,
|
|
1
|
+
import { DocMindAnalyzeOptions, AnalyzeFileOutputOptions, NormalizeStructuredOptions, AnalysisResult, StructuredDocumentResult, FileKind, RuntimeDescriptor, DocMindPublicIntent, AnalysisAnalyzer, ProcessingPlanDescriptor, ExplainAnalysisPlanOptions, GetCapabilitiesOptions } from '@dragon708/docmind-shared';
|
|
2
2
|
export { AnalysisAnalyzer, AnalysisResult, CapabilityDescriptor, DetectFileKindInput, DocMindPublicIntent, DocumentBlock, DocumentImageRef, DocumentPage, DocumentTable, DocxAnalysisCoreResult, ExplainAnalysisPlanOptions, ExplainAnalysisPlanResult, FileKind, FileKindMetadata, GenericAnalysisResult, GetCapabilitiesOptions, GetCapabilitiesResult, ImageAnalysisCoreResult, PdfAnalysisCoreResult, StructuredDocumentResult, TextAnalysisResult, analyzeFileRequestsStructured, detectFileKind, isStructuredDocumentResult } from '@dragon708/docmind-shared';
|
|
3
|
-
import {
|
|
4
|
-
export {
|
|
3
|
+
import { RenderLlmTextOptions, RenderMarkdownOptions, RenderMarkdownSectionsOptions, MarkdownSection } from '@dragon708/docmind-markdown';
|
|
4
|
+
export { MarkdownSection } from '@dragon708/docmind-markdown';
|
|
5
5
|
import { AnalyzeDocxIncludeFlags, DocxToHtmlOptions } from '@dragon708/docmind-docx';
|
|
6
6
|
export { AnalyzeDocxIncludeFlags, ExtractStructuredDataFromDocxOptions, extractStructuredDataFromDocx } from '@dragon708/docmind-docx';
|
|
7
|
+
import { OcrOptions, OcrTiffOptions, PreprocessImageOptions } from '@dragon708/docmind-ocr';
|
|
8
|
+
export { ExtractStructuredDataFromImageOptions, extractStructuredDataFromImage } from '@dragon708/docmind-ocr';
|
|
7
9
|
|
|
8
10
|
/**
|
|
9
11
|
* Opciones DOCX para el facade browser (Mammoth + inclusiones v2 de `@dragon708/docmind-docx`; sin APIs Node-only).
|
|
@@ -37,6 +39,27 @@ interface BrowserAnalyzeOptions extends DocMindAnalyzeOptions, AnalyzeFileOutput
|
|
|
37
39
|
/** Solo DOCX: ver {@link BrowserAnalyzeDocxOptionsSlice}. */
|
|
38
40
|
readonly docx?: BrowserAnalyzeDocxOptionsSlice;
|
|
39
41
|
}
|
|
42
|
+
/** Options for {@link extractStructuredData}: same as {@link BrowserAnalyzeOptions} plus shared normalize knobs. */
|
|
43
|
+
type BrowserExtractStructuredDataOptions = BrowserAnalyzeOptions & {
|
|
44
|
+
readonly normalize?: NormalizeStructuredOptions;
|
|
45
|
+
};
|
|
46
|
+
/**
|
|
47
|
+
* {@link extractMarkdown}: structured options plus `markdown` (passed through to `extractMarkdown` in
|
|
48
|
+
* `@dragon708/docmind-markdown`, including structured-serializer knobs). Binary PDF/DOCX converters inside that
|
|
49
|
+
* package are not used for PDF in-browser and the DOCX bytes→Turndown path is Node-only; the browser still
|
|
50
|
+
* gets correct DOCX Markdown via structured fallback from {@link extractStructuredData}.
|
|
51
|
+
*/
|
|
52
|
+
interface BrowserExtractMarkdownOptions extends BrowserExtractStructuredDataOptions {
|
|
53
|
+
readonly markdown?: RenderMarkdownOptions;
|
|
54
|
+
}
|
|
55
|
+
/** {@link extractLlmContent}: optional `llm` passed to `renderLlmText`. */
|
|
56
|
+
interface BrowserExtractLlmContentOptions extends BrowserExtractStructuredDataOptions {
|
|
57
|
+
readonly llm?: RenderLlmTextOptions;
|
|
58
|
+
}
|
|
59
|
+
/** {@link extractStructuredChunks}: optional `chunks` (split / section sizing). */
|
|
60
|
+
interface BrowserExtractStructuredChunksOptions extends BrowserExtractStructuredDataOptions {
|
|
61
|
+
readonly chunks?: RenderMarkdownSectionsOptions;
|
|
62
|
+
}
|
|
40
63
|
|
|
41
64
|
/**
|
|
42
65
|
* Inputs supported by the browser entry (DOM types only — no `fs`, no Node `Buffer` in the public surface).
|
|
@@ -90,10 +113,6 @@ declare function runOcr(input: BrowserAnalyzeInput, options?: BrowserAnalyzeOpti
|
|
|
90
113
|
* PDF is not supported in this runtime (clear warnings, no PDF package import).
|
|
91
114
|
*/
|
|
92
115
|
|
|
93
|
-
/** Options for {@link extractStructuredData}: same as {@link BrowserAnalyzeOptions} plus shared normalize knobs. */
|
|
94
|
-
type BrowserExtractStructuredDataOptions = BrowserAnalyzeOptions & {
|
|
95
|
-
readonly normalize?: NormalizeStructuredOptions;
|
|
96
|
-
};
|
|
97
116
|
/**
|
|
98
117
|
* Returns a {@link StructuredDocumentResult} for inputs the browser runtime actually supports:
|
|
99
118
|
* **DOCX** (`extractStructuredDataFromDocx`), **images** (`extractStructuredDataFromImage` when OCR is not off),
|
|
@@ -103,8 +122,34 @@ type BrowserExtractStructuredDataOptions = BrowserAnalyzeOptions & {
|
|
|
103
122
|
*/
|
|
104
123
|
declare function extractStructuredData(input: BrowserAnalyzeInput, options?: BrowserExtractStructuredDataOptions): Promise<StructuredDocumentResult>;
|
|
105
124
|
|
|
125
|
+
/**
|
|
126
|
+
* {@link extractStructuredData} for a full structured envelope, then `extractMarkdown` from
|
|
127
|
+
* `@dragon708/docmind-markdown` on `{ data, filename?, mimeType? }` with that result as `structuredFallback`.
|
|
128
|
+
*
|
|
129
|
+
* - **PDF:** the markdown package does not load `@opendataloader/pdf` here; output comes from the structured
|
|
130
|
+
* fallback (empty in-browser stub — see {@link getCapabilities}).
|
|
131
|
+
* - **DOCX:** the package’s direct bytes → Mammoth → Turndown path is **Node-only**; in-browser, Markdown is
|
|
132
|
+
* produced via `convertStructuredToMarkdown` on the structured envelope (still Mammoth/OOXML-backed via
|
|
133
|
+
* `@dragon708/docmind-docx`), with an explanatory warning from the markdown package.
|
|
134
|
+
* - **Text / image:** unidentified or non-binary bytes use the same structured serializer.
|
|
135
|
+
*
|
|
136
|
+
* @param options - `markdown` options plus the same routing as {@link extractStructuredData} (`ocr`, `docx`, `normalize`).
|
|
137
|
+
*/
|
|
138
|
+
declare function extractMarkdown(input: BrowserAnalyzeInput, options?: BrowserExtractMarkdownOptions): Promise<string>;
|
|
139
|
+
/**
|
|
140
|
+
* {@link extractStructuredData} then `renderLlmText` (`@dragon708/docmind-markdown`). For a structured value you already have, that package's `extractLlmContent` matches `renderLlmText` (no file I/O).
|
|
141
|
+
*/
|
|
142
|
+
declare function extractLlmContent(input: BrowserAnalyzeInput, options?: BrowserExtractLlmContentOptions): Promise<string>;
|
|
143
|
+
/**
|
|
144
|
+
* Structured extract → `renderMarkdownSections` (`splitStructuredIntoChunks` with Markdown; same as
|
|
145
|
+
* `extractStructuredChunks` alias in `@dragon708/docmind-markdown`).
|
|
146
|
+
*/
|
|
147
|
+
declare function extractStructuredChunks(input: BrowserAnalyzeInput, options?: BrowserExtractStructuredChunksOptions): Promise<readonly MarkdownSection[]>;
|
|
148
|
+
|
|
106
149
|
/** High-level features the user can ask DocMind for (per input kind and runtime). */
|
|
107
|
-
type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages" | "structured-output"
|
|
150
|
+
type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages" | "structured-output"
|
|
151
|
+
/** Browser: {@link extractMarkdown} via `@dragon708/docmind-markdown` `extractMarkdown` + structured fallback (PDF empty; DOCX structured path when binary converter is Node-only). */
|
|
152
|
+
| "markdown" | "llm-text" | "structured-chunks" | "image-normalization" | "gif-first-frame" | "bmp" | "heic" | "tiff";
|
|
108
153
|
declare function docxIncludeRequested(flags?: AnalyzeDocxIncludeFlags): boolean;
|
|
109
154
|
/** DOCX `word/media` en runtime browser (JSZip; sin pipeline Node). */
|
|
110
155
|
interface DocxEmbeddedImageCapabilities {
|
|
@@ -176,12 +221,17 @@ interface ExplainAnalysisPlanReport {
|
|
|
176
221
|
readonly warnings?: readonly string[];
|
|
177
222
|
}
|
|
178
223
|
|
|
224
|
+
/** Browser facade intents that run `@dragon708/docmind-markdown` after structured extraction. */
|
|
225
|
+
type BrowserMarkdownFacadeIntent = "extractMarkdown" | "extractLlmContent" | "extractStructuredChunks";
|
|
179
226
|
/** Options for {@link explainAnalysisPlan}: shared fields plus optional `ocr` / `docx` for accurate step preview. */
|
|
180
|
-
type BrowserExplainAnalysisPlanOptions = ExplainAnalysisPlanOptions & Pick<BrowserAnalyzeOptions, "ocr" | "docx" | "structuredOutput" | "output">;
|
|
227
|
+
type BrowserExplainAnalysisPlanOptions = Omit<ExplainAnalysisPlanOptions, "intent"> & Pick<BrowserAnalyzeOptions, "ocr" | "docx" | "structuredOutput" | "output"> & {
|
|
228
|
+
readonly intent?: DocMindPublicIntent | BrowserMarkdownFacadeIntent;
|
|
229
|
+
};
|
|
181
230
|
|
|
182
231
|
/**
|
|
183
232
|
* Epic 1 — **Capabilities:** detects kind from the same hints as `analyzeFile`, then lists which of
|
|
184
|
-
* `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output`
|
|
233
|
+
* `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output` | `markdown` (package `extractMarkdown` + structured fallback; PDF empty here) | `llm-text` | `structured-chunks` (split + Markdown sections)
|
|
234
|
+
* and image-specific ids (`image-normalization`, `bmp`, `gif-first-frame`, `heic`, `tiff`) apply in the browser (PDF always unsupported for meaningful content).
|
|
185
235
|
* No Mammoth/Tesseract/PDF parsing. For DOCX, {@link GetCapabilitiesReport.docxStructure} / `docxEmbeddedImages` describe v2 opt-in features.
|
|
186
236
|
*/
|
|
187
237
|
declare function getCapabilities(input: BrowserAnalyzeInput, options?: GetCapabilitiesOptions): Promise<GetCapabilitiesReport>;
|
|
@@ -191,4 +241,4 @@ declare function getCapabilities(input: BrowserAnalyzeInput, options?: GetCapabi
|
|
|
191
241
|
*/
|
|
192
242
|
declare function explainAnalysisPlan(input: BrowserAnalyzeInput, options?: BrowserExplainAnalysisPlanOptions): Promise<ExplainAnalysisPlanReport>;
|
|
193
243
|
|
|
194
|
-
export { BROWSER_PDF_STRUCTURED_UNSUPPORTED_WARNING, BROWSER_PDF_UNSUPPORTED_WARNING, type BrowserAnalyzeDocxOptionsSlice, type BrowserAnalyzeInput, type BrowserAnalyzeOptions, type BrowserExplainAnalysisPlanOptions, type BrowserExtractStructuredDataOptions, type BrowserOcrMode, type BrowserOcrOptions, DOCX_EMBEDDED_IMAGE_CAPABILITIES_BROWSER, DOCX_STRUCTURE_CAPABILITIES_BROWSER, type DocxEmbeddedImageCapabilities, type DocxStructuralCapabilities, type ExplainAnalysisPlanReport, type GetCapabilitiesReport, type NativeExtractionPlan, type OcrPlan, type PublicCapabilityId, type PublicCapabilitySupport, analyzeFile, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractStructuredData, extractText, getCapabilities, runOcr };
|
|
244
|
+
export { BROWSER_PDF_STRUCTURED_UNSUPPORTED_WARNING, BROWSER_PDF_UNSUPPORTED_WARNING, type BrowserAnalyzeDocxOptionsSlice, type BrowserAnalyzeInput, type BrowserAnalyzeOptions, type BrowserExplainAnalysisPlanOptions, type BrowserExtractLlmContentOptions, type BrowserExtractMarkdownOptions, type BrowserExtractStructuredChunksOptions, type BrowserExtractStructuredDataOptions, type BrowserMarkdownFacadeIntent, type BrowserOcrMode, type BrowserOcrOptions, DOCX_EMBEDDED_IMAGE_CAPABILITIES_BROWSER, DOCX_STRUCTURE_CAPABILITIES_BROWSER, type DocxEmbeddedImageCapabilities, type DocxStructuralCapabilities, type ExplainAnalysisPlanReport, type GetCapabilitiesReport, type NativeExtractionPlan, type OcrPlan, type PublicCapabilityId, type PublicCapabilitySupport, analyzeFile, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractLlmContent, extractMarkdown, extractMetadata, extractStructuredChunks, extractStructuredData, extractText, getCapabilities, runOcr };
|
package/dist/index.js
CHANGED
|
@@ -4,6 +4,7 @@ import { extractStructuredDataFromDocx, analyzeDocx } from '@dragon708/docmind-d
|
|
|
4
4
|
export { extractStructuredDataFromDocx } from '@dragon708/docmind-docx';
|
|
5
5
|
import { extractStructuredDataFromImage, preprocessHasEffect, resolveImageFormat, normalizeImageForOcr, ocrTiff, ocrImageDetailed } from '@dragon708/docmind-ocr';
|
|
6
6
|
export { extractStructuredDataFromImage } from '@dragon708/docmind-ocr';
|
|
7
|
+
import { extractMarkdown as extractMarkdown$1, renderLlmText, renderMarkdownSections } from '@dragon708/docmind-markdown';
|
|
7
8
|
|
|
8
9
|
// src/analyzeFile.ts
|
|
9
10
|
function assertBrowserInput(input) {
|
|
@@ -519,6 +520,39 @@ async function runOcr(input, options) {
|
|
|
519
520
|
return notImplementedResult(kind, "none", [UNKNOWN_FORMAT_WARNING]);
|
|
520
521
|
}
|
|
521
522
|
}
|
|
523
|
+
function browserFileHints(input) {
|
|
524
|
+
if (input instanceof File) {
|
|
525
|
+
return {
|
|
526
|
+
filename: input.name,
|
|
527
|
+
mimeType: input.type ? input.type : void 0
|
|
528
|
+
};
|
|
529
|
+
}
|
|
530
|
+
return {};
|
|
531
|
+
}
|
|
532
|
+
async function extractMarkdown(input, options) {
|
|
533
|
+
throwIfAborted(options?.signal);
|
|
534
|
+
const { markdown: markdownOpts, ...structuredOpts } = options ?? {};
|
|
535
|
+
const structured = await extractStructuredData(input, structuredOpts);
|
|
536
|
+
const data = await toUint8Array(input);
|
|
537
|
+
const hints = browserFileHints(input);
|
|
538
|
+
const r = await extractMarkdown$1(
|
|
539
|
+
{ data, filename: hints.filename, mimeType: hints.mimeType },
|
|
540
|
+
{ ...markdownOpts ?? {}, structuredFallback: structured }
|
|
541
|
+
);
|
|
542
|
+
return r.markdown;
|
|
543
|
+
}
|
|
544
|
+
async function extractLlmContent(input, options) {
|
|
545
|
+
throwIfAborted(options?.signal);
|
|
546
|
+
const { llm: llmOpts, ...structuredOpts } = options ?? {};
|
|
547
|
+
const structured = await extractStructuredData(input, structuredOpts);
|
|
548
|
+
return renderLlmText(structured, llmOpts);
|
|
549
|
+
}
|
|
550
|
+
async function extractStructuredChunks(input, options) {
|
|
551
|
+
throwIfAborted(options?.signal);
|
|
552
|
+
const { chunks: chunkOpts, ...structuredOpts } = options ?? {};
|
|
553
|
+
const structured = await extractStructuredData(input, structuredOpts);
|
|
554
|
+
return renderMarkdownSections(structured, chunkOpts);
|
|
555
|
+
}
|
|
522
556
|
|
|
523
557
|
// src/capabilityReport.ts
|
|
524
558
|
function docxIncludeRequested(flags) {
|
|
@@ -557,6 +591,8 @@ var TEXT_META_NOTE = "Plain text has no structured document metadata; extractMet
|
|
|
557
591
|
var OCR_OFF_NOTE = 'Image OCR may be skipped when `ocr.mode` is "off" in analyze options.';
|
|
558
592
|
var STRUCTURED_OCR_OFF = 'Structured image output uses OCR; when `ocr.mode` is "off", `extractStructuredData` returns an empty envelope with a warning.';
|
|
559
593
|
var UNKNOWN_KIND = "Could not determine file kind from name, MIME, or bytes; all features are reported as unsupported until the kind is known.";
|
|
594
|
+
var MARKDOWN_PDF_BROWSER = "PDF: no parser in-browser \u2014 `@opendataloader/pdf` is not loaded here. extractMarkdown still calls `extractMarkdown` in `@dragon708/docmind-markdown`, which falls back to the empty structured stub (same empty Markdown). extractLlmContent / extractStructuredChunks use the structured envelope only. Use @dragon708/docmind-node for real PDF \u2192 Markdown / LLM text / chunks.";
|
|
595
|
+
var MARKDOWN_IMAGE_OCR_OFF = 'Same as structured-output: when ocr.mode is "off", structured (and thus Markdown/LLM/chunk exports) are empty aside from warnings.';
|
|
560
596
|
function slot(id, supported, warnings) {
|
|
561
597
|
return warnings?.length ? { id, supported, warnings } : { id, supported };
|
|
562
598
|
}
|
|
@@ -573,7 +609,10 @@ function buildBrowserCapabilityReport(kind) {
|
|
|
573
609
|
slot("html", false, [pdf]),
|
|
574
610
|
slot("ocr", false, [pdf]),
|
|
575
611
|
slot("pages", false, [pdf]),
|
|
576
|
-
slot("structured-output", false, [BROWSER_PDF_STRUCTURED_UNSUPPORTED_WARNING])
|
|
612
|
+
slot("structured-output", false, [BROWSER_PDF_STRUCTURED_UNSUPPORTED_WARNING]),
|
|
613
|
+
slot("markdown", false, [MARKDOWN_PDF_BROWSER]),
|
|
614
|
+
slot("llm-text", false, [MARKDOWN_PDF_BROWSER]),
|
|
615
|
+
slot("structured-chunks", false, [MARKDOWN_PDF_BROWSER])
|
|
577
616
|
];
|
|
578
617
|
break;
|
|
579
618
|
case "docx":
|
|
@@ -593,6 +632,15 @@ function buildBrowserCapabilityReport(kind) {
|
|
|
593
632
|
]),
|
|
594
633
|
slot("structured-output", true, [
|
|
595
634
|
"`extractStructuredData` uses `@dragon708/docmind-docx` (Mammoth + OOXML) and returns `StructuredDocumentResult`; optional `options.docx` slices are forwarded."
|
|
635
|
+
]),
|
|
636
|
+
slot("markdown", true, [
|
|
637
|
+
"extractMarkdown: `@dragon708/docmind-markdown` `extractMarkdown` on bytes + structured fallback. The package\u2019s DOCX-bytes Mammoth\u2192Turndown path is Node-only; in-browser, Markdown is produced from `extractStructuredData` (Mammoth/OOXML in `@dragon708/docmind-docx`) via structured serialization, with a clear package warning that the binary shortcut is skipped."
|
|
638
|
+
]),
|
|
639
|
+
slot("llm-text", true, [
|
|
640
|
+
"extractLlmContent: structured envelope \u2192 `renderLlmText` (LLM-ready plain text; no binary PDF/DOCX Markdown routes)."
|
|
641
|
+
]),
|
|
642
|
+
slot("structured-chunks", true, [
|
|
643
|
+
"extractStructuredChunks: structured \u2192 `renderMarkdownSections` / `splitStructuredIntoChunks` (heading-aware chunking + optional parallel `text`)."
|
|
596
644
|
])
|
|
597
645
|
];
|
|
598
646
|
break;
|
|
@@ -629,6 +677,18 @@ function buildBrowserCapabilityReport(kind) {
|
|
|
629
677
|
"`extractStructuredData` uses `extractStructuredDataFromImage` (same OCR path as analyzeFile when `ocr.mode` is not off).",
|
|
630
678
|
STRUCTURED_OCR_OFF,
|
|
631
679
|
"HEIC/HEIF and TIFF limitations match `getCapabilities` (`heic`, `tiff`) and OCR warnings."
|
|
680
|
+
]),
|
|
681
|
+
slot("markdown", true, [
|
|
682
|
+
"extractMarkdown: same bytes + structured fallback through package `extractMarkdown` when applicable; OCR structured layout \u2192 Markdown when OCR runs. HEIC unsupported; TIFF best-effort.",
|
|
683
|
+
MARKDOWN_IMAGE_OCR_OFF
|
|
684
|
+
]),
|
|
685
|
+
slot("llm-text", true, [
|
|
686
|
+
"extractLlmContent: OCR structured \u2192 LLM plain text under the same OCR and format limits.",
|
|
687
|
+
MARKDOWN_IMAGE_OCR_OFF
|
|
688
|
+
]),
|
|
689
|
+
slot("structured-chunks", true, [
|
|
690
|
+
"extractStructuredChunks: OCR structured \u2192 sectioned Markdown; empty when OCR is off or HEIC.",
|
|
691
|
+
MARKDOWN_IMAGE_OCR_OFF
|
|
632
692
|
])
|
|
633
693
|
];
|
|
634
694
|
break;
|
|
@@ -641,6 +701,15 @@ function buildBrowserCapabilityReport(kind) {
|
|
|
641
701
|
slot("pages", false),
|
|
642
702
|
slot("structured-output", true, [
|
|
643
703
|
"`extractStructuredData` decodes UTF-8 (via `analyzeText`) and normalizes to `StructuredDocumentResult` (paragraph block rollup)."
|
|
704
|
+
]),
|
|
705
|
+
slot("markdown", true, [
|
|
706
|
+
"extractMarkdown: bytes + structured fallback through `@dragon708/docmind-markdown` `extractMarkdown` (typically structured serializer for UTF-8 text)."
|
|
707
|
+
]),
|
|
708
|
+
slot("llm-text", true, [
|
|
709
|
+
"extractLlmContent: UTF-8 structured rollup \u2192 `renderLlmText` in `@dragon708/docmind-markdown`."
|
|
710
|
+
]),
|
|
711
|
+
slot("structured-chunks", true, [
|
|
712
|
+
"extractStructuredChunks: typically one Markdown section when only paragraph rollup exists."
|
|
644
713
|
])
|
|
645
714
|
];
|
|
646
715
|
break;
|
|
@@ -652,7 +721,10 @@ function buildBrowserCapabilityReport(kind) {
|
|
|
652
721
|
slot("html", false),
|
|
653
722
|
slot("ocr", false),
|
|
654
723
|
slot("pages", false),
|
|
655
|
-
slot("structured-output", false, [UNKNOWN_KIND])
|
|
724
|
+
slot("structured-output", false, [UNKNOWN_KIND]),
|
|
725
|
+
slot("markdown", false, [UNKNOWN_KIND]),
|
|
726
|
+
slot("llm-text", false, [UNKNOWN_KIND]),
|
|
727
|
+
slot("structured-chunks", false, [UNKNOWN_KIND])
|
|
656
728
|
];
|
|
657
729
|
}
|
|
658
730
|
return {
|
|
@@ -695,9 +767,10 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInclude, ocr
|
|
|
695
767
|
let limitations = [];
|
|
696
768
|
const ocrOffNote = ocrMode === "off" ? 'Image OCR is skipped when ocr.mode is "off".' : "";
|
|
697
769
|
if (kind === "pdf") {
|
|
770
|
+
const structuredLikeIntent = intent === "extractStructuredData" || intent === "extractMarkdown" || intent === "extractLlmContent" || intent === "extractStructuredChunks";
|
|
698
771
|
limitations = lim(
|
|
699
772
|
BROWSER_PDF_UNSUPPORTED_WARNING,
|
|
700
|
-
|
|
773
|
+
structuredLikeIntent ? "`extractStructuredData` / extractMarkdown / extractLlmContent / extractStructuredChunks only see an empty structured envelope in-browser for PDF; use @dragon708/docmind-node for real PDF extraction and Markdown/LLM/chunk exports." : ""
|
|
701
774
|
);
|
|
702
775
|
nativeExtraction = {
|
|
703
776
|
willAttempt: false,
|
|
@@ -721,9 +794,10 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInclude, ocr
|
|
|
721
794
|
});
|
|
722
795
|
}
|
|
723
796
|
if (kind === "unknown") {
|
|
797
|
+
const structuredLikeIntent = intent === "extractStructuredData" || intent === "extractMarkdown" || intent === "extractLlmContent" || intent === "extractStructuredChunks";
|
|
724
798
|
limitations = lim(
|
|
725
799
|
"Could not classify the file from name, MIME, or bytes; analysis will return not_implemented until hints improve.",
|
|
726
|
-
|
|
800
|
+
structuredLikeIntent ? "Structured and Markdown/LLM/chunk exports need a known kind (text, DOCX, or image) in the browser runtime." : ""
|
|
727
801
|
);
|
|
728
802
|
nativeExtraction = { willAttempt: false, description: "No analyzer selected without a known file kind." };
|
|
729
803
|
ocr = { mayUse: false, description: "OCR is not used for unknown kinds." };
|
|
@@ -836,11 +910,26 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInclude, ocr
|
|
|
836
910
|
}
|
|
837
911
|
break;
|
|
838
912
|
case "extractStructuredData":
|
|
913
|
+
case "extractMarkdown":
|
|
914
|
+
case "extractLlmContent":
|
|
915
|
+
case "extractStructuredChunks":
|
|
839
916
|
if (kind === "docx") {
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
|
|
917
|
+
if (intent === "extractStructuredData") {
|
|
918
|
+
nativeExtraction = {
|
|
919
|
+
willAttempt: true,
|
|
920
|
+
description: "`extractStructuredDataFromDocx`: Mammoth plus required OOXML extractors (structure, headings, tables, blocks, pagesApprox, embeddedImages unless disabled), then `normalizeToStructuredResult`. Optional `options.docx` is forwarded."
|
|
921
|
+
};
|
|
922
|
+
} else if (intent === "extractMarkdown") {
|
|
923
|
+
nativeExtraction = {
|
|
924
|
+
willAttempt: true,
|
|
925
|
+
description: "extractMarkdown: `extractStructuredData` (Mammoth/OOXML) for a full structured envelope, then `extractMarkdown` in `@dragon708/docmind-markdown`. The package\u2019s DOCX-bytes Mammoth\u2192Turndown shortcut is Node-only; in-browser Markdown uses structured serialization on that envelope (with a package warning)."
|
|
926
|
+
};
|
|
927
|
+
} else {
|
|
928
|
+
nativeExtraction = {
|
|
929
|
+
willAttempt: true,
|
|
930
|
+
description: `${String(intent)}: same structured DOCX pipeline as extractStructuredData, then \`@dragon708/docmind-markdown\` (\`renderLlmText\` or \`renderMarkdownSections\`).`
|
|
931
|
+
};
|
|
932
|
+
}
|
|
844
933
|
ocr = { mayUse: false, description: "DOCX structured path does not use OCR." };
|
|
845
934
|
limitations = lim(DOCX_ZIP_NOTE_BROWSER);
|
|
846
935
|
} else if (kind === "image") {
|
|
@@ -856,10 +945,30 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInclude, ocr
|
|
|
856
945
|
} else {
|
|
857
946
|
nativeExtraction = {
|
|
858
947
|
willAttempt: true,
|
|
859
|
-
description: "UTF-8 decode via `analyzeText`, then `normalizeToStructuredResult` with a paragraph block rollup."
|
|
948
|
+
description: intent === "extractStructuredData" ? "UTF-8 decode via `analyzeText`, then `normalizeToStructuredResult` with a paragraph block rollup." : `${String(intent)}: UTF-8 structured envelope, then \`@dragon708/docmind-markdown\`.`
|
|
860
949
|
};
|
|
861
950
|
ocr = { mayUse: false, description: "OCR does not apply to text files." };
|
|
862
951
|
}
|
|
952
|
+
if (intent === "extractMarkdown") {
|
|
953
|
+
limitations = [
|
|
954
|
+
...limitations,
|
|
955
|
+
...lim(
|
|
956
|
+
"Output: Markdown string from `extractMarkdown` in `@dragon708/docmind-markdown`. PDF in-browser: empty (no `@opendataloader/pdf`). DOCX: structured Markdown path when the binary converter is Node-only."
|
|
957
|
+
)
|
|
958
|
+
];
|
|
959
|
+
} else if (intent === "extractLlmContent") {
|
|
960
|
+
limitations = [
|
|
961
|
+
...limitations,
|
|
962
|
+
...lim("Output: compact plain text via `renderLlmText` (structured input only in this runtime).")
|
|
963
|
+
];
|
|
964
|
+
} else if (intent === "extractStructuredChunks") {
|
|
965
|
+
limitations = [
|
|
966
|
+
...limitations,
|
|
967
|
+
...lim(
|
|
968
|
+
"Output: MarkdownSection[] via `renderMarkdownSections` (`splitStructuredIntoChunks` / `extractStructuredChunks` alias)."
|
|
969
|
+
)
|
|
970
|
+
];
|
|
971
|
+
}
|
|
863
972
|
break;
|
|
864
973
|
default:
|
|
865
974
|
nativeExtraction = { willAttempt: false, description: "Intent not specialized in this runtime." };
|
|
@@ -1012,6 +1121,40 @@ function planForIntent(intentOpt, kind, ocrMode, docxInclude, ocr, analyzeFileOu
|
|
|
1012
1121
|
};
|
|
1013
1122
|
}
|
|
1014
1123
|
}
|
|
1124
|
+
if (intent === "extractMarkdown") {
|
|
1125
|
+
const sub = planForIntent(
|
|
1126
|
+
"extractStructuredData",
|
|
1127
|
+
kind,
|
|
1128
|
+
ocrMode,
|
|
1129
|
+
docxInclude,
|
|
1130
|
+
ocr,
|
|
1131
|
+
analyzeFileOutput
|
|
1132
|
+
);
|
|
1133
|
+
return {
|
|
1134
|
+
intent,
|
|
1135
|
+
steps: [...sub.steps ?? [], { id: "markdown_hybrid_package", status: "planned" }]
|
|
1136
|
+
};
|
|
1137
|
+
}
|
|
1138
|
+
if (intent === "extractLlmContent" || intent === "extractStructuredChunks") {
|
|
1139
|
+
const sub = planForIntent(
|
|
1140
|
+
"extractStructuredData",
|
|
1141
|
+
kind,
|
|
1142
|
+
ocrMode,
|
|
1143
|
+
docxInclude,
|
|
1144
|
+
ocr,
|
|
1145
|
+
analyzeFileOutput
|
|
1146
|
+
);
|
|
1147
|
+
return {
|
|
1148
|
+
intent,
|
|
1149
|
+
steps: [
|
|
1150
|
+
...sub.steps ?? [],
|
|
1151
|
+
{
|
|
1152
|
+
id: intent === "extractLlmContent" ? "docmind_markdown_llm_text" : "docmind_markdown_chunk_sections",
|
|
1153
|
+
status: "planned"
|
|
1154
|
+
}
|
|
1155
|
+
]
|
|
1156
|
+
};
|
|
1157
|
+
}
|
|
1015
1158
|
if (intent === "analyzeFile") {
|
|
1016
1159
|
const base = planForAnalyzeFile(kind, ocrMode, docxInclude, ocr);
|
|
1017
1160
|
if (!analyzeFileRequestsStructured(analyzeFileOutput)) return base;
|
|
@@ -1121,6 +1264,6 @@ async function explainAnalysisPlan(input, options) {
|
|
|
1121
1264
|
return buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInc, ocrSlice);
|
|
1122
1265
|
}
|
|
1123
1266
|
|
|
1124
|
-
export { BROWSER_PDF_STRUCTURED_UNSUPPORTED_WARNING, BROWSER_PDF_UNSUPPORTED_WARNING, DOCX_EMBEDDED_IMAGE_CAPABILITIES_BROWSER, DOCX_STRUCTURE_CAPABILITIES_BROWSER, analyzeFile, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractStructuredData, extractText, getCapabilities, runOcr };
|
|
1267
|
+
export { BROWSER_PDF_STRUCTURED_UNSUPPORTED_WARNING, BROWSER_PDF_UNSUPPORTED_WARNING, DOCX_EMBEDDED_IMAGE_CAPABILITIES_BROWSER, DOCX_STRUCTURE_CAPABILITIES_BROWSER, analyzeFile, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractLlmContent, extractMarkdown, extractMetadata, extractStructuredChunks, extractStructuredData, extractText, getCapabilities, runOcr };
|
|
1125
1268
|
//# sourceMappingURL=index.js.map
|
|
1126
1269
|
//# sourceMappingURL=index.js.map
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@dragon708/docmind-browser",
|
|
3
|
-
"version": "1.6.0",
|
|
3
|
+
"version": "1.8.0",
|
|
4
4
|
"description": "Official DocMind browser facade: analyzeFile and intent APIs (DOCX, image OCR, text). PDF and fs paths use @dragon708/docmind-node.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"sideEffects": false,
|
|
@@ -34,6 +34,7 @@
|
|
|
34
34
|
"license": "MIT",
|
|
35
35
|
"dependencies": {
|
|
36
36
|
"@dragon708/docmind-docx": "^1.8.0",
|
|
37
|
+
"@dragon708/docmind-markdown": "^1.1.0",
|
|
37
38
|
"@dragon708/docmind-ocr": "^1.1.4",
|
|
38
39
|
"@dragon708/docmind-shared": "^1.2.0"
|
|
39
40
|
},
|