@dragon708/docmind-browser 1.6.0 → 1.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +49 -11
- package/dist/index.js +100 -7
- package/package.json +2 -1
package/dist/index.d.ts
CHANGED
|
@@ -1,9 +1,11 @@
|
|
|
1
|
-
import { DocMindAnalyzeOptions, AnalyzeFileOutputOptions,
|
|
1
|
+
import { DocMindAnalyzeOptions, AnalyzeFileOutputOptions, NormalizeStructuredOptions, AnalysisResult, StructuredDocumentResult, FileKind, RuntimeDescriptor, DocMindPublicIntent, AnalysisAnalyzer, ProcessingPlanDescriptor, ExplainAnalysisPlanOptions, GetCapabilitiesOptions } from '@dragon708/docmind-shared';
|
|
2
2
|
export { AnalysisAnalyzer, AnalysisResult, CapabilityDescriptor, DetectFileKindInput, DocMindPublicIntent, DocumentBlock, DocumentImageRef, DocumentPage, DocumentTable, DocxAnalysisCoreResult, ExplainAnalysisPlanOptions, ExplainAnalysisPlanResult, FileKind, FileKindMetadata, GenericAnalysisResult, GetCapabilitiesOptions, GetCapabilitiesResult, ImageAnalysisCoreResult, PdfAnalysisCoreResult, StructuredDocumentResult, TextAnalysisResult, analyzeFileRequestsStructured, detectFileKind, isStructuredDocumentResult } from '@dragon708/docmind-shared';
|
|
3
|
-
import {
|
|
4
|
-
export {
|
|
3
|
+
import { RenderLlmTextOptions, RenderMarkdownOptions, RenderMarkdownSectionsOptions, MarkdownSection } from '@dragon708/docmind-markdown';
|
|
4
|
+
export { MarkdownSection } from '@dragon708/docmind-markdown';
|
|
5
5
|
import { AnalyzeDocxIncludeFlags, DocxToHtmlOptions } from '@dragon708/docmind-docx';
|
|
6
6
|
export { AnalyzeDocxIncludeFlags, ExtractStructuredDataFromDocxOptions, extractStructuredDataFromDocx } from '@dragon708/docmind-docx';
|
|
7
|
+
import { OcrOptions, OcrTiffOptions, PreprocessImageOptions } from '@dragon708/docmind-ocr';
|
|
8
|
+
export { ExtractStructuredDataFromImageOptions, extractStructuredDataFromImage } from '@dragon708/docmind-ocr';
|
|
7
9
|
|
|
8
10
|
/**
|
|
9
11
|
* Opciones DOCX para el facade browser (Mammoth + inclusiones v2 de `@dragon708/docmind-docx`; sin APIs Node-only).
|
|
@@ -37,6 +39,24 @@ interface BrowserAnalyzeOptions extends DocMindAnalyzeOptions, AnalyzeFileOutput
|
|
|
37
39
|
/** Solo DOCX: ver {@link BrowserAnalyzeDocxOptionsSlice}. */
|
|
38
40
|
readonly docx?: BrowserAnalyzeDocxOptionsSlice;
|
|
39
41
|
}
|
|
42
|
+
/** Options for {@link extractStructuredData}: same as {@link BrowserAnalyzeOptions} plus shared normalize knobs. */
|
|
43
|
+
type BrowserExtractStructuredDataOptions = BrowserAnalyzeOptions & {
|
|
44
|
+
readonly normalize?: NormalizeStructuredOptions;
|
|
45
|
+
};
|
|
46
|
+
/**
|
|
47
|
+
* {@link extractMarkdown}: structured options plus `markdown` for `renderMarkdown` (`@dragon708/docmind-markdown`).
|
|
48
|
+
*/
|
|
49
|
+
interface BrowserExtractMarkdownOptions extends BrowserExtractStructuredDataOptions {
|
|
50
|
+
readonly markdown?: RenderMarkdownOptions;
|
|
51
|
+
}
|
|
52
|
+
/** {@link extractLlmContent}: optional `llm` passed to `renderLlmText`. */
|
|
53
|
+
interface BrowserExtractLlmContentOptions extends BrowserExtractStructuredDataOptions {
|
|
54
|
+
readonly llm?: RenderLlmTextOptions;
|
|
55
|
+
}
|
|
56
|
+
/** {@link extractStructuredChunks}: optional `chunks` (split / section sizing). */
|
|
57
|
+
interface BrowserExtractStructuredChunksOptions extends BrowserExtractStructuredDataOptions {
|
|
58
|
+
readonly chunks?: RenderMarkdownSectionsOptions;
|
|
59
|
+
}
|
|
40
60
|
|
|
41
61
|
/**
|
|
42
62
|
* Inputs supported by the browser entry (DOM types only — no `fs`, no Node `Buffer` in the public surface).
|
|
@@ -90,10 +110,6 @@ declare function runOcr(input: BrowserAnalyzeInput, options?: BrowserAnalyzeOpti
|
|
|
90
110
|
* PDF is not supported in this runtime (clear warnings, no PDF package import).
|
|
91
111
|
*/
|
|
92
112
|
|
|
93
|
-
/** Options for {@link extractStructuredData}: same as {@link BrowserAnalyzeOptions} plus shared normalize knobs. */
|
|
94
|
-
type BrowserExtractStructuredDataOptions = BrowserAnalyzeOptions & {
|
|
95
|
-
readonly normalize?: NormalizeStructuredOptions;
|
|
96
|
-
};
|
|
97
113
|
/**
|
|
98
114
|
* Returns a {@link StructuredDocumentResult} for inputs the browser runtime actually supports:
|
|
99
115
|
* **DOCX** (`extractStructuredDataFromDocx`), **images** (`extractStructuredDataFromImage` when OCR is not off),
|
|
@@ -103,8 +119,25 @@ type BrowserExtractStructuredDataOptions = BrowserAnalyzeOptions & {
|
|
|
103
119
|
*/
|
|
104
120
|
declare function extractStructuredData(input: BrowserAnalyzeInput, options?: BrowserExtractStructuredDataOptions): Promise<StructuredDocumentResult>;
|
|
105
121
|
|
|
122
|
+
/**
|
|
123
|
+
* {@link extractStructuredData} → `renderMarkdown` (browser-safe; no Node-only APIs).
|
|
124
|
+
* PDF: empty structured stub → usually empty output; see {@link getCapabilities} capability `markdown`.
|
|
125
|
+
*
|
|
126
|
+
* @param input - `File` / `Blob` / bytes as accepted by the browser facade.
|
|
127
|
+
* @param options - `markdown` render options plus structured/`ocr`/`docx` routing (same as {@link extractStructuredData}).
|
|
128
|
+
*/
|
|
129
|
+
declare function extractMarkdown(input: BrowserAnalyzeInput, options?: BrowserExtractMarkdownOptions): Promise<string>;
|
|
130
|
+
/**
|
|
131
|
+
* Same as {@link extractMarkdown} but `renderLlmText` (compact plain text for LLM prompts).
|
|
132
|
+
*/
|
|
133
|
+
declare function extractLlmContent(input: BrowserAnalyzeInput, options?: BrowserExtractLlmContentOptions): Promise<string>;
|
|
134
|
+
/**
|
|
135
|
+
* Structured extract → `renderMarkdownSections` for sectioned Markdown in the browser.
|
|
136
|
+
*/
|
|
137
|
+
declare function extractStructuredChunks(input: BrowserAnalyzeInput, options?: BrowserExtractStructuredChunksOptions): Promise<readonly MarkdownSection[]>;
|
|
138
|
+
|
|
106
139
|
/** High-level features the user can ask DocMind for (per input kind and runtime). */
|
|
107
|
-
type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages" | "structured-output" | "image-normalization" | "gif-first-frame" | "bmp" | "heic" | "tiff";
|
|
140
|
+
type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages" | "structured-output" | "markdown" | "llm-text" | "structured-chunks" | "image-normalization" | "gif-first-frame" | "bmp" | "heic" | "tiff";
|
|
108
141
|
declare function docxIncludeRequested(flags?: AnalyzeDocxIncludeFlags): boolean;
|
|
109
142
|
/** DOCX `word/media` en runtime browser (JSZip; sin pipeline Node). */
|
|
110
143
|
interface DocxEmbeddedImageCapabilities {
|
|
@@ -176,12 +209,17 @@ interface ExplainAnalysisPlanReport {
|
|
|
176
209
|
readonly warnings?: readonly string[];
|
|
177
210
|
}
|
|
178
211
|
|
|
212
|
+
/** Browser facade intents that run `@dragon708/docmind-markdown` after structured extraction. */
|
|
213
|
+
type BrowserMarkdownFacadeIntent = "extractMarkdown" | "extractLlmContent" | "extractStructuredChunks";
|
|
179
214
|
/** Options for {@link explainAnalysisPlan}: shared fields plus optional `ocr` / `docx` for accurate step preview. */
|
|
180
|
-
type BrowserExplainAnalysisPlanOptions = ExplainAnalysisPlanOptions & Pick<BrowserAnalyzeOptions, "ocr" | "docx" | "structuredOutput" | "output">;
|
|
215
|
+
type BrowserExplainAnalysisPlanOptions = Omit<ExplainAnalysisPlanOptions, "intent"> & Pick<BrowserAnalyzeOptions, "ocr" | "docx" | "structuredOutput" | "output"> & {
|
|
216
|
+
readonly intent?: DocMindPublicIntent | BrowserMarkdownFacadeIntent;
|
|
217
|
+
};
|
|
181
218
|
|
|
182
219
|
/**
|
|
183
220
|
* Epic 1 — **Capabilities:** detects kind from the same hints as `analyzeFile`, then lists which of
|
|
184
|
-
* `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output`
|
|
221
|
+
* `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output` | `markdown` | `llm-text` | `structured-chunks`
|
|
222
|
+
* and image-specific ids (`image-normalization`, `bmp`, `gif-first-frame`, `heic`, `tiff`) apply in the browser (PDF always unsupported for meaningful content).
|
|
185
223
|
* No Mammoth/Tesseract/PDF parsing. For DOCX, {@link GetCapabilitiesReport.docxStructure} / `docxEmbeddedImages` describe v2 opt-in features.
|
|
186
224
|
*/
|
|
187
225
|
declare function getCapabilities(input: BrowserAnalyzeInput, options?: GetCapabilitiesOptions): Promise<GetCapabilitiesReport>;
|
|
@@ -191,4 +229,4 @@ declare function getCapabilities(input: BrowserAnalyzeInput, options?: GetCapabi
|
|
|
191
229
|
*/
|
|
192
230
|
declare function explainAnalysisPlan(input: BrowserAnalyzeInput, options?: BrowserExplainAnalysisPlanOptions): Promise<ExplainAnalysisPlanReport>;
|
|
193
231
|
|
|
194
|
-
export { BROWSER_PDF_STRUCTURED_UNSUPPORTED_WARNING, BROWSER_PDF_UNSUPPORTED_WARNING, type BrowserAnalyzeDocxOptionsSlice, type BrowserAnalyzeInput, type BrowserAnalyzeOptions, type BrowserExplainAnalysisPlanOptions, type BrowserExtractStructuredDataOptions, type BrowserOcrMode, type BrowserOcrOptions, DOCX_EMBEDDED_IMAGE_CAPABILITIES_BROWSER, DOCX_STRUCTURE_CAPABILITIES_BROWSER, type DocxEmbeddedImageCapabilities, type DocxStructuralCapabilities, type ExplainAnalysisPlanReport, type GetCapabilitiesReport, type NativeExtractionPlan, type OcrPlan, type PublicCapabilityId, type PublicCapabilitySupport, analyzeFile, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractStructuredData, extractText, getCapabilities, runOcr };
|
|
232
|
+
export { BROWSER_PDF_STRUCTURED_UNSUPPORTED_WARNING, BROWSER_PDF_UNSUPPORTED_WARNING, type BrowserAnalyzeDocxOptionsSlice, type BrowserAnalyzeInput, type BrowserAnalyzeOptions, type BrowserExplainAnalysisPlanOptions, type BrowserExtractLlmContentOptions, type BrowserExtractMarkdownOptions, type BrowserExtractStructuredChunksOptions, type BrowserExtractStructuredDataOptions, type BrowserMarkdownFacadeIntent, type BrowserOcrMode, type BrowserOcrOptions, DOCX_EMBEDDED_IMAGE_CAPABILITIES_BROWSER, DOCX_STRUCTURE_CAPABILITIES_BROWSER, type DocxEmbeddedImageCapabilities, type DocxStructuralCapabilities, type ExplainAnalysisPlanReport, type GetCapabilitiesReport, type NativeExtractionPlan, type OcrPlan, type PublicCapabilityId, type PublicCapabilitySupport, analyzeFile, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractLlmContent, extractMarkdown, extractMetadata, extractStructuredChunks, extractStructuredData, extractText, getCapabilities, runOcr };
|
package/dist/index.js
CHANGED
|
@@ -4,6 +4,7 @@ import { extractStructuredDataFromDocx, analyzeDocx } from '@dragon708/docmind-d
|
|
|
4
4
|
export { extractStructuredDataFromDocx } from '@dragon708/docmind-docx';
|
|
5
5
|
import { extractStructuredDataFromImage, preprocessHasEffect, resolveImageFormat, normalizeImageForOcr, ocrTiff, ocrImageDetailed } from '@dragon708/docmind-ocr';
|
|
6
6
|
export { extractStructuredDataFromImage } from '@dragon708/docmind-ocr';
|
|
7
|
+
import { renderMarkdown, renderLlmText, renderMarkdownSections } from '@dragon708/docmind-markdown';
|
|
7
8
|
|
|
8
9
|
// src/analyzeFile.ts
|
|
9
10
|
function assertBrowserInput(input) {
|
|
@@ -519,6 +520,24 @@ async function runOcr(input, options) {
|
|
|
519
520
|
return notImplementedResult(kind, "none", [UNKNOWN_FORMAT_WARNING]);
|
|
520
521
|
}
|
|
521
522
|
}
|
|
523
|
+
async function extractMarkdown(input, options) {
|
|
524
|
+
throwIfAborted(options?.signal);
|
|
525
|
+
const { markdown: markdownOpts, ...structuredOpts } = options ?? {};
|
|
526
|
+
const structured = await extractStructuredData(input, structuredOpts);
|
|
527
|
+
return renderMarkdown(structured, markdownOpts);
|
|
528
|
+
}
|
|
529
|
+
async function extractLlmContent(input, options) {
|
|
530
|
+
throwIfAborted(options?.signal);
|
|
531
|
+
const { llm: llmOpts, ...structuredOpts } = options ?? {};
|
|
532
|
+
const structured = await extractStructuredData(input, structuredOpts);
|
|
533
|
+
return renderLlmText(structured, llmOpts);
|
|
534
|
+
}
|
|
535
|
+
async function extractStructuredChunks(input, options) {
|
|
536
|
+
throwIfAborted(options?.signal);
|
|
537
|
+
const { chunks: chunkOpts, ...structuredOpts } = options ?? {};
|
|
538
|
+
const structured = await extractStructuredData(input, structuredOpts);
|
|
539
|
+
return renderMarkdownSections(structured, chunkOpts);
|
|
540
|
+
}
|
|
522
541
|
|
|
523
542
|
// src/capabilityReport.ts
|
|
524
543
|
function docxIncludeRequested(flags) {
|
|
@@ -557,6 +576,8 @@ var TEXT_META_NOTE = "Plain text has no structured document metadata; extractMet
|
|
|
557
576
|
var OCR_OFF_NOTE = 'Image OCR may be skipped when `ocr.mode` is "off" in analyze options.';
|
|
558
577
|
var STRUCTURED_OCR_OFF = 'Structured image output uses OCR; when `ocr.mode` is "off", `extractStructuredData` returns an empty envelope with a warning.';
|
|
559
578
|
var UNKNOWN_KIND = "Could not determine file kind from name, MIME, or bytes; all features are reported as unsupported until the kind is known.";
|
|
579
|
+
var MARKDOWN_PDF_BROWSER = "PDF: no parser in-browser \u2014 extractMarkdown / extractLlmContent / extractStructuredChunks only render an empty structured stub; use @dragon708/docmind-node for real PDF \u2192 Markdown / LLM text / chunks.";
|
|
580
|
+
var MARKDOWN_IMAGE_OCR_OFF = 'Same as structured-output: when ocr.mode is "off", structured (and thus Markdown/LLM/chunk exports) are empty aside from warnings.';
|
|
560
581
|
function slot(id, supported, warnings) {
|
|
561
582
|
return warnings?.length ? { id, supported, warnings } : { id, supported };
|
|
562
583
|
}
|
|
@@ -573,7 +594,10 @@ function buildBrowserCapabilityReport(kind) {
|
|
|
573
594
|
slot("html", false, [pdf]),
|
|
574
595
|
slot("ocr", false, [pdf]),
|
|
575
596
|
slot("pages", false, [pdf]),
|
|
576
|
-
slot("structured-output", false, [BROWSER_PDF_STRUCTURED_UNSUPPORTED_WARNING])
|
|
597
|
+
slot("structured-output", false, [BROWSER_PDF_STRUCTURED_UNSUPPORTED_WARNING]),
|
|
598
|
+
slot("markdown", false, [MARKDOWN_PDF_BROWSER]),
|
|
599
|
+
slot("llm-text", false, [MARKDOWN_PDF_BROWSER]),
|
|
600
|
+
slot("structured-chunks", false, [MARKDOWN_PDF_BROWSER])
|
|
577
601
|
];
|
|
578
602
|
break;
|
|
579
603
|
case "docx":
|
|
@@ -593,6 +617,15 @@ function buildBrowserCapabilityReport(kind) {
|
|
|
593
617
|
]),
|
|
594
618
|
slot("structured-output", true, [
|
|
595
619
|
"`extractStructuredData` uses `@dragon708/docmind-docx` (Mammoth + OOXML) and returns `StructuredDocumentResult`; optional `options.docx` slices are forwarded."
|
|
620
|
+
]),
|
|
621
|
+
slot("markdown", true, [
|
|
622
|
+
"extractMarkdown: structured DOCX \u2192 Markdown via `@dragon708/docmind-markdown` (browser-safe, no Node APIs)."
|
|
623
|
+
]),
|
|
624
|
+
slot("llm-text", true, [
|
|
625
|
+
"extractLlmContent: structured \u2192 compact plain text for prompts (same pipeline as Markdown export)."
|
|
626
|
+
]),
|
|
627
|
+
slot("structured-chunks", true, [
|
|
628
|
+
"extractStructuredChunks: structured \u2192 Markdown sections (`splitStructuredIntoChunks` in markdown package)."
|
|
596
629
|
])
|
|
597
630
|
];
|
|
598
631
|
break;
|
|
@@ -629,6 +662,18 @@ function buildBrowserCapabilityReport(kind) {
|
|
|
629
662
|
"`extractStructuredData` uses `extractStructuredDataFromImage` (same OCR path as analyzeFile when `ocr.mode` is not off).",
|
|
630
663
|
STRUCTURED_OCR_OFF,
|
|
631
664
|
"HEIC/HEIF and TIFF limitations match `getCapabilities` (`heic`, `tiff`) and OCR warnings."
|
|
665
|
+
]),
|
|
666
|
+
slot("markdown", true, [
|
|
667
|
+
"extractMarkdown: OCR structured layout \u2192 Markdown when OCR runs; HEIC unsupported; TIFF best-effort.",
|
|
668
|
+
MARKDOWN_IMAGE_OCR_OFF
|
|
669
|
+
]),
|
|
670
|
+
slot("llm-text", true, [
|
|
671
|
+
"extractLlmContent: OCR structured \u2192 LLM plain text under the same OCR and format limits.",
|
|
672
|
+
MARKDOWN_IMAGE_OCR_OFF
|
|
673
|
+
]),
|
|
674
|
+
slot("structured-chunks", true, [
|
|
675
|
+
"extractStructuredChunks: OCR structured \u2192 sectioned Markdown; empty when OCR is off or HEIC.",
|
|
676
|
+
MARKDOWN_IMAGE_OCR_OFF
|
|
632
677
|
])
|
|
633
678
|
];
|
|
634
679
|
break;
|
|
@@ -641,6 +686,13 @@ function buildBrowserCapabilityReport(kind) {
|
|
|
641
686
|
slot("pages", false),
|
|
642
687
|
slot("structured-output", true, [
|
|
643
688
|
"`extractStructuredData` decodes UTF-8 (via `analyzeText`) and normalizes to `StructuredDocumentResult` (paragraph block rollup)."
|
|
689
|
+
]),
|
|
690
|
+
slot("markdown", true, [
|
|
691
|
+
"extractMarkdown: UTF-8 structured rollup \u2192 Markdown (`@dragon708/docmind-markdown`)."
|
|
692
|
+
]),
|
|
693
|
+
slot("llm-text", true, ["extractLlmContent: UTF-8 structured \u2192 LLM-oriented plain text."]),
|
|
694
|
+
slot("structured-chunks", true, [
|
|
695
|
+
"extractStructuredChunks: typically a single Markdown section when only paragraph rollup exists."
|
|
644
696
|
])
|
|
645
697
|
];
|
|
646
698
|
break;
|
|
@@ -652,7 +704,10 @@ function buildBrowserCapabilityReport(kind) {
|
|
|
652
704
|
slot("html", false),
|
|
653
705
|
slot("ocr", false),
|
|
654
706
|
slot("pages", false),
|
|
655
|
-
slot("structured-output", false, [UNKNOWN_KIND])
|
|
707
|
+
slot("structured-output", false, [UNKNOWN_KIND]),
|
|
708
|
+
slot("markdown", false, [UNKNOWN_KIND]),
|
|
709
|
+
slot("llm-text", false, [UNKNOWN_KIND]),
|
|
710
|
+
slot("structured-chunks", false, [UNKNOWN_KIND])
|
|
656
711
|
];
|
|
657
712
|
}
|
|
658
713
|
return {
|
|
@@ -695,9 +750,10 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInclude, ocr
|
|
|
695
750
|
let limitations = [];
|
|
696
751
|
const ocrOffNote = ocrMode === "off" ? 'Image OCR is skipped when ocr.mode is "off".' : "";
|
|
697
752
|
if (kind === "pdf") {
|
|
753
|
+
const structuredLikeIntent = intent === "extractStructuredData" || intent === "extractMarkdown" || intent === "extractLlmContent" || intent === "extractStructuredChunks";
|
|
698
754
|
limitations = lim(
|
|
699
755
|
BROWSER_PDF_UNSUPPORTED_WARNING,
|
|
700
|
-
|
|
756
|
+
structuredLikeIntent ? "`extractStructuredData` / extractMarkdown / extractLlmContent / extractStructuredChunks only see an empty structured envelope in-browser for PDF; use @dragon708/docmind-node for real PDF extraction and Markdown/LLM/chunk exports." : ""
|
|
701
757
|
);
|
|
702
758
|
nativeExtraction = {
|
|
703
759
|
willAttempt: false,
|
|
@@ -721,9 +777,10 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInclude, ocr
|
|
|
721
777
|
});
|
|
722
778
|
}
|
|
723
779
|
if (kind === "unknown") {
|
|
780
|
+
const structuredLikeIntent = intent === "extractStructuredData" || intent === "extractMarkdown" || intent === "extractLlmContent" || intent === "extractStructuredChunks";
|
|
724
781
|
limitations = lim(
|
|
725
782
|
"Could not classify the file from name, MIME, or bytes; analysis will return not_implemented until hints improve.",
|
|
726
|
-
|
|
783
|
+
structuredLikeIntent ? "Structured and Markdown/LLM/chunk exports need a known kind (text, DOCX, or image) in the browser runtime." : ""
|
|
727
784
|
);
|
|
728
785
|
nativeExtraction = { willAttempt: false, description: "No analyzer selected without a known file kind." };
|
|
729
786
|
ocr = { mayUse: false, description: "OCR is not used for unknown kinds." };
|
|
@@ -836,10 +893,13 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInclude, ocr
|
|
|
836
893
|
}
|
|
837
894
|
break;
|
|
838
895
|
case "extractStructuredData":
|
|
896
|
+
case "extractMarkdown":
|
|
897
|
+
case "extractLlmContent":
|
|
898
|
+
case "extractStructuredChunks":
|
|
839
899
|
if (kind === "docx") {
|
|
840
900
|
nativeExtraction = {
|
|
841
901
|
willAttempt: true,
|
|
842
|
-
description: "`extractStructuredDataFromDocx`: Mammoth plus required OOXML extractors (structure, headings, tables, blocks, pagesApprox, embeddedImages unless disabled), then `normalizeToStructuredResult`. Optional `options.docx` is forwarded."
|
|
902
|
+
description: intent === "extractStructuredData" ? "`extractStructuredDataFromDocx`: Mammoth plus required OOXML extractors (structure, headings, tables, blocks, pagesApprox, embeddedImages unless disabled), then `normalizeToStructuredResult`. Optional `options.docx` is forwarded." : `${String(intent)}: same structured DOCX pipeline as extractStructuredData, then \`@dragon708/docmind-markdown\` (browser-safe).`
|
|
843
903
|
};
|
|
844
904
|
ocr = { mayUse: false, description: "DOCX structured path does not use OCR." };
|
|
845
905
|
limitations = lim(DOCX_ZIP_NOTE_BROWSER);
|
|
@@ -856,10 +916,26 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInclude, ocr
|
|
|
856
916
|
} else {
|
|
857
917
|
nativeExtraction = {
|
|
858
918
|
willAttempt: true,
|
|
859
|
-
description: "UTF-8 decode via `analyzeText`, then `normalizeToStructuredResult` with a paragraph block rollup."
|
|
919
|
+
description: intent === "extractStructuredData" ? "UTF-8 decode via `analyzeText`, then `normalizeToStructuredResult` with a paragraph block rollup." : `${String(intent)}: UTF-8 structured envelope, then \`@dragon708/docmind-markdown\`.`
|
|
860
920
|
};
|
|
861
921
|
ocr = { mayUse: false, description: "OCR does not apply to text files." };
|
|
862
922
|
}
|
|
923
|
+
if (intent === "extractMarkdown") {
|
|
924
|
+
limitations = [
|
|
925
|
+
...limitations,
|
|
926
|
+
...lim("Output: Markdown string via renderMarkdown.")
|
|
927
|
+
];
|
|
928
|
+
} else if (intent === "extractLlmContent") {
|
|
929
|
+
limitations = [
|
|
930
|
+
...limitations,
|
|
931
|
+
...lim("Output: compact plain text via renderLlmText.")
|
|
932
|
+
];
|
|
933
|
+
} else if (intent === "extractStructuredChunks") {
|
|
934
|
+
limitations = [
|
|
935
|
+
...limitations,
|
|
936
|
+
...lim("Output: MarkdownSection[] via renderMarkdownSections.")
|
|
937
|
+
];
|
|
938
|
+
}
|
|
863
939
|
break;
|
|
864
940
|
default:
|
|
865
941
|
nativeExtraction = { willAttempt: false, description: "Intent not specialized in this runtime." };
|
|
@@ -1012,6 +1088,23 @@ function planForIntent(intentOpt, kind, ocrMode, docxInclude, ocr, analyzeFileOu
|
|
|
1012
1088
|
};
|
|
1013
1089
|
}
|
|
1014
1090
|
}
|
|
1091
|
+
if (intent === "extractMarkdown" || intent === "extractLlmContent" || intent === "extractStructuredChunks") {
|
|
1092
|
+
const sub = planForIntent(
|
|
1093
|
+
"extractStructuredData",
|
|
1094
|
+
kind,
|
|
1095
|
+
ocrMode,
|
|
1096
|
+
docxInclude,
|
|
1097
|
+
ocr,
|
|
1098
|
+
analyzeFileOutput
|
|
1099
|
+
);
|
|
1100
|
+
return {
|
|
1101
|
+
intent,
|
|
1102
|
+
steps: [
|
|
1103
|
+
...sub.steps ?? [],
|
|
1104
|
+
{ id: "docmind_markdown_render", status: "planned" }
|
|
1105
|
+
]
|
|
1106
|
+
};
|
|
1107
|
+
}
|
|
1015
1108
|
if (intent === "analyzeFile") {
|
|
1016
1109
|
const base = planForAnalyzeFile(kind, ocrMode, docxInclude, ocr);
|
|
1017
1110
|
if (!analyzeFileRequestsStructured(analyzeFileOutput)) return base;
|
|
@@ -1121,6 +1214,6 @@ async function explainAnalysisPlan(input, options) {
|
|
|
1121
1214
|
return buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInc, ocrSlice);
|
|
1122
1215
|
}
|
|
1123
1216
|
|
|
1124
|
-
export { BROWSER_PDF_STRUCTURED_UNSUPPORTED_WARNING, BROWSER_PDF_UNSUPPORTED_WARNING, DOCX_EMBEDDED_IMAGE_CAPABILITIES_BROWSER, DOCX_STRUCTURE_CAPABILITIES_BROWSER, analyzeFile, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractStructuredData, extractText, getCapabilities, runOcr };
|
|
1217
|
+
export { BROWSER_PDF_STRUCTURED_UNSUPPORTED_WARNING, BROWSER_PDF_UNSUPPORTED_WARNING, DOCX_EMBEDDED_IMAGE_CAPABILITIES_BROWSER, DOCX_STRUCTURE_CAPABILITIES_BROWSER, analyzeFile, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractLlmContent, extractMarkdown, extractMetadata, extractStructuredChunks, extractStructuredData, extractText, getCapabilities, runOcr };
|
|
1125
1218
|
//# sourceMappingURL=index.js.map
|
|
1126
1219
|
//# sourceMappingURL=index.js.map
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@dragon708/docmind-browser",
|
|
3
|
-
"version": "1.6.0",
|
|
3
|
+
"version": "1.7.0",
|
|
4
4
|
"description": "Official DocMind browser facade: analyzeFile and intent APIs (DOCX, image OCR, text). PDF and fs paths use @dragon708/docmind-node.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"sideEffects": false,
|
|
@@ -34,6 +34,7 @@
|
|
|
34
34
|
"license": "MIT",
|
|
35
35
|
"dependencies": {
|
|
36
36
|
"@dragon708/docmind-docx": "^1.8.0",
|
|
37
|
+
"@dragon708/docmind-markdown": "^1.0.0",
|
|
37
38
|
"@dragon708/docmind-ocr": "^1.1.4",
|
|
38
39
|
"@dragon708/docmind-shared": "^1.2.0"
|
|
39
40
|
},
|