@dragon708/docmind-browser 1.7.0 → 1.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +21 -9
- package/dist/index.js +69 -19
- package/package.json +2 -2
package/dist/index.d.ts
CHANGED
|
@@ -44,7 +44,10 @@ type BrowserExtractStructuredDataOptions = BrowserAnalyzeOptions & {
|
|
|
44
44
|
readonly normalize?: NormalizeStructuredOptions;
|
|
45
45
|
};
|
|
46
46
|
/**
|
|
47
|
-
* {@link extractMarkdown}: structured options plus `markdown`
|
|
47
|
+
* {@link extractMarkdown}: structured options plus `markdown` (passed through to `extractMarkdown` in
|
|
48
|
+
* `@dragon708/docmind-markdown`, including structured-serializer knobs). Binary PDF/DOCX converters inside that
|
|
49
|
+
* package are not used for PDF in-browser and the DOCX bytes→Turndown path is Node-only; the browser still
|
|
50
|
+
* gets correct DOCX Markdown via structured fallback from {@link extractStructuredData}.
|
|
48
51
|
*/
|
|
49
52
|
interface BrowserExtractMarkdownOptions extends BrowserExtractStructuredDataOptions {
|
|
50
53
|
readonly markdown?: RenderMarkdownOptions;
|
|
@@ -120,24 +123,33 @@ declare function runOcr(input: BrowserAnalyzeInput, options?: BrowserAnalyzeOpti
|
|
|
120
123
|
declare function extractStructuredData(input: BrowserAnalyzeInput, options?: BrowserExtractStructuredDataOptions): Promise<StructuredDocumentResult>;
|
|
121
124
|
|
|
122
125
|
/**
|
|
123
|
-
* {@link extractStructuredData}
|
|
124
|
-
*
|
|
126
|
+
* {@link extractStructuredData} for a full structured envelope, then `extractMarkdown` from
|
|
127
|
+
* `@dragon708/docmind-markdown` on `{ data, filename?, mimeType? }` with that result as `structuredFallback`.
|
|
125
128
|
*
|
|
126
|
-
*
|
|
127
|
-
*
|
|
129
|
+
* - **PDF:** the markdown package does not load `@opendataloader/pdf` here; output comes from the structured
|
|
130
|
+
* fallback (empty in-browser stub — see {@link getCapabilities}).
|
|
131
|
+
* - **DOCX:** the package’s direct bytes → Mammoth → Turndown path is **Node-only**; in-browser, Markdown is
|
|
132
|
+
* produced via `convertStructuredToMarkdown` on the structured envelope (still Mammoth/OOXML-backed via
|
|
133
|
+
* `@dragon708/docmind-docx`), with an explanatory warning from the markdown package.
|
|
134
|
+
* - **Text / image:** unidentified or non-binary bytes use the same structured serializer.
|
|
135
|
+
*
|
|
136
|
+
* @param options - `markdown` options plus the same routing as {@link extractStructuredData} (`ocr`, `docx`, `normalize`).
|
|
128
137
|
*/
|
|
129
138
|
declare function extractMarkdown(input: BrowserAnalyzeInput, options?: BrowserExtractMarkdownOptions): Promise<string>;
|
|
130
139
|
/**
|
|
131
|
-
*
|
|
140
|
+
* {@link extractStructuredData} then `renderLlmText` (`@dragon708/docmind-markdown`). For a structured value you already have, that package's `extractLlmContent` matches `renderLlmText` (no file I/O).
|
|
132
141
|
*/
|
|
133
142
|
declare function extractLlmContent(input: BrowserAnalyzeInput, options?: BrowserExtractLlmContentOptions): Promise<string>;
|
|
134
143
|
/**
|
|
135
|
-
* Structured extract → `renderMarkdownSections`
|
|
144
|
+
* Structured extract → `renderMarkdownSections` (`splitStructuredIntoChunks` with Markdown; same as
|
|
145
|
+
* `extractStructuredChunks` alias in `@dragon708/docmind-markdown`).
|
|
136
146
|
*/
|
|
137
147
|
declare function extractStructuredChunks(input: BrowserAnalyzeInput, options?: BrowserExtractStructuredChunksOptions): Promise<readonly MarkdownSection[]>;
|
|
138
148
|
|
|
139
149
|
/** High-level features the user can ask DocMind for (per input kind and runtime). */
|
|
140
|
-
type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages" | "structured-output"
|
|
150
|
+
type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages" | "structured-output"
|
|
151
|
+
/** Browser: {@link extractMarkdown} via `@dragon708/docmind-markdown` `extractMarkdown` + structured fallback (PDF empty; DOCX structured path when binary converter is Node-only). */
|
|
152
|
+
| "markdown" | "llm-text" | "structured-chunks" | "image-normalization" | "gif-first-frame" | "bmp" | "heic" | "tiff";
|
|
141
153
|
declare function docxIncludeRequested(flags?: AnalyzeDocxIncludeFlags): boolean;
|
|
142
154
|
/** DOCX `word/media` en runtime browser (JSZip; sin pipeline Node). */
|
|
143
155
|
interface DocxEmbeddedImageCapabilities {
|
|
@@ -218,7 +230,7 @@ type BrowserExplainAnalysisPlanOptions = Omit<ExplainAnalysisPlanOptions, "inten
|
|
|
218
230
|
|
|
219
231
|
/**
|
|
220
232
|
* Epic 1 — **Capabilities:** detects kind from the same hints as `analyzeFile`, then lists which of
|
|
221
|
-
* `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output` | `markdown` | `llm-text` | `structured-chunks`
|
|
233
|
+
* `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output` | `markdown` (package `extractMarkdown` + structured fallback; PDF empty here) | `llm-text` | `structured-chunks` (split + Markdown sections)
|
|
222
234
|
* and image-specific ids (`image-normalization`, `bmp`, `gif-first-frame`, `heic`, `tiff`) apply in the browser (PDF always unsupported for meaningful content).
|
|
223
235
|
* No Mammoth/Tesseract/PDF parsing. For DOCX, {@link GetCapabilitiesReport.docxStructure} / `docxEmbeddedImages` describe v2 opt-in features.
|
|
224
236
|
*/
|
package/dist/index.js
CHANGED
|
@@ -4,7 +4,7 @@ import { extractStructuredDataFromDocx, analyzeDocx } from '@dragon708/docmind-d
|
|
|
4
4
|
export { extractStructuredDataFromDocx } from '@dragon708/docmind-docx';
|
|
5
5
|
import { extractStructuredDataFromImage, preprocessHasEffect, resolveImageFormat, normalizeImageForOcr, ocrTiff, ocrImageDetailed } from '@dragon708/docmind-ocr';
|
|
6
6
|
export { extractStructuredDataFromImage } from '@dragon708/docmind-ocr';
|
|
7
|
-
import {
|
|
7
|
+
import { extractMarkdown as extractMarkdown$1, renderLlmText, renderMarkdownSections } from '@dragon708/docmind-markdown';
|
|
8
8
|
|
|
9
9
|
// src/analyzeFile.ts
|
|
10
10
|
function assertBrowserInput(input) {
|
|
@@ -520,11 +520,26 @@ async function runOcr(input, options) {
|
|
|
520
520
|
return notImplementedResult(kind, "none", [UNKNOWN_FORMAT_WARNING]);
|
|
521
521
|
}
|
|
522
522
|
}
|
|
523
|
+
function browserFileHints(input) {
|
|
524
|
+
if (input instanceof File) {
|
|
525
|
+
return {
|
|
526
|
+
filename: input.name,
|
|
527
|
+
mimeType: input.type ? input.type : void 0
|
|
528
|
+
};
|
|
529
|
+
}
|
|
530
|
+
return {};
|
|
531
|
+
}
|
|
523
532
|
async function extractMarkdown(input, options) {
|
|
524
533
|
throwIfAborted(options?.signal);
|
|
525
534
|
const { markdown: markdownOpts, ...structuredOpts } = options ?? {};
|
|
526
535
|
const structured = await extractStructuredData(input, structuredOpts);
|
|
527
|
-
|
|
536
|
+
const data = await toUint8Array(input);
|
|
537
|
+
const hints = browserFileHints(input);
|
|
538
|
+
const r = await extractMarkdown$1(
|
|
539
|
+
{ data, filename: hints.filename, mimeType: hints.mimeType },
|
|
540
|
+
{ ...markdownOpts ?? {}, structuredFallback: structured }
|
|
541
|
+
);
|
|
542
|
+
return r.markdown;
|
|
528
543
|
}
|
|
529
544
|
async function extractLlmContent(input, options) {
|
|
530
545
|
throwIfAborted(options?.signal);
|
|
@@ -576,7 +591,7 @@ var TEXT_META_NOTE = "Plain text has no structured document metadata; extractMet
|
|
|
576
591
|
var OCR_OFF_NOTE = 'Image OCR may be skipped when `ocr.mode` is "off" in analyze options.';
|
|
577
592
|
var STRUCTURED_OCR_OFF = 'Structured image output uses OCR; when `ocr.mode` is "off", `extractStructuredData` returns an empty envelope with a warning.';
|
|
578
593
|
var UNKNOWN_KIND = "Could not determine file kind from name, MIME, or bytes; all features are reported as unsupported until the kind is known.";
|
|
579
|
-
var MARKDOWN_PDF_BROWSER = "PDF: no parser in-browser \u2014 extractMarkdown
|
|
594
|
+
var MARKDOWN_PDF_BROWSER = "PDF: no parser in-browser \u2014 `@opendataloader/pdf` is not loaded here. extractMarkdown still calls `extractMarkdown` in `@dragon708/docmind-markdown`, which falls back to the empty structured stub (same empty Markdown). extractLlmContent / extractStructuredChunks use the structured envelope only. Use @dragon708/docmind-node for real PDF \u2192 Markdown / LLM text / chunks.";
|
|
580
595
|
var MARKDOWN_IMAGE_OCR_OFF = 'Same as structured-output: when ocr.mode is "off", structured (and thus Markdown/LLM/chunk exports) are empty aside from warnings.';
|
|
581
596
|
function slot(id, supported, warnings) {
|
|
582
597
|
return warnings?.length ? { id, supported, warnings } : { id, supported };
|
|
@@ -619,13 +634,13 @@ function buildBrowserCapabilityReport(kind) {
|
|
|
619
634
|
"`extractStructuredData` uses `@dragon708/docmind-docx` (Mammoth + OOXML) and returns `StructuredDocumentResult`; optional `options.docx` slices are forwarded."
|
|
620
635
|
]),
|
|
621
636
|
slot("markdown", true, [
|
|
622
|
-
"extractMarkdown: structured DOCX \
|
|
637
|
+
"extractMarkdown: `@dragon708/docmind-markdown` `extractMarkdown` on bytes + structured fallback. The package\u2019s DOCX-bytes Mammoth\u2192Turndown path is Node-only; in-browser, Markdown is produced from `extractStructuredData` (Mammoth/OOXML in `@dragon708/docmind-docx`) via structured serialization, with a clear package warning that the binary shortcut is skipped."
|
|
623
638
|
]),
|
|
624
639
|
slot("llm-text", true, [
|
|
625
|
-
"extractLlmContent: structured \u2192
|
|
640
|
+
"extractLlmContent: structured envelope \u2192 `renderLlmText` (LLM-ready plain text; no binary PDF/DOCX Markdown routes)."
|
|
626
641
|
]),
|
|
627
642
|
slot("structured-chunks", true, [
|
|
628
|
-
"extractStructuredChunks: structured \u2192
|
|
643
|
+
"extractStructuredChunks: structured \u2192 `renderMarkdownSections` / `splitStructuredIntoChunks` (heading-aware chunking + optional parallel `text`)."
|
|
629
644
|
])
|
|
630
645
|
];
|
|
631
646
|
break;
|
|
@@ -664,7 +679,7 @@ function buildBrowserCapabilityReport(kind) {
|
|
|
664
679
|
"HEIC/HEIF and TIFF limitations match `getCapabilities` (`heic`, `tiff`) and OCR warnings."
|
|
665
680
|
]),
|
|
666
681
|
slot("markdown", true, [
|
|
667
|
-
"extractMarkdown: OCR structured layout \u2192 Markdown when OCR runs
|
|
682
|
+
"extractMarkdown: same bytes + structured fallback through package `extractMarkdown` when applicable; OCR structured layout \u2192 Markdown when OCR runs. HEIC unsupported; TIFF best-effort.",
|
|
668
683
|
MARKDOWN_IMAGE_OCR_OFF
|
|
669
684
|
]),
|
|
670
685
|
slot("llm-text", true, [
|
|
@@ -688,11 +703,13 @@ function buildBrowserCapabilityReport(kind) {
|
|
|
688
703
|
"`extractStructuredData` decodes UTF-8 (via `analyzeText`) and normalizes to `StructuredDocumentResult` (paragraph block rollup)."
|
|
689
704
|
]),
|
|
690
705
|
slot("markdown", true, [
|
|
691
|
-
"extractMarkdown:
|
|
706
|
+
"extractMarkdown: bytes + structured fallback through `@dragon708/docmind-markdown` `extractMarkdown` (typically structured serializer for UTF-8 text)."
|
|
707
|
+
]),
|
|
708
|
+
slot("llm-text", true, [
|
|
709
|
+
"extractLlmContent: UTF-8 structured rollup \u2192 `renderLlmText` in `@dragon708/docmind-markdown`."
|
|
692
710
|
]),
|
|
693
|
-
slot("llm-text", true, ["extractLlmContent: UTF-8 structured \u2192 LLM-oriented plain text."]),
|
|
694
711
|
slot("structured-chunks", true, [
|
|
695
|
-
"extractStructuredChunks: typically
|
|
712
|
+
"extractStructuredChunks: typically one Markdown section when only paragraph rollup exists."
|
|
696
713
|
])
|
|
697
714
|
];
|
|
698
715
|
break;
|
|
@@ -897,10 +914,22 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInclude, ocr
|
|
|
897
914
|
case "extractLlmContent":
|
|
898
915
|
case "extractStructuredChunks":
|
|
899
916
|
if (kind === "docx") {
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
917
|
+
if (intent === "extractStructuredData") {
|
|
918
|
+
nativeExtraction = {
|
|
919
|
+
willAttempt: true,
|
|
920
|
+
description: "`extractStructuredDataFromDocx`: Mammoth plus required OOXML extractors (structure, headings, tables, blocks, pagesApprox, embeddedImages unless disabled), then `normalizeToStructuredResult`. Optional `options.docx` is forwarded."
|
|
921
|
+
};
|
|
922
|
+
} else if (intent === "extractMarkdown") {
|
|
923
|
+
nativeExtraction = {
|
|
924
|
+
willAttempt: true,
|
|
925
|
+
description: "extractMarkdown: `extractStructuredData` (Mammoth/OOXML) for a full structured envelope, then `extractMarkdown` in `@dragon708/docmind-markdown`. The package\u2019s DOCX-bytes Mammoth\u2192Turndown shortcut is Node-only; in-browser Markdown uses structured serialization on that envelope (with a package warning)."
|
|
926
|
+
};
|
|
927
|
+
} else {
|
|
928
|
+
nativeExtraction = {
|
|
929
|
+
willAttempt: true,
|
|
930
|
+
description: `${String(intent)}: same structured DOCX pipeline as extractStructuredData, then \`@dragon708/docmind-markdown\` (\`renderLlmText\` or \`renderMarkdownSections\`).`
|
|
931
|
+
};
|
|
932
|
+
}
|
|
904
933
|
ocr = { mayUse: false, description: "DOCX structured path does not use OCR." };
|
|
905
934
|
limitations = lim(DOCX_ZIP_NOTE_BROWSER);
|
|
906
935
|
} else if (kind === "image") {
|
|
@@ -923,17 +952,21 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInclude, ocr
|
|
|
923
952
|
if (intent === "extractMarkdown") {
|
|
924
953
|
limitations = [
|
|
925
954
|
...limitations,
|
|
926
|
-
...lim(
|
|
955
|
+
...lim(
|
|
956
|
+
"Output: Markdown string from `extractMarkdown` in `@dragon708/docmind-markdown`. PDF in-browser: empty (no `@opendataloader/pdf`). DOCX: structured Markdown path when the binary converter is Node-only."
|
|
957
|
+
)
|
|
927
958
|
];
|
|
928
959
|
} else if (intent === "extractLlmContent") {
|
|
929
960
|
limitations = [
|
|
930
961
|
...limitations,
|
|
931
|
-
...lim("Output: compact plain text via renderLlmText.")
|
|
962
|
+
...lim("Output: compact plain text via `renderLlmText` (structured input only in this runtime).")
|
|
932
963
|
];
|
|
933
964
|
} else if (intent === "extractStructuredChunks") {
|
|
934
965
|
limitations = [
|
|
935
966
|
...limitations,
|
|
936
|
-
...lim(
|
|
967
|
+
...lim(
|
|
968
|
+
"Output: MarkdownSection[] via `renderMarkdownSections` (`splitStructuredIntoChunks` / `extractStructuredChunks` alias)."
|
|
969
|
+
)
|
|
937
970
|
];
|
|
938
971
|
}
|
|
939
972
|
break;
|
|
@@ -1088,7 +1121,21 @@ function planForIntent(intentOpt, kind, ocrMode, docxInclude, ocr, analyzeFileOu
|
|
|
1088
1121
|
};
|
|
1089
1122
|
}
|
|
1090
1123
|
}
|
|
1091
|
-
if (intent === "extractMarkdown"
|
|
1124
|
+
if (intent === "extractMarkdown") {
|
|
1125
|
+
const sub = planForIntent(
|
|
1126
|
+
"extractStructuredData",
|
|
1127
|
+
kind,
|
|
1128
|
+
ocrMode,
|
|
1129
|
+
docxInclude,
|
|
1130
|
+
ocr,
|
|
1131
|
+
analyzeFileOutput
|
|
1132
|
+
);
|
|
1133
|
+
return {
|
|
1134
|
+
intent,
|
|
1135
|
+
steps: [...sub.steps ?? [], { id: "markdown_hybrid_package", status: "planned" }]
|
|
1136
|
+
};
|
|
1137
|
+
}
|
|
1138
|
+
if (intent === "extractLlmContent" || intent === "extractStructuredChunks") {
|
|
1092
1139
|
const sub = planForIntent(
|
|
1093
1140
|
"extractStructuredData",
|
|
1094
1141
|
kind,
|
|
@@ -1101,7 +1148,10 @@ function planForIntent(intentOpt, kind, ocrMode, docxInclude, ocr, analyzeFileOu
|
|
|
1101
1148
|
intent,
|
|
1102
1149
|
steps: [
|
|
1103
1150
|
...sub.steps ?? [],
|
|
1104
|
-
{
|
|
1151
|
+
{
|
|
1152
|
+
id: intent === "extractLlmContent" ? "docmind_markdown_llm_text" : "docmind_markdown_chunk_sections",
|
|
1153
|
+
status: "planned"
|
|
1154
|
+
}
|
|
1105
1155
|
]
|
|
1106
1156
|
};
|
|
1107
1157
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@dragon708/docmind-browser",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.8.0",
|
|
4
4
|
"description": "Official DocMind browser facade: analyzeFile and intent APIs (DOCX, image OCR, text). PDF and fs paths use @dragon708/docmind-node.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"sideEffects": false,
|
|
@@ -34,7 +34,7 @@
|
|
|
34
34
|
"license": "MIT",
|
|
35
35
|
"dependencies": {
|
|
36
36
|
"@dragon708/docmind-docx": "^1.8.0",
|
|
37
|
-
"@dragon708/docmind-markdown": "^1.
|
|
37
|
+
"@dragon708/docmind-markdown": "^1.1.0",
|
|
38
38
|
"@dragon708/docmind-ocr": "^1.1.4",
|
|
39
39
|
"@dragon708/docmind-shared": "^1.2.0"
|
|
40
40
|
},
|