@dragon708/docmind-browser 1.8.4 → 1.8.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +26 -10
- package/dist/index.js +21 -4
- package/package.json +2 -2
package/dist/index.d.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { DocMindAnalyzeOptions, AnalyzeFileOutputOptions, NormalizeStructuredOptions, AnalysisResult, StructuredDocumentResult, FileKind, RuntimeDescriptor, DocMindPublicIntent, AnalysisAnalyzer, ProcessingPlanDescriptor, ExplainAnalysisPlanOptions, GetCapabilitiesOptions } from '@dragon708/docmind-shared';
|
|
2
2
|
export { AnalysisAnalyzer, AnalysisResult, CapabilityDescriptor, DetectFileKindInput, DocMindPublicIntent, DocumentBlock, DocumentImageRef, DocumentPage, DocumentTable, DocxAnalysisCoreResult, ExplainAnalysisPlanOptions, ExplainAnalysisPlanResult, FileKind, FileKindMetadata, GenericAnalysisResult, GetCapabilitiesOptions, GetCapabilitiesResult, ImageAnalysisCoreResult, PdfAnalysisCoreResult, StructuredDocumentResult, TextAnalysisResult, analyzeFileRequestsStructured, detectFileKind, isStructuredDocumentResult } from '@dragon708/docmind-shared';
|
|
3
|
-
import { RenderLlmTextOptions, RenderMarkdownOptions, RenderMarkdownSectionsOptions, MarkdownSection } from '@dragon708/docmind-markdown';
|
|
4
|
-
export { MarkdownSection } from '@dragon708/docmind-markdown';
|
|
3
|
+
import { RenderLlmTextOptions, RenderMarkdownOptions, ConvertHtmlToMarkdownOptions, ConvertCsvToMarkdownOptions, ConvertSpreadsheetToMarkdownOptions, ExtractMarkdownStrategy, ExtractMarkdownRoutingInfo, RenderMarkdownSectionsOptions, MarkdownSection } from '@dragon708/docmind-markdown';
|
|
4
|
+
export { ConvertCsvToMarkdownOptions, ConvertCsvToMarkdownResult, ConvertHtmlToMarkdownOptions, ConvertHtmlToMarkdownResult, ConvertSpreadsheetToMarkdownOptions, ConvertSpreadsheetToMarkdownResult, CsvStringInputMode, DetectedBinaryFormat, ExtractMarkdownMediaHint, ExtractMarkdownRoutingInfo, ExtractMarkdownStrategy, HtmlStringInputMode, MarkdownSection, convertCsvToMarkdown, convertHtmlToMarkdown, convertSpreadsheetToMarkdown, detectBinaryFormat } from '@dragon708/docmind-markdown';
|
|
5
5
|
import { AnalyzeDocxIncludeFlags, DocxToHtmlOptions } from '@dragon708/docmind-docx';
|
|
6
6
|
export { AnalyzeDocxIncludeFlags, ExtractStructuredDataFromDocxOptions, extractStructuredDataFromDocx } from '@dragon708/docmind-docx';
|
|
7
7
|
import { OcrOptions, OcrTiffOptions, PreprocessImageOptions } from '@dragon708/docmind-ocr';
|
|
@@ -45,12 +45,26 @@ type BrowserExtractStructuredDataOptions = BrowserAnalyzeOptions & {
|
|
|
45
45
|
};
|
|
46
46
|
/**
|
|
47
47
|
* {@link extractMarkdown}: structured options plus `markdown` (passed through to `extractMarkdown` in
|
|
48
|
-
* `@dragon708/docmind-markdown`).
|
|
49
|
-
*
|
|
50
|
-
* DOCX
|
|
48
|
+
* `@dragon708/docmind-markdown`). **Cognipeer** (`convertPdfToMarkdown`, `convertHtmlToMarkdown`, `convertCsvToMarkdown`,
|
|
49
|
+
* `convertSpreadsheetToMarkdown`) is **Node-only**; in-browser those routes yield `*-unsupported-runtime` and use
|
|
50
|
+
* structured fallback. DOCX bytes→Mammoth→Turndown is also Node-only; the browser still gets DOCX Markdown via structured
|
|
51
|
+
* serialization. Optional `markdownHtml` / `markdownCsv` / `markdownSpreadsheet` forward to the same package for API parity
|
|
52
|
+
* with `@dragon708/docmind-node` `NodeExtractMarkdownOptions`.
|
|
51
53
|
*/
|
|
52
54
|
interface BrowserExtractMarkdownOptions extends BrowserExtractStructuredDataOptions {
|
|
53
55
|
readonly markdown?: RenderMarkdownOptions;
|
|
56
|
+
readonly markdownHtml?: ConvertHtmlToMarkdownOptions;
|
|
57
|
+
readonly markdownCsv?: ConvertCsvToMarkdownOptions;
|
|
58
|
+
readonly markdownSpreadsheet?: ConvertSpreadsheetToMarkdownOptions;
|
|
59
|
+
/**
|
|
60
|
+
* Optional: `strategy`, merged `warnings`, and `routing` from `@dragon708/docmind-markdown` `extractMarkdown`
|
|
61
|
+
* (e.g. `html-unsupported-runtime` + `routingSummary` in the browser).
|
|
62
|
+
*/
|
|
63
|
+
readonly onMarkdownExtract?: (info: {
|
|
64
|
+
strategy: ExtractMarkdownStrategy;
|
|
65
|
+
warnings: readonly string[];
|
|
66
|
+
routing?: ExtractMarkdownRoutingInfo;
|
|
67
|
+
}) => void;
|
|
54
68
|
}
|
|
55
69
|
/** {@link extractLlmContent}: optional `llm` passed to `renderLlmText`. */
|
|
56
70
|
interface BrowserExtractLlmContentOptions extends BrowserExtractStructuredDataOptions {
|
|
@@ -126,15 +140,17 @@ declare function extractStructuredData(input: BrowserAnalyzeInput, options?: Bro
|
|
|
126
140
|
* {@link extractStructuredData} for a full structured envelope, then `extractMarkdown` from
|
|
127
141
|
* `@dragon708/docmind-markdown` on `{ data, filename?, mimeType? }` with that result as `structuredFallback`.
|
|
128
142
|
*
|
|
129
|
-
* - **PDF
|
|
130
|
-
*
|
|
131
|
-
*
|
|
143
|
+
* - **PDF / HTML / CSV / Excel:** Cognipeer-backed specialized routes are **Node-only**. The markdown package detects
|
|
144
|
+
* format, returns `pdf-unsupported-runtime` / `html-unsupported-runtime` / etc. with warnings, and uses structured fallback
|
|
145
|
+
* — for PDF in this facade the envelope is an **empty stub** (see {@link getCapabilities}); for HTML/CSV/XLSX classified as
|
|
146
|
+
* `text`, structured fallback still yields Markdown from UTF-8 rollup when applicable.
|
|
132
147
|
* - **DOCX:** the package’s direct bytes → Mammoth → Turndown path is **Node-only**; in-browser, Markdown is
|
|
133
148
|
* produced via `convertStructuredToMarkdown` on the structured envelope (still Mammoth/OOXML-backed via
|
|
134
149
|
* `@dragon708/docmind-docx`), with an explanatory warning from the markdown package.
|
|
135
150
|
* - **Text / image:** unidentified or non-binary bytes use the same structured serializer.
|
|
136
151
|
*
|
|
137
|
-
* @param options - `markdown`
|
|
152
|
+
* @param options - `markdown` and optional `markdownHtml` / `markdownCsv` / `markdownSpreadsheet` (forwarded to the package),
|
|
153
|
+
* plus the same routing as {@link extractStructuredData} (`ocr`, `docx`, `normalize`).
|
|
138
154
|
*/
|
|
139
155
|
declare function extractMarkdown(input: BrowserAnalyzeInput, options?: BrowserExtractMarkdownOptions): Promise<string>;
|
|
140
156
|
/**
|
|
@@ -232,7 +248,7 @@ type BrowserExplainAnalysisPlanOptions = Omit<ExplainAnalysisPlanOptions, "inten
|
|
|
232
248
|
|
|
233
249
|
/**
|
|
234
250
|
* Epic 1 — **Capabilities:** detects kind from the same hints as `analyzeFile`, then lists which of
|
|
235
|
-
* `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output` | `markdown` (hybrid `extractMarkdown`; **no**
|
|
251
|
+
* `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output` | `markdown` (hybrid `extractMarkdown`; **no** Cognipeer PDF/HTML/CSV/Excel — specialized Markdown unsupported here; use `@dragon708/docmind-node`) | `llm-text` | `structured-chunks`
|
|
236
252
|
* and image-specific ids (`image-normalization`, `bmp`, `gif-first-frame`, `heic`, `tiff`) apply in the browser (PDF always unsupported for meaningful content).
|
|
237
253
|
* No PDF parser; Mammoth/Tesseract apply to DOCX/images only. For DOCX, {@link GetCapabilitiesReport.docxStructure} / `docxEmbeddedImages` describe v2 opt-in features.
|
|
238
254
|
*/
|
package/dist/index.js
CHANGED
|
@@ -5,6 +5,7 @@ export { extractStructuredDataFromDocx } from '@dragon708/docmind-docx';
|
|
|
5
5
|
import { extractStructuredDataFromImage, preprocessHasEffect, resolveImageFormat, normalizeImageForOcr, ocrTiff, ocrImageDetailed } from '@dragon708/docmind-ocr';
|
|
6
6
|
export { extractStructuredDataFromImage } from '@dragon708/docmind-ocr';
|
|
7
7
|
import { extractMarkdown as extractMarkdown$1, renderLlmText, renderMarkdownSections } from '@dragon708/docmind-markdown';
|
|
8
|
+
export { convertCsvToMarkdown, convertHtmlToMarkdown, convertSpreadsheetToMarkdown, detectBinaryFormat } from '@dragon708/docmind-markdown';
|
|
8
9
|
|
|
9
10
|
// src/analyzeFile.ts
|
|
10
11
|
function assertBrowserInput(input) {
|
|
@@ -531,14 +532,28 @@ function browserFileHints(input) {
|
|
|
531
532
|
}
|
|
532
533
|
async function extractMarkdown(input, options) {
|
|
533
534
|
throwIfAborted(options?.signal);
|
|
534
|
-
const {
|
|
535
|
+
const {
|
|
536
|
+
markdown: markdownOpts,
|
|
537
|
+
markdownHtml,
|
|
538
|
+
markdownCsv,
|
|
539
|
+
markdownSpreadsheet,
|
|
540
|
+
onMarkdownExtract,
|
|
541
|
+
...structuredOpts
|
|
542
|
+
} = options ?? {};
|
|
535
543
|
const structured = await extractStructuredData(input, structuredOpts);
|
|
536
544
|
const data = await toUint8Array(input);
|
|
537
545
|
const hints = browserFileHints(input);
|
|
538
546
|
const r = await extractMarkdown$1(
|
|
539
547
|
{ data, filename: hints.filename, mimeType: hints.mimeType },
|
|
540
|
-
{
|
|
548
|
+
{
|
|
549
|
+
...markdownOpts ?? {},
|
|
550
|
+
...markdownHtml !== void 0 ? { html: markdownHtml } : {},
|
|
551
|
+
...markdownCsv !== void 0 ? { csv: markdownCsv } : {},
|
|
552
|
+
...markdownSpreadsheet !== void 0 ? { spreadsheet: markdownSpreadsheet } : {},
|
|
553
|
+
structuredFallback: structured
|
|
554
|
+
}
|
|
541
555
|
);
|
|
556
|
+
onMarkdownExtract?.({ strategy: r.strategy, warnings: r.warnings, routing: r.routing });
|
|
542
557
|
return r.markdown;
|
|
543
558
|
}
|
|
544
559
|
async function extractLlmContent(input, options) {
|
|
@@ -592,6 +607,7 @@ var OCR_OFF_NOTE = 'Image OCR may be skipped when `ocr.mode` is "off" in analyze
|
|
|
592
607
|
var STRUCTURED_OCR_OFF = 'Structured image output uses OCR; when `ocr.mode` is "off", `extractStructuredData` returns an empty envelope with a warning.';
|
|
593
608
|
var UNKNOWN_KIND = "Could not determine file kind from name, MIME, or bytes; all features are reported as unsupported until the kind is known.";
|
|
594
609
|
var MARKDOWN_PDF_BROWSER = "PDF: no specialized PDF\u2192Markdown in-browser (`@cognipeer/to-markdown` / `convertPdfToMarkdown` are Node-only). `extractMarkdown` still calls `@dragon708/docmind-markdown` `extractMarkdown`, which skips the Node pipeline, warns (`pdf-unsupported-runtime`), and uses structured fallback \u2014 here that envelope is empty, so Markdown stays empty. `extractLlmContent` / `extractStructuredChunks` use the same empty structured envelope with browser PDF warnings. Use @dragon708/docmind-node for real PDF extraction and Markdown / LLM text / chunks.";
|
|
610
|
+
var MARKDOWN_COGNIPEER_HTML_CSV_XLSX_BROWSER = "HTML / CSV / Excel: `convertHtmlToMarkdown`, `convertCsvToMarkdown`, and `convertSpreadsheetToMarkdown` (same Cognipeer stack as PDF) are Node-only. In-browser `extractMarkdown` detects those formats, warns (`html-unsupported-runtime`, `csv-unsupported-runtime`, `spreadsheet-unsupported-runtime`), and uses structured fallback (UTF-8 text rollup for `text` kind, or OCR/layout for images). Use @dragon708/docmind-node for specialized conversion.";
|
|
595
611
|
var MARKDOWN_IMAGE_OCR_OFF = 'Same as structured-output: when ocr.mode is "off", structured (and thus Markdown/LLM/chunk exports) are empty aside from warnings.';
|
|
596
612
|
function slot(id, supported, warnings) {
|
|
597
613
|
return warnings?.length ? { id, supported, warnings } : { id, supported };
|
|
@@ -703,7 +719,8 @@ function buildBrowserCapabilityReport(kind) {
|
|
|
703
719
|
"`extractStructuredData` decodes UTF-8 (via `analyzeText`) and normalizes to `StructuredDocumentResult` (paragraph block rollup)."
|
|
704
720
|
]),
|
|
705
721
|
slot("markdown", true, [
|
|
706
|
-
"extractMarkdown: bytes + structured fallback through `@dragon708/docmind-markdown` `extractMarkdown` (typically structured serializer for UTF-8 text)."
|
|
722
|
+
"extractMarkdown: bytes + structured fallback through `@dragon708/docmind-markdown` `extractMarkdown` (typically structured serializer for UTF-8 text).",
|
|
723
|
+
MARKDOWN_COGNIPEER_HTML_CSV_XLSX_BROWSER
|
|
707
724
|
]),
|
|
708
725
|
slot("llm-text", true, [
|
|
709
726
|
"extractLlmContent: UTF-8 structured rollup \u2192 `renderLlmText` in `@dragon708/docmind-markdown`."
|
|
@@ -953,7 +970,7 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInclude, ocr
|
|
|
953
970
|
limitations = [
|
|
954
971
|
...limitations,
|
|
955
972
|
...lim(
|
|
956
|
-
"Output: Markdown string from `extractMarkdown` in `@dragon708/docmind-markdown`. PDF in-browser
|
|
973
|
+
"Output: Markdown string from `extractMarkdown` in `@dragon708/docmind-markdown`. Cognipeer (PDF/HTML/CSV/Excel) is Node-only in-browser \u2014 `*-unsupported-runtime` + structured fallback. DOCX: structured Markdown when bytes\u2192Turndown is Node-only."
|
|
957
974
|
)
|
|
958
975
|
];
|
|
959
976
|
} else if (intent === "extractLlmContent") {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@dragon708/docmind-browser",
|
|
3
|
-
"version": "1.8.4",
|
|
3
|
+
"version": "1.8.6",
|
|
4
4
|
"description": "Official DocMind browser facade: analyzeFile and intent APIs (DOCX, image OCR, text). PDF and fs paths use @dragon708/docmind-node.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"sideEffects": false,
|
|
@@ -34,7 +34,7 @@
|
|
|
34
34
|
"license": "MIT",
|
|
35
35
|
"dependencies": {
|
|
36
36
|
"@dragon708/docmind-docx": "^1.8.0",
|
|
37
|
-
"@dragon708/docmind-markdown": "^1.2.
|
|
37
|
+
"@dragon708/docmind-markdown": "^1.2.9",
|
|
38
38
|
"@dragon708/docmind-ocr": "^1.1.4",
|
|
39
39
|
"@dragon708/docmind-shared": "^1.2.0"
|
|
40
40
|
},
|