@dragon708/docmind-browser 1.8.3 → 1.8.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -1,7 +1,7 @@
1
1
  import { DocMindAnalyzeOptions, AnalyzeFileOutputOptions, NormalizeStructuredOptions, AnalysisResult, StructuredDocumentResult, FileKind, RuntimeDescriptor, DocMindPublicIntent, AnalysisAnalyzer, ProcessingPlanDescriptor, ExplainAnalysisPlanOptions, GetCapabilitiesOptions } from '@dragon708/docmind-shared';
2
2
  export { AnalysisAnalyzer, AnalysisResult, CapabilityDescriptor, DetectFileKindInput, DocMindPublicIntent, DocumentBlock, DocumentImageRef, DocumentPage, DocumentTable, DocxAnalysisCoreResult, ExplainAnalysisPlanOptions, ExplainAnalysisPlanResult, FileKind, FileKindMetadata, GenericAnalysisResult, GetCapabilitiesOptions, GetCapabilitiesResult, ImageAnalysisCoreResult, PdfAnalysisCoreResult, StructuredDocumentResult, TextAnalysisResult, analyzeFileRequestsStructured, detectFileKind, isStructuredDocumentResult } from '@dragon708/docmind-shared';
3
- import { RenderLlmTextOptions, RenderMarkdownOptions, RenderMarkdownSectionsOptions, MarkdownSection } from '@dragon708/docmind-markdown';
4
- export { MarkdownSection } from '@dragon708/docmind-markdown';
3
+ import { RenderLlmTextOptions, RenderMarkdownOptions, ConvertHtmlToMarkdownOptions, ConvertCsvToMarkdownOptions, ConvertSpreadsheetToMarkdownOptions, ExtractMarkdownStrategy, ExtractMarkdownRoutingInfo, RenderMarkdownSectionsOptions, MarkdownSection } from '@dragon708/docmind-markdown';
4
+ export { ConvertCsvToMarkdownOptions, ConvertCsvToMarkdownResult, ConvertHtmlToMarkdownOptions, ConvertHtmlToMarkdownResult, ConvertSpreadsheetToMarkdownOptions, ConvertSpreadsheetToMarkdownResult, CsvStringInputMode, DetectedBinaryFormat, ExtractMarkdownMediaHint, ExtractMarkdownRoutingInfo, ExtractMarkdownStrategy, HtmlStringInputMode, MarkdownSection, convertCsvToMarkdown, convertHtmlToMarkdown, convertSpreadsheetToMarkdown, detectBinaryFormat } from '@dragon708/docmind-markdown';
5
5
  import { AnalyzeDocxIncludeFlags, DocxToHtmlOptions } from '@dragon708/docmind-docx';
6
6
  export { AnalyzeDocxIncludeFlags, ExtractStructuredDataFromDocxOptions, extractStructuredDataFromDocx } from '@dragon708/docmind-docx';
7
7
  import { OcrOptions, OcrTiffOptions, PreprocessImageOptions } from '@dragon708/docmind-ocr';
@@ -45,12 +45,26 @@ type BrowserExtractStructuredDataOptions = BrowserAnalyzeOptions & {
45
45
  };
46
46
  /**
47
47
  * {@link extractMarkdown}: structured options plus `markdown` (passed through to `extractMarkdown` in
48
- * `@dragon708/docmind-markdown`, including structured-serializer knobs). Binary PDF/DOCX converters inside that
49
- * package are not used for PDF in-browser and the DOCX bytes→Turndown path is Node-only; the browser still
50
- * gets correct DOCX Markdown via structured fallback from {@link extractStructuredData}.
48
+ * `@dragon708/docmind-markdown`). **Cognipeer** (`convertPdfToMarkdown`, `convertHtmlToMarkdown`, `convertCsvToMarkdown`,
49
+ * `convertSpreadsheetToMarkdown`) is **Node-only**; in-browser those routes yield `*-unsupported-runtime` and use
50
+ * structured fallback. DOCX bytes→Mammoth→Turndown is also Node-only; the browser still gets DOCX Markdown via structured
51
+ * serialization. Optional `markdownHtml` / `markdownCsv` / `markdownSpreadsheet` forward to the same package for API parity
52
+ * with `@dragon708/docmind-node` `NodeExtractMarkdownOptions`.
51
53
  */
52
54
  interface BrowserExtractMarkdownOptions extends BrowserExtractStructuredDataOptions {
53
55
  readonly markdown?: RenderMarkdownOptions;
56
+ readonly markdownHtml?: ConvertHtmlToMarkdownOptions;
57
+ readonly markdownCsv?: ConvertCsvToMarkdownOptions;
58
+ readonly markdownSpreadsheet?: ConvertSpreadsheetToMarkdownOptions;
59
+ /**
60
+ * Optional: `strategy`, merged `warnings`, and `routing` from `@dragon708/docmind-markdown` `extractMarkdown`
61
+ * (e.g. `html-unsupported-runtime` + `routingSummary` in the browser).
62
+ */
63
+ readonly onMarkdownExtract?: (info: {
64
+ strategy: ExtractMarkdownStrategy;
65
+ warnings: readonly string[];
66
+ routing?: ExtractMarkdownRoutingInfo;
67
+ }) => void;
54
68
  }
55
69
  /** {@link extractLlmContent}: optional `llm` passed to `renderLlmText`. */
56
70
  interface BrowserExtractLlmContentOptions extends BrowserExtractStructuredDataOptions {
@@ -126,18 +140,22 @@ declare function extractStructuredData(input: BrowserAnalyzeInput, options?: Bro
126
140
  * {@link extractStructuredData} for a full structured envelope, then `extractMarkdown` from
127
141
  * `@dragon708/docmind-markdown` on `{ data, filename?, mimeType? }` with that result as `structuredFallback`.
128
142
  *
129
- * - **PDF:** the markdown package does not load `@opendataloader/pdf` here; output comes from the structured
130
- * fallback (empty in-browser stub see {@link getCapabilities}).
143
+ * - **PDF / HTML / CSV / Excel:** Cognipeer-backed specialized routes are **Node-only**. The markdown package detects
144
+ * format, returns `pdf-unsupported-runtime` / `html-unsupported-runtime` / etc. with warnings, and uses structured fallback
145
+ * — for PDF in this facade the envelope is an **empty stub** (see {@link getCapabilities}); for HTML/CSV/XLSX classified as
146
+ * `text`, structured fallback still yields Markdown from UTF-8 rollup when applicable.
131
147
  * - **DOCX:** the package’s direct bytes → Mammoth → Turndown path is **Node-only**; in-browser, Markdown is
132
148
  * produced via `convertStructuredToMarkdown` on the structured envelope (still Mammoth/OOXML-backed via
133
149
  * `@dragon708/docmind-docx`), with an explanatory warning from the markdown package.
134
150
  * - **Text / image:** unidentified or non-binary bytes use the same structured serializer.
135
151
  *
136
- * @param options - `markdown` options plus the same routing as {@link extractStructuredData} (`ocr`, `docx`, `normalize`).
152
+ * @param options - `markdown` and optional `markdownHtml` / `markdownCsv` / `markdownSpreadsheet` (forwarded to the package),
153
+ * plus the same routing as {@link extractStructuredData} (`ocr`, `docx`, `normalize`).
137
154
  */
138
155
  declare function extractMarkdown(input: BrowserAnalyzeInput, options?: BrowserExtractMarkdownOptions): Promise<string>;
139
156
  /**
140
- * {@link extractStructuredData} then `renderLlmText` (`@dragon708/docmind-markdown`). For a structured value you already have, that package's `extractLlmContent` matches `renderLlmText` (no file I/O).
157
+ * {@link extractStructuredData} then `renderLlmText` (`@dragon708/docmind-markdown`). No binary PDF/DOCX Markdown
158
+ * pipelines run here; for PDF in-browser the envelope is empty and warnings describe the limitation.
141
159
  */
142
160
  declare function extractLlmContent(input: BrowserAnalyzeInput, options?: BrowserExtractLlmContentOptions): Promise<string>;
143
161
  /**
@@ -148,7 +166,7 @@ declare function extractStructuredChunks(input: BrowserAnalyzeInput, options?: B
148
166
 
149
167
  /** High-level features the user can ask DocMind for (per input kind and runtime). */
150
168
  type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages" | "structured-output"
151
- /** Browser: {@link extractMarkdown} via `@dragon708/docmind-markdown` `extractMarkdown` + structured fallback (PDF empty; DOCX structured path when binary converter is Node-only). */
169
+ /** Browser: {@link extractMarkdown} via `@dragon708/docmind-markdown` + structured fallback (PDF: no specialized Markdown; DOCX: structured path when bytes→Turndown is Node-only). */
152
170
  | "markdown" | "llm-text" | "structured-chunks" | "image-normalization" | "gif-first-frame" | "bmp" | "heic" | "tiff";
153
171
  declare function docxIncludeRequested(flags?: AnalyzeDocxIncludeFlags): boolean;
154
172
  /** DOCX `word/media` in the browser runtime (JSZip; no Node pipeline). */
@@ -230,9 +248,9 @@ type BrowserExplainAnalysisPlanOptions = Omit<ExplainAnalysisPlanOptions, "inten
230
248
 
231
249
  /**
232
250
  * Epic 1 — **Capabilities:** detects kind from the same hints as `analyzeFile`, then lists which of
233
- * `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output` | `markdown` (package `extractMarkdown` + structured fallback; PDF empty here) | `llm-text` | `structured-chunks` (split + Markdown sections)
251
+ * `text` | `metadata` | `html` | `ocr` | `pages` | `structured-output` | `markdown` (hybrid `extractMarkdown`; Cognipeer-backed specialized PDF/HTML/CSV/Excel Markdown is **not** supported here; use `@dragon708/docmind-node`) | `llm-text` | `structured-chunks`
234
252
  * and image-specific ids (`image-normalization`, `bmp`, `gif-first-frame`, `heic`, `tiff`) apply in the browser (PDF always unsupported for meaningful content).
235
- * No Mammoth/Tesseract/PDF parsing. For DOCX, {@link GetCapabilitiesReport.docxStructure} / `docxEmbeddedImages` describe v2 opt-in features.
253
+ * No PDF parser; Mammoth/Tesseract apply to DOCX/images only. For DOCX, {@link GetCapabilitiesReport.docxStructure} / `docxEmbeddedImages` describe v2 opt-in features.
236
254
  */
237
255
  declare function getCapabilities(input: BrowserAnalyzeInput, options?: GetCapabilitiesOptions): Promise<GetCapabilitiesReport>;
238
256
  /**
package/dist/index.js CHANGED
@@ -5,6 +5,7 @@ export { extractStructuredDataFromDocx } from '@dragon708/docmind-docx';
5
5
  import { extractStructuredDataFromImage, preprocessHasEffect, resolveImageFormat, normalizeImageForOcr, ocrTiff, ocrImageDetailed } from '@dragon708/docmind-ocr';
6
6
  export { extractStructuredDataFromImage } from '@dragon708/docmind-ocr';
7
7
  import { extractMarkdown as extractMarkdown$1, renderLlmText, renderMarkdownSections } from '@dragon708/docmind-markdown';
8
+ export { convertCsvToMarkdown, convertHtmlToMarkdown, convertSpreadsheetToMarkdown, detectBinaryFormat } from '@dragon708/docmind-markdown';
8
9
 
9
10
  // src/analyzeFile.ts
10
11
  function assertBrowserInput(input) {
@@ -531,14 +532,28 @@ function browserFileHints(input) {
531
532
  }
532
533
  async function extractMarkdown(input, options) {
533
534
  throwIfAborted(options?.signal);
534
- const { markdown: markdownOpts, ...structuredOpts } = options ?? {};
535
+ const {
536
+ markdown: markdownOpts,
537
+ markdownHtml,
538
+ markdownCsv,
539
+ markdownSpreadsheet,
540
+ onMarkdownExtract,
541
+ ...structuredOpts
542
+ } = options ?? {};
535
543
  const structured = await extractStructuredData(input, structuredOpts);
536
544
  const data = await toUint8Array(input);
537
545
  const hints = browserFileHints(input);
538
546
  const r = await extractMarkdown$1(
539
547
  { data, filename: hints.filename, mimeType: hints.mimeType },
540
- { ...markdownOpts ?? {}, structuredFallback: structured }
548
+ {
549
+ ...markdownOpts ?? {},
550
+ ...markdownHtml !== void 0 ? { html: markdownHtml } : {},
551
+ ...markdownCsv !== void 0 ? { csv: markdownCsv } : {},
552
+ ...markdownSpreadsheet !== void 0 ? { spreadsheet: markdownSpreadsheet } : {},
553
+ structuredFallback: structured
554
+ }
541
555
  );
556
+ onMarkdownExtract?.({ strategy: r.strategy, warnings: r.warnings, routing: r.routing });
542
557
  return r.markdown;
543
558
  }
544
559
  async function extractLlmContent(input, options) {
@@ -591,7 +606,8 @@ var TEXT_META_NOTE = "Plain text has no structured document metadata; extractMet
591
606
  var OCR_OFF_NOTE = 'Image OCR may be skipped when `ocr.mode` is "off" in analyze options.';
592
607
  var STRUCTURED_OCR_OFF = 'Structured image output uses OCR; when `ocr.mode` is "off", `extractStructuredData` returns an empty envelope with a warning.';
593
608
  var UNKNOWN_KIND = "Could not determine file kind from name, MIME, or bytes; all features are reported as unsupported until the kind is known.";
594
- var MARKDOWN_PDF_BROWSER = "PDF: no parser in-browser \u2014 `@opendataloader/pdf` is not loaded here. extractMarkdown still calls `extractMarkdown` in `@dragon708/docmind-markdown`, which falls back to the empty structured stub (same empty Markdown). extractLlmContent / extractStructuredChunks use the structured envelope only. Use @dragon708/docmind-node for real PDF \u2192 Markdown / LLM text / chunks.";
609
+ var MARKDOWN_PDF_BROWSER = "PDF: no specialized PDF\u2192Markdown in-browser (`@cognipeer/to-markdown` / `convertPdfToMarkdown` are Node-only). `extractMarkdown` still calls `@dragon708/docmind-markdown` `extractMarkdown`, which skips the Node pipeline, warns (`pdf-unsupported-runtime`), and uses structured fallback \u2014 here that envelope is empty, so Markdown stays empty. `extractLlmContent` / `extractStructuredChunks` use the same empty structured envelope with browser PDF warnings. Use @dragon708/docmind-node for real PDF extraction and Markdown / LLM text / chunks.";
610
+ var MARKDOWN_COGNIPEER_HTML_CSV_XLSX_BROWSER = "HTML / CSV / Excel: `convertHtmlToMarkdown`, `convertCsvToMarkdown`, and `convertSpreadsheetToMarkdown` (same Cognipeer stack as PDF) are Node-only. In-browser `extractMarkdown` detects those formats, warns (`html-unsupported-runtime`, `csv-unsupported-runtime`, `spreadsheet-unsupported-runtime`), and uses structured fallback (UTF-8 text rollup for `text` kind, or OCR/layout for images). Use @dragon708/docmind-node for specialized conversion.";
595
611
  var MARKDOWN_IMAGE_OCR_OFF = 'Same as structured-output: when ocr.mode is "off", structured (and thus Markdown/LLM/chunk exports) are empty aside from warnings.';
596
612
  function slot(id, supported, warnings) {
597
613
  return warnings?.length ? { id, supported, warnings } : { id, supported };
@@ -703,7 +719,8 @@ function buildBrowserCapabilityReport(kind) {
703
719
  "`extractStructuredData` decodes UTF-8 (via `analyzeText`) and normalizes to `StructuredDocumentResult` (paragraph block rollup)."
704
720
  ]),
705
721
  slot("markdown", true, [
706
- "extractMarkdown: bytes + structured fallback through `@dragon708/docmind-markdown` `extractMarkdown` (typically structured serializer for UTF-8 text)."
722
+ "extractMarkdown: bytes + structured fallback through `@dragon708/docmind-markdown` `extractMarkdown` (typically structured serializer for UTF-8 text).",
723
+ MARKDOWN_COGNIPEER_HTML_CSV_XLSX_BROWSER
707
724
  ]),
708
725
  slot("llm-text", true, [
709
726
  "extractLlmContent: UTF-8 structured rollup \u2192 `renderLlmText` in `@dragon708/docmind-markdown`."
@@ -770,7 +787,7 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInclude, ocr
770
787
  const structuredLikeIntent = intent === "extractStructuredData" || intent === "extractMarkdown" || intent === "extractLlmContent" || intent === "extractStructuredChunks";
771
788
  limitations = lim(
772
789
  BROWSER_PDF_UNSUPPORTED_WARNING,
773
- structuredLikeIntent ? "`extractStructuredData` / extractMarkdown / extractLlmContent / extractStructuredChunks only see an empty structured envelope in-browser for PDF; use @dragon708/docmind-node for real PDF extraction and Markdown/LLM/chunk exports." : ""
790
+ structuredLikeIntent ? "`extractStructuredData` / extractMarkdown / extractLlmContent / extractStructuredChunks only see an empty structured envelope in-browser for PDF (no `@cognipeer/to-markdown`; specialized PDF\u2192Markdown runs on Node via @dragon708/docmind-node)." : ""
774
791
  );
775
792
  nativeExtraction = {
776
793
  willAttempt: false,
@@ -953,7 +970,7 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInclude, ocr
953
970
  limitations = [
954
971
  ...limitations,
955
972
  ...lim(
956
- "Output: Markdown string from `extractMarkdown` in `@dragon708/docmind-markdown`. PDF in-browser: empty (no `@opendataloader/pdf`). DOCX: structured Markdown path when the binary converter is Node-only."
973
+ "Output: Markdown string from `extractMarkdown` in `@dragon708/docmind-markdown`. Cognipeer (PDF/HTML/CSV/Excel) is Node-only in-browser \u2014 `*-unsupported-runtime` + structured fallback. DOCX: structured Markdown when bytes\u2192Turndown is Node-only."
957
974
  )
958
975
  ];
959
976
  } else if (intent === "extractLlmContent") {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@dragon708/docmind-browser",
3
- "version": "1.8.3",
3
+ "version": "1.8.5",
4
4
  "description": "Official DocMind browser facade: analyzeFile and intent APIs (DOCX, image OCR, text). PDF and fs paths use @dragon708/docmind-node.",
5
5
  "type": "module",
6
6
  "sideEffects": false,
@@ -34,7 +34,7 @@
34
34
  "license": "MIT",
35
35
  "dependencies": {
36
36
  "@dragon708/docmind-docx": "^1.8.0",
37
- "@dragon708/docmind-markdown": "^1.1.3",
37
+ "@dragon708/docmind-markdown": "^1.2.8",
38
38
  "@dragon708/docmind-ocr": "^1.1.4",
39
39
  "@dragon708/docmind-shared": "^1.2.0"
40
40
  },