@dragon708/docmind-markdown 1.0.0 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/dist/index.d.ts +306 -9
  2. package/dist/index.js +819 -99
  3. package/package.json +19 -2
package/dist/index.d.ts CHANGED
@@ -5,8 +5,17 @@ export { DocumentBlock, DocumentPage, DocumentTable, StructuredDocumentResult }
5
5
  interface ConvertStructuredToMarkdownOptions {
6
6
  /**
7
7
  * When an `image-ref` block has no resolvable `src`, emit this string (default: HTML comment).
8
+ * Ignored when {@link imageMissingSrcMode} is `llm-label`.
8
9
  */
9
10
  readonly imagePlaceholder?: string;
11
+ /**
12
+ * How to render `image-ref` blocks whose resolved image has no `src` (OCR/placeholders).
13
+ * - `placeholder` — {@link imagePlaceholder} plus a short kind hint when known (default).
14
+ * - `llm-label` — plain Markdown italic label with id and alt, easy for models to read without HTML.
15
+ */
16
+ readonly imageMissingSrcMode?: "placeholder" | "llm-label";
17
+ /** If true, append `result.warnings` as a short Markdown section at the end (default: false). */
18
+ readonly appendWarningsSection?: boolean;
10
19
  /** If true, prepend a short human-readable metadata block when `result.metadata` has fields. */
11
20
  readonly includeMetadataHeader?: boolean;
12
21
  /**
@@ -30,6 +39,9 @@ type StructuredToMarkdownOptions = ConvertStructuredToMarkdownOptions;
30
39
  /**
31
40
  * Converts a {@link StructuredDocumentResult} to readable, semantic Markdown (GFM-style tables).
32
41
  *
42
+ * **Universal fallback** for DocMind: use when a format-specific pipeline (DOCX Mammoth, PDF OpenDataLoader, …)
43
+ * does not apply or fails, and for OCR / image / plain-text flows that already populate this shape.
44
+ *
33
45
  * Uses `blocks` in order; resolves `table` / `image-ref` via `tables` and `images`. When blocks are
34
46
  * empty or yield no output, falls back to the rollup `text`. Optional sections use `pages`, `metadata`,
35
47
  * and unreferenced `tables` / `images` according to options.
@@ -73,13 +85,29 @@ interface ConvertStructuredToLlmTextOptions {
73
85
  readonly compact?: boolean;
74
86
  /** Omit paragraph blocks whose trimmed text is empty (default `true`). */
75
87
  readonly skipEmptyParagraphs?: boolean;
88
+ /**
89
+ * Strip zero-width characters and normalize unusual spaces per line (default `true`).
90
+ * Keeps newlines; tuned for OCR / PDF paste noise.
91
+ */
92
+ readonly sanitizeNoise?: boolean;
93
+ /**
94
+ * When the table has two or more rows, emit a dashed rule after the first row (default `true`).
95
+ * Improves scanability for RAG/chat vs a flat pipe list.
96
+ */
97
+ readonly tableHeaderSeparator?: boolean;
98
+ /** Separator between cells in table rows (default ` | `). */
99
+ readonly tableColumnSeparator?: string;
76
100
  }
77
101
  /** @deprecated Use {@link ConvertStructuredToLlmTextOptions}. */
78
102
  type StructuredToLlmTextOptions = ConvertStructuredToLlmTextOptions;
79
103
  /**
80
104
  * Linearizes {@link StructuredDocumentResult} into plain text for prompts, RAG, and embeddings:
81
- * explicit `[Hn]` headings, readable tables, compact list lines, configurable page markers,
82
- * optional `[DOC]` metadata and `[WARNINGS]`. Not Markdown — tuned for density and clarity.
105
+ * explicit `[Hn]` headings, pipe-style tables with an optional header rule (aligned with {@link convertStructuredToMarkdown}
106
+ * semantics), compact list lines, configurable page markers, optional `[DOC]` metadata and `[WARNINGS]`.
107
+ * Not Markdown — denser and tag-oriented for models.
108
+ *
109
+ * Pairs with {@link splitStructuredIntoChunks}: per-chunk `text` uses the same formatter family (slice options omit
110
+ * document-level noise). Use {@link renderLlmText} as an alias for full-document export.
83
111
  */
84
112
  declare function convertStructuredToLlmText(result: StructuredDocumentResult, options?: ConvertStructuredToLlmTextOptions): string;
85
113
  /**
@@ -97,12 +125,17 @@ interface StructuredChunk {
97
125
  readonly markdown?: string;
98
126
  /** Breadcrumb of heading texts in scope for this chunk (best-effort). */
99
127
  readonly headingPath?: readonly string[];
100
- /** Smallest `pageIndex` among blocks in this chunk, when any. */
128
+ /** Smallest `pageIndex` among blocks in this chunk, when any (0-based, same as blocks). */
101
129
  readonly pageIndex?: number;
102
130
  /** Largest `pageIndex` among blocks in this chunk, when any. */
103
131
  readonly pageEndIndex?: number;
132
+ /**
133
+ * Human-facing page span for UI or citations (1-based). E.g. `"3"` or `"2–4"`.
134
+ * Set when {@link SplitStructuredIntoChunksOptions.includePageSpanLabel} is true and page indices exist.
135
+ */
136
+ readonly pageSpanLabel?: string;
104
137
  }
105
- /** Options for {@link splitStructuredIntoChunks}. */
138
+ /** Options for {@link splitStructuredIntoChunks} and {@link extractStructuredChunks}. */
106
139
  interface SplitStructuredIntoChunksOptions {
107
140
  /** Soft maximum characters for `text` per chunk (default `4000`). Tables may exceed this when {@link preserveTables} is true. */
108
141
  readonly maxChars?: number;
@@ -118,14 +151,26 @@ interface SplitStructuredIntoChunksOptions {
118
151
  readonly preserveTables?: boolean;
119
152
  /** When true (default), fill {@link StructuredChunk.markdown} using {@link convertStructuredToMarkdown} per slice. */
120
153
  readonly includeMarkdown?: boolean;
154
+ /**
155
+ * When true (default), set {@link StructuredChunk.pageSpanLabel} from {@link StructuredChunk.pageIndex} /
156
+ * {@link StructuredChunk.pageEndIndex} (1-based for display).
157
+ */
158
+ readonly includePageSpanLabel?: boolean;
121
159
  }
122
160
  /**
123
- * Splits a {@link StructuredDocumentResult} into ordered chunks for RAG / chat.
161
+ * Splits a {@link StructuredDocumentResult} into ordered chunks for RAG / chat / hybrid Markdown+text pipelines.
124
162
  *
125
- * First version: one unit per block, greedy packing with soft `maxChar` limits, optional heading-aligned
126
- * cuts, and atomic tables. Intended to refine later (finer splits, row-level tables, token limits).
163
+ * - **Headings:** optional hard cuts before each heading when {@link SplitStructuredIntoChunksOptions.preferHeadings} is true.
164
+ * - **Tables:** kept whole when {@link SplitStructuredIntoChunksOptions.preserveTables} is true (may exceed `maxChars`).
165
+ * - **Pages:** {@link StructuredChunk.pageIndex}, `pageEndIndex`, and optional {@link StructuredChunk.pageSpanLabel} (1-based).
166
+ * - **Dual serialization:** `text` uses {@link convertStructuredToLlmText}; `markdown` uses {@link convertStructuredToMarkdown}
167
+ * when {@link SplitStructuredIntoChunksOptions.includeMarkdown} is true — same block semantics as full-document export.
127
168
  */
128
169
  declare function splitStructuredIntoChunks(result: StructuredDocumentResult, options?: SplitStructuredIntoChunksOptions): StructuredChunk[];
170
+ /**
171
+ * Alias of {@link splitStructuredIntoChunks} — same hybrid Markdown + LLM-text chunking for structured results.
172
+ */
173
+ declare const extractStructuredChunks: typeof splitStructuredIntoChunks;
129
174
 
130
175
  /** Options for {@link renderMarkdown} (same as {@link ConvertStructuredToMarkdownOptions}). */
131
176
  type RenderMarkdownOptions = ConvertStructuredToMarkdownOptions;
@@ -136,9 +181,16 @@ type RenderLlmTextOptions = ConvertStructuredToLlmTextOptions;
136
181
  */
137
182
  declare function renderMarkdown(result: StructuredDocumentResult, options?: RenderMarkdownOptions): string;
138
183
  /**
139
- * Ergonomic alias for {@link convertStructuredToLlmText}: full document → plain text for LLM prompts.
184
+ * Ergonomic alias for {@link convertStructuredToLlmText}: full document → tagged plain text for LLM / RAG / embeddings.
185
+ * Semantics align with {@link convertStructuredToMarkdown} (headings, tables, page flow); chunks from
186
+ * {@link splitStructuredIntoChunks} use the same formatter with slice-scoped options for per-segment `text`.
140
187
  */
141
188
  declare function renderLlmText(result: StructuredDocumentResult, options?: RenderLlmTextOptions): string;
189
+ /**
190
+ * Same as {@link renderLlmText}: {@link StructuredDocumentResult} → tagged plain text for prompts / RAG.
191
+ * Use {@link extractMarkdown} when you have raw bytes or a path (with an optional `structuredFallback`); this entry point is **structured-only**. Its name mirrors the facade-level `extractLlmContent`, which runs after the facade has resolved a document.
192
+ */
193
+ declare function extractLlmContent(result: StructuredDocumentResult, options?: RenderLlmTextOptions): string;
142
194
  /** One Markdown slice aligned with chunking (headings / size limits). */
143
195
  interface MarkdownSection {
144
196
  readonly index: number;
@@ -146,6 +198,8 @@ interface MarkdownSection {
146
198
  readonly headingPath?: readonly string[];
147
199
  readonly pageIndex?: number;
148
200
  readonly pageEndIndex?: number;
201
+ /** 1-based page span label from {@link StructuredChunk.pageSpanLabel} when present. */
202
+ readonly pageSpanLabel?: string;
149
203
  /** Plain-text slice for the same block span (optional embedding / preview). */
150
204
  readonly text?: string;
151
205
  }
@@ -159,4 +213,247 @@ type RenderMarkdownSectionsOptions = SplitStructuredIntoChunksOptions;
159
213
  */
160
214
  declare function renderMarkdownSections(result: StructuredDocumentResult, options?: RenderMarkdownSectionsOptions): MarkdownSection[];
161
215
 
162
- export { type ConvertStructuredToLlmTextOptions, type ConvertStructuredToMarkdownOptions, type MarkdownSection, type RenderLlmTextOptions, type RenderMarkdownOptions, type RenderMarkdownSectionsOptions, type SplitStructuredIntoChunksOptions, type StructuredChunk, type StructuredToLlmTextOptions, type StructuredToMarkdownOptions, convertStructuredToLlmText, convertStructuredToMarkdown, renderLlmText, renderMarkdown, renderMarkdownSections, splitStructuredIntoChunks, structuredDocumentToLlmText, structuredDocumentToMarkdown };
216
+ /**
217
+ * Binary `.docx` payload accepted by {@link convertDocxToMarkdown}.
218
+ */
219
+ type DocxToMarkdownInput = Buffer | Uint8Array | ArrayBuffer;
220
+ /**
221
+ * Options forwarded to `mammoth.convertToHtml` (second argument).
222
+ * Mirrors mammoth’s `Options` shape so consumers are not forced to depend on mammoth types at compile time.
223
+ */
224
+ interface MammothConvertToHtmlOptions {
225
+ readonly styleMap?: string | string[];
226
+ readonly includeEmbeddedStyleMap?: boolean;
227
+ readonly includeDefaultStyleMap?: boolean;
228
+ readonly convertImage?: unknown;
229
+ readonly ignoreEmptyParagraphs?: boolean;
230
+ readonly idPrefix?: string;
231
+ readonly externalFileAccess?: boolean;
232
+ readonly transformDocument?: (element: unknown) => unknown;
233
+ }
234
+ /**
235
+ * Optional Turndown constructor options (`headingStyle`, `bulletListMarker`, …).
236
+ * `turndown` ships without TypeScript types; keep this loose for consumers.
237
+ */
238
+ type TurndownServiceOptions = Record<string, unknown>;
239
+ /**
240
+ * Options for {@link convertDocxToMarkdown} (and {@link convertDocxBufferToMarkdown}, which delegates here).
241
+ *
242
+ * Semantic toggles apply on top of Mammoth HTML → Turndown. For pixel-perfect Word layout, use another path;
243
+ * this pipeline targets headings, lists, tables (GFM), and readable images for LLMs.
244
+ */
245
+ interface ConvertDocxToMarkdownOptions {
246
+ /**
247
+ * Keep tables as GFM pipe tables where Mammoth emits `<table>` (uses `turndown-plugin-gfm`).
248
+ * @default true
249
+ */
250
+ readonly includeTables?: boolean;
251
+ /**
252
+ * Inline images as `data:` URIs in HTML before Turndown (via Mammoth).
253
+ * When `false`, `<img>` nodes are stripped from HTML (no figure placeholders in Markdown).
254
+ * @default true
255
+ */
256
+ readonly includeImages?: boolean;
257
+ /**
258
+ * Emit visible page separators for Word page breaks (`br[type=page]` → `<hr class="page-break">` → `---` in Markdown).
259
+ * @default true
260
+ */
261
+ readonly includePageBreaks?: boolean;
262
+ /**
263
+ * Tighter Markdown: fewer blank lines and trimmed trailing spaces on lines.
264
+ * @default false
265
+ */
266
+ readonly compactMode?: boolean;
267
+ /**
268
+ * When set together with {@link resolveStructured}, if the direct Markdown length (trimmed) is **below** this
269
+ * threshold, the structured fallback runs. Omit to only fall back on errors or completely empty output.
270
+ */
271
+ readonly minMarkdownLength?: number;
272
+ /** Extra mammoth options merged after DocMind-built defaults (e.g. custom `styleMap` entries). */
273
+ readonly mammoth?: MammothConvertToHtmlOptions;
274
+ /** Extra Turndown options merged after DocMind defaults. */
275
+ readonly turndown?: TurndownServiceOptions;
276
+ /**
277
+ * When direct Mammoth → Turndown fails, returns empty/short output (per {@link minMarkdownLength}), or you need
278
+ * block-level structure: provide a supplier that returns {@link StructuredDocumentResult} (e.g. from `@dragon708/docmind-docx`).
279
+ * The package does not call other monorepo packages by itself.
280
+ */
281
+ readonly resolveStructured?: () => Promise<StructuredDocumentResult>;
282
+ /** Passed to {@link convertStructuredToMarkdown} when {@link resolveStructured} is used. */
283
+ readonly structuredMarkdown?: ConvertStructuredToMarkdownOptions;
284
+ }
285
+ /** Normalized mammoth diagnostics (warnings/errors as strings). */
286
+ interface DocxMarkdownMessage {
287
+ readonly type: "warning" | "error" | string;
288
+ readonly message: string;
289
+ }
290
+ /** @deprecated Prefer {@link ConvertDocxToMarkdownResult} from {@link convertDocxToMarkdown}. */
291
+ interface DocxMarkdownResult {
292
+ readonly markdown: string;
293
+ readonly messages: readonly DocxMarkdownMessage[];
294
+ }
295
+ /** Which pipeline produced {@link ConvertDocxToMarkdownResult.markdown}. */
296
+ type DocxToMarkdownSource = "mammoth-turndown" | "structured-fallback";
297
+ interface ConvertDocxToMarkdownResult {
298
+ readonly markdown: string;
299
+ readonly source: DocxToMarkdownSource;
300
+ readonly messages: readonly DocxMarkdownMessage[];
301
+ /** Set when {@link source} is `structured-fallback`. */
302
+ readonly fallbackReason?: "error" | "empty" | "short";
303
+ }
304
+ /**
305
+ * **Node only.** Primary API: `.docx` bytes → semantic HTML (Mammoth) → LLM-friendly Markdown (Turndown + optional GFM).
306
+ *
307
+ * Optional peers: `mammoth`, `turndown`. Runtime dependency: `turndown-plugin-gfm` (declared on this package) when
308
+ * {@link ConvertDocxToMarkdownOptions.includeTables} is true.
309
+ *
310
+ * @see {@link convertDocxBufferToMarkdown} for a thin wrapper that only returns `markdown` and `messages`.
311
+ */
312
+ declare function convertDocxToMarkdown(input: DocxToMarkdownInput, options?: ConvertDocxToMarkdownOptions): Promise<ConvertDocxToMarkdownResult>;
313
+ /**
314
+ * **Node only.** Same as {@link convertDocxToMarkdown}, but returns only `markdown` and Mammoth `messages` for backward compatibility.
315
+ */
316
+ declare function convertDocxBufferToMarkdown(input: DocxToMarkdownInput, options?: ConvertDocxToMarkdownOptions): Promise<DocxMarkdownResult>;
317
+
318
+ /**
319
+ * Options forwarded to `@opendataloader/pdf` `convert()`, except `format` and `toStdout` (set internally).
320
+ * Shaped to match `ConvertOptions` from `@opendataloader/pdf` v2.x without a static type import.
321
+ */
322
+ interface OpenDataLoaderPdfConvertOptions {
323
+ outputDir?: string;
324
+ password?: string;
325
+ quiet?: boolean;
326
+ contentSafetyOff?: string | string[];
327
+ sanitize?: boolean;
328
+ keepLineBreaks?: boolean;
329
+ replaceInvalidChars?: string;
330
+ useStructTree?: boolean;
331
+ tableMethod?: string;
332
+ readingOrder?: string;
333
+ markdownPageSeparator?: string;
334
+ textPageSeparator?: string;
335
+ htmlPageSeparator?: string;
336
+ imageOutput?: string;
337
+ imageFormat?: string;
338
+ imageDir?: string;
339
+ pages?: string;
340
+ includeHeaderFooter?: boolean;
341
+ detectStrikethrough?: boolean;
342
+ hybrid?: string;
343
+ hybridMode?: string;
344
+ hybridUrl?: string;
345
+ hybridTimeout?: string;
346
+ hybridFallback?: boolean;
347
+ }
348
+ /**
349
+ * Options for {@link convertPdfToMarkdown}. OpenDataLoader fields are passed through; structured fields are local.
350
+ */
351
+ type ConvertPdfToMarkdownOptions = OpenDataLoaderPdfConvertOptions & {
352
+ /**
353
+ * When the OpenDataLoader path fails, returns empty output, or `@opendataloader/pdf` cannot load,
354
+ * call this to obtain {@link StructuredDocumentResult} (e.g. from another extractor) and serialize with
355
+ * {@link convertStructuredToMarkdown}. Does not import other DocMind packages.
356
+ */
357
+ readonly resolveStructured?: () => Promise<StructuredDocumentResult>;
358
+ /** Options for {@link convertStructuredToMarkdown} when using {@link resolveStructured}. */
359
+ readonly structuredMarkdown?: ConvertStructuredToMarkdownOptions;
360
+ /**
361
+ * Normalize whitespace in the final Markdown (trim, collapse 3+ newlines to 2).
362
+ * @default true
363
+ */
364
+ readonly cleanMarkdown?: boolean;
365
+ };
366
+ /** Input for {@link convertPdfToMarkdown}: filesystem path (Node) or PDF bytes. */
367
+ type PdfToMarkdownInput = string | Buffer | Uint8Array | ArrayBuffer;
368
+ /** Which pipeline produced {@link ConvertPdfToMarkdownResult.markdown}. */
369
+ type PdfToMarkdownSource = "opendataloader" | "structured-fallback" | "unsupported-runtime";
370
+ type PdfToMarkdownFallbackReason = "unsupported-runtime" | "error" | "empty" | "module-not-found";
371
+ interface ConvertPdfToMarkdownResult {
372
+ readonly markdown: string;
373
+ /** Human-readable issues (runtime, missing module, Java/PDF errors, empty output, fallback errors). */
374
+ readonly warnings: readonly string[];
375
+ readonly source: PdfToMarkdownSource;
376
+ readonly fallbackReason?: PdfToMarkdownFallbackReason;
377
+ }
378
+ /** @deprecated Prefer {@link ConvertPdfToMarkdownResult} from {@link convertPdfToMarkdown}. */
379
+ interface PdfMarkdownResult {
380
+ readonly markdown: string;
381
+ }
382
+ /**
383
+ * Primary API: PDF path or bytes → Markdown via `@opendataloader/pdf` on Node, with clear warnings and optional
384
+ * structured fallback. In non-Node runtimes returns an empty `markdown` and {@link PdfToMarkdownSource} `unsupported-runtime`
385
+ * without loading `@opendataloader/pdf`.
386
+ */
387
+ declare function convertPdfToMarkdown(input: PdfToMarkdownInput, options?: ConvertPdfToMarkdownOptions): Promise<ConvertPdfToMarkdownResult>;
388
+ /**
389
+ * **Node only.** PDF file path → Markdown via `@opendataloader/pdf` (`format: "markdown"`, `toStdout: true`).
390
+ *
391
+ * Throws if not Node, if conversion yields no Markdown (and no structured fallback), or on OpenDataLoader errors
392
+ * when no fallback is configured — same contract as before {@link convertPdfToMarkdown} existed.
393
+ */
394
+ declare function convertPdfPathToMarkdown(inputPath: string, options?: ConvertPdfToMarkdownOptions): Promise<PdfMarkdownResult>;
395
+ /**
396
+ * **Node only.** Same pipeline as {@link convertPdfPathToMarkdown}, but writes bytes to a temporary `.pdf`
397
+ * under the system temp directory (OpenDataLoader expects a file path).
398
+ */
399
+ declare function convertPdfBufferToMarkdown(input: Buffer | Uint8Array | ArrayBuffer, options?: ConvertPdfToMarkdownOptions): Promise<PdfMarkdownResult>;
400
+
401
+ /**
402
+ * Binary file payload for {@link extractMarkdown} when you have bytes (and optional name/MIME hints).
403
+ */
404
+ interface ExtractMarkdownFileInput {
405
+ readonly data: Buffer | Uint8Array | ArrayBuffer;
406
+ readonly filename?: string;
407
+ readonly mimeType?: string;
408
+ }
409
+ /**
410
+ * Node: read a PDF/DOCX from disk. In non-Node runtimes the path is not read and a warning is emitted; Markdown is then produced only if {@link ExtractMarkdownOptions.structuredFallback} is set.
411
+ */
412
+ interface ExtractMarkdownPathInput {
413
+ readonly path: string;
414
+ readonly filename?: string;
415
+ readonly mimeType?: string;
416
+ }
417
+ type ExtractMarkdownInput = StructuredDocumentResult | ExtractMarkdownFileInput | ExtractMarkdownPathInput;
418
+ /**
419
+ * Options for {@link extractMarkdown}.
420
+ *
421
+ * Top-level fields match {@link ConvertStructuredToMarkdownOptions} so passing the same object you would pass to
422
+ * {@link convertStructuredToMarkdown} remains valid when `input` is a {@link StructuredDocumentResult}.
423
+ * Additional fields configure DOCX/PDF branches and cross-strategy fallback.
424
+ */
425
+ type ExtractMarkdownOptions = ConvertStructuredToMarkdownOptions & {
426
+ /**
427
+ * When a specialized binary route fails or cannot run (e.g. DOCX in the browser), this structured snapshot
428
+ * is passed to {@link convertStructuredToMarkdown} if nothing else produced Markdown.
429
+ */
430
+ readonly structuredFallback?: StructuredDocumentResult;
431
+ /** Overrides merged into {@link convertDocxToMarkdown} when the input is identified as `.docx`. */
432
+ readonly docx?: ConvertDocxToMarkdownOptions;
433
+ /** Overrides merged into {@link convertPdfToMarkdown} when the input is identified as `.pdf`. */
434
+ readonly pdf?: ConvertPdfToMarkdownOptions;
435
+ };
436
+ /** Which branch produced {@link ExtractMarkdownResult.markdown}. */
437
+ type ExtractMarkdownStrategy = "structured" | "docx-mammoth" | "docx-structured-fallback" | "pdf-opendataloader" | "pdf-structured-fallback" | "pdf-unsupported-runtime" | "docx-requires-node" | "path-requires-node" | "binary-unidentified" | "binary-unidentified-structured-fallback";
438
+ interface ExtractMarkdownResult {
439
+ readonly markdown: string;
440
+ /** Merged pipeline warnings (conversion, runtime, and optional {@link StructuredDocumentResult.warnings}). */
441
+ readonly warnings: readonly string[];
442
+ readonly strategy: ExtractMarkdownStrategy;
443
+ }
444
+ /** Type guard: file-like `{ data: … }` input for {@link extractMarkdown}. */
445
+ declare function isExtractMarkdownFileInput(value: unknown): value is ExtractMarkdownFileInput;
446
+ type DetectedBinaryFormat = "docx" | "pdf" | "unknown";
447
+ /** Detect PDF / OOXML zip (DOCX) from magic bytes and optional filename / MIME. */
448
+ declare function detectBinaryFormat(data: Buffer | Uint8Array | ArrayBuffer, filename?: string, mimeType?: string): DetectedBinaryFormat;
449
+ /**
450
+ * Produces Markdown from a {@link StructuredDocumentResult}, raw file bytes, or a filesystem `path` (Node),
451
+ * picking DOCX / PDF specialized pipelines when possible and falling back to {@link convertStructuredToMarkdown}.
452
+ *
453
+ * - **Structured input** — always uses the structured serializer (image/OCR/text/PDF/DOCX blocks already normalized).
454
+ * - **DOCX bytes / path** — {@link convertDocxToMarkdown} on Node; otherwise warns and uses {@link ExtractMarkdownOptions.structuredFallback} if provided.
455
+ * - **PDF bytes / path** — {@link convertPdfToMarkdown} (`@opendataloader/pdf` on Node when Java is available); in non-Node runtimes returns empty Markdown with warnings unless {@link ExtractMarkdownOptions.structuredFallback} supplies content.
456
+ */
457
+ declare function extractMarkdown(input: ExtractMarkdownInput, options?: ExtractMarkdownOptions): Promise<ExtractMarkdownResult>;
458
+
459
+ export { type ConvertDocxToMarkdownOptions, type ConvertDocxToMarkdownResult, type ConvertPdfToMarkdownOptions, type ConvertPdfToMarkdownResult, type ConvertStructuredToLlmTextOptions, type ConvertStructuredToMarkdownOptions, type DetectedBinaryFormat, type DocxMarkdownMessage, type DocxMarkdownResult, type DocxToMarkdownInput, type DocxToMarkdownSource, type ExtractMarkdownFileInput, type ExtractMarkdownInput, type ExtractMarkdownOptions, type ExtractMarkdownPathInput, type ExtractMarkdownResult, type ExtractMarkdownStrategy, type MammothConvertToHtmlOptions, type MarkdownSection, type OpenDataLoaderPdfConvertOptions, type PdfMarkdownResult, type PdfToMarkdownFallbackReason, type PdfToMarkdownInput, type PdfToMarkdownSource, type RenderLlmTextOptions, type RenderMarkdownOptions, type RenderMarkdownSectionsOptions, type SplitStructuredIntoChunksOptions, type StructuredChunk, type StructuredToLlmTextOptions, type StructuredToMarkdownOptions, type TurndownServiceOptions, convertDocxBufferToMarkdown, convertDocxToMarkdown, convertPdfBufferToMarkdown, convertPdfPathToMarkdown, convertPdfToMarkdown, convertStructuredToLlmText, convertStructuredToMarkdown, detectBinaryFormat, extractLlmContent, extractMarkdown, extractStructuredChunks, isExtractMarkdownFileInput, renderLlmText, renderMarkdown, renderMarkdownSections, splitStructuredIntoChunks, structuredDocumentToLlmText, structuredDocumentToMarkdown };