@dragon708/docmind-markdown 1.2.6 → 1.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -423,6 +423,102 @@ declare function convertPdfPathToMarkdown(inputPath: string, options?: ConvertPd
423
423
  */
424
424
  declare function convertPdfBufferToMarkdown(input: Buffer | Uint8Array | ArrayBuffer, options?: ConvertPdfToMarkdownOptions): Promise<PdfMarkdownResult>;
425
425
 
426
+ interface CognipeerFileMarkdownOptions {
427
+ readonly resolveStructured?: () => Promise<StructuredDocumentResult>;
428
+ readonly structuredMarkdown?: ConvertStructuredToMarkdownOptions;
429
+ /** @default true */
430
+ readonly cleanMarkdown?: boolean;
431
+ readonly fileName?: string;
432
+ readonly forceExtension?: string;
433
+ readonly url?: string;
434
+ }
435
+ type CognipeerFileMarkdownSource = "cognipeer" | "structured-fallback" | "unsupported-runtime" | "cognipeer-unavailable" | "cognipeer-failed";
436
+ type CognipeerFileFallbackReason = "unsupported-runtime" | "error" | "empty" | "module-not-found";
437
+ interface CognipeerFileMarkdownResult {
438
+ readonly markdown: string;
439
+ readonly warnings: readonly string[];
440
+ readonly source: CognipeerFileMarkdownSource;
441
+ readonly fallbackReason?: CognipeerFileFallbackReason;
442
+ }
443
+ type CognipeerFileInput = string | Buffer | Uint8Array | ArrayBuffer;
444
+
445
+ /**
446
+ * - **Path** (Node): filesystem path to `.html` / `.htm`
447
+ * - **Markup string**: HTML source when {@link ConvertHtmlToMarkdownOptions.inputMode} is `"html"`, or in `"auto"` when the string is not an existing file and {@link looksLikeHtmlString} is true
448
+ * - **Binary**: `Buffer` / `Uint8Array` / `ArrayBuffer` (UTF-8 or document bytes)
449
+ */
450
+ type HtmlToMarkdownInput = string | Buffer | Uint8Array | ArrayBuffer;
451
+ type HtmlStringInputMode = "auto" | "path" | "html";
452
+ type ConvertHtmlToMarkdownOptions = CognipeerFileMarkdownOptions & {
453
+ /**
454
+ * How to interpret a `string` input.
455
+ * - `auto` (default): if the string is an existing file path on Node, use it as a path; else if it looks like HTML, treat as markup; otherwise pass through as a path for Cognipeer (may error).
456
+ * - `path`: always a filesystem path.
457
+ * - `html`: always HTML source (written to a temp `.html` file for Cognipeer).
458
+ */
459
+ readonly inputMode?: HtmlStringInputMode;
460
+ };
461
+ type ConvertHtmlToMarkdownResult = CognipeerFileMarkdownResult;
462
+ type HtmlToMarkdownSource = CognipeerFileMarkdownSource;
463
+ type HtmlToMarkdownFallbackReason = CognipeerFileFallbackReason;
464
+ /** Heuristic: treat string as HTML document or fragment (not a path). */
465
+ declare function looksLikeHtmlString(s: string): boolean;
466
+ /**
467
+ * HTML → Markdown via `@cognipeer/to-markdown` on Node (Turndown-style semantics: headings, lists, links, tables, fenced code).
468
+ * Optional structured fallback when {@link CognipeerFileMarkdownOptions.resolveStructured} is set or wired from {@link extractMarkdown}.
469
+ */
470
+ declare function convertHtmlToMarkdown(input: HtmlToMarkdownInput, options?: ConvertHtmlToMarkdownOptions): Promise<ConvertHtmlToMarkdownResult>;
471
+
472
+ type CsvToMarkdownInput = string | Buffer | Uint8Array | ArrayBuffer;
473
+ type CsvStringInputMode = "auto" | "path" | "content";
474
+ type ConvertCsvToMarkdownOptions = CognipeerFileMarkdownOptions & {
475
+ /**
476
+ * How to interpret a `string` input.
477
+ * - `auto` (default): existing file path on Node → read file; else if the string looks like CSV text (comma + newline) → treat as body; otherwise pass as path to Cognipeer.
478
+ * - `path`: filesystem path.
479
+ * - `content`: raw CSV text.
480
+ */
481
+ readonly inputMode?: CsvStringInputMode;
482
+ /**
483
+ * When `false`, prepends synthetic `Column 1,…` so the first row of your file becomes table data (Cognipeer always builds a header row).
484
+ * @default true
485
+ */
486
+ readonly includeHeader?: boolean;
487
+ /** Collapse extra blank lines and trim trailing spaces on each line. @default false */
488
+ readonly compactMode?: boolean;
489
+ /** Maximum number of **data** rows after the header row (or after the synthetic header when `includeHeader` is false). */
490
+ readonly maxRows?: number;
491
+ };
492
+ type ConvertCsvToMarkdownResult = CognipeerFileMarkdownResult;
493
+ type CsvToMarkdownSource = CognipeerFileMarkdownSource;
494
+ type CsvToMarkdownFallbackReason = CognipeerFileFallbackReason;
495
+ /**
496
+ * CSV → Markdown via `@cognipeer/to-markdown` (GFM-style pipe table). Optional row cap and header semantics; optional structured fallback.
497
+ */
498
+ declare function convertCsvToMarkdown(input: CsvToMarkdownInput, options?: ConvertCsvToMarkdownOptions): Promise<ConvertCsvToMarkdownResult>;
499
+
500
+ type ConvertSpreadsheetToMarkdownOptions = CognipeerFileMarkdownOptions & {
501
+ /**
502
+ * When `false`, removes `## sheetName` lines Cognipeer emits before each sheet table.
503
+ * @default true
504
+ */
505
+ readonly includeSheetNames?: boolean;
506
+ /** Collapse extra blank lines and trim trailing spaces on each line. @default false */
507
+ readonly compactMode?: boolean;
508
+ /** Max **data** rows per sheet table (after the header row), post-processed on Cognipeer output. */
509
+ readonly maxRowsPerSheet?: number;
510
+ };
511
+ type ConvertSpreadsheetToMarkdownResult = CognipeerFileMarkdownResult;
512
+ type SpreadsheetToMarkdownInput = CognipeerFileInput;
513
+ type SpreadsheetToMarkdownSource = CognipeerFileMarkdownSource;
514
+ type SpreadsheetToMarkdownFallbackReason = CognipeerFileFallbackReason;
515
+ /**
516
+ * Excel (`.xlsx` / `.xls`) path or bytes → Markdown via `@cognipeer/to-markdown` on Node.
517
+ * Cognipeer emits one `## sheetName` section per worksheet; use {@link ConvertSpreadsheetToMarkdownOptions} to tune presentation.
518
+ * For buffer inputs, set `forceExtension` / `fileName` when the default `.xlsx` temp name is wrong.
519
+ */
520
+ declare function convertSpreadsheetToMarkdown(input: SpreadsheetToMarkdownInput, options?: ConvertSpreadsheetToMarkdownOptions): Promise<ConvertSpreadsheetToMarkdownResult>;
521
+
426
522
  /**
427
523
  * Binary file payload for {@link extractMarkdown} when you have bytes (and optional name/MIME hints).
428
524
  */
@@ -445,7 +541,7 @@ type ExtractMarkdownInput = StructuredDocumentResult | ExtractMarkdownFileInput
445
541
  *
446
542
  * Top-level fields match {@link ConvertStructuredToMarkdownOptions} so passing the same object you would pass to
447
543
  * {@link convertStructuredToMarkdown} remains valid when `input` is a {@link StructuredDocumentResult}.
448
- * Additional fields configure DOCX/PDF branches and cross-strategy fallback.
544
+ * Additional fields configure the DOCX, PDF, HTML, CSV, and spreadsheet branches, as well as cross-strategy fallback.
449
545
  */
450
546
  type ExtractMarkdownOptions = ConvertStructuredToMarkdownOptions & {
451
547
  /**
@@ -457,15 +553,34 @@ type ExtractMarkdownOptions = ConvertStructuredToMarkdownOptions & {
457
553
  readonly docx?: ConvertDocxToMarkdownOptions;
458
554
  /** Overrides merged into {@link convertPdfToMarkdown} when the input is identified as `.pdf`. */
459
555
  readonly pdf?: ConvertPdfToMarkdownOptions;
556
+ /**
557
+ * Overrides merged into {@link convertHtmlToMarkdown} when the input is identified as HTML
558
+ * (`inputMode`, `resolveStructured`, …). File/byte inputs from {@link extractMarkdown} do not need `inputMode`.
559
+ */
560
+ readonly html?: ConvertHtmlToMarkdownOptions;
561
+ /**
562
+ * Overrides merged into {@link convertCsvToMarkdown} when the input is identified as CSV
563
+ * (`maxRows`, `includeHeader`, `compactMode`, `inputMode`, …).
564
+ */
565
+ readonly csv?: ConvertCsvToMarkdownOptions;
566
+ /**
567
+ * Overrides merged into {@link convertSpreadsheetToMarkdown} for `.xlsx` / `.xls`
568
+ * (`maxRowsPerSheet`, `includeSheetNames`, `compactMode`, …).
569
+ */
570
+ readonly spreadsheet?: ConvertSpreadsheetToMarkdownOptions;
460
571
  };
461
572
  /** @see {@link detectBinaryFormat} */
462
- type DetectedBinaryFormat = "docx" | "pdf" | "unknown";
573
+ type DetectedBinaryFormat = "docx" | "pdf" | "html" | "csv" | "spreadsheet" | "unknown";
463
574
  /** Which branch produced {@link ExtractMarkdownResult.markdown}. */
464
575
  type ExtractMarkdownStrategy = "structured" | "docx-mammoth" | "docx-structured-fallback" | "pdf-cognipeer-specialized"
465
576
  /** `@cognipeer/to-markdown` not loadable (missing package, bundler, etc.). */
466
577
  | "pdf-cognipeer-unavailable"
467
578
  /** Engine failed or returned empty Markdown; no structured fallback or fallback also failed. */
468
- | "pdf-cognipeer-failed" | "pdf-structured-fallback" | "pdf-unsupported-runtime" | "docx-requires-node" | "path-requires-node" | "binary-unidentified" | "binary-unidentified-structured-fallback";
579
+ | "pdf-cognipeer-failed" | "pdf-structured-fallback" | "pdf-unsupported-runtime" | "html-cognipeer-specialized" | "html-cognipeer-unavailable" | "html-cognipeer-failed" | "html-structured-fallback" | "html-unsupported-runtime" | "csv-cognipeer-specialized" | "csv-cognipeer-unavailable" | "csv-cognipeer-failed" | "csv-structured-fallback" | "csv-unsupported-runtime" | "spreadsheet-cognipeer-specialized" | "spreadsheet-cognipeer-unavailable" | "spreadsheet-cognipeer-failed" | "spreadsheet-structured-fallback" | "spreadsheet-unsupported-runtime" | "docx-requires-node" | "path-requires-node" | "binary-unidentified" | "binary-unidentified-structured-fallback";
580
+ /**
581
+ * When bytes are not a known office/HTML/CSV type, the MIME type or filename can still hint at image, text, audio, or video content for messaging.
582
+ */
583
+ type ExtractMarkdownMediaHint = "image" | "text" | "audio" | "video";
469
584
  /**
470
585
  * Format detection and fallback bookkeeping for file/path inputs to {@link extractMarkdown}.
471
586
  * Omitted when `input` is a {@link StructuredDocumentResult} (already structured).
@@ -473,9 +588,16 @@ type ExtractMarkdownStrategy = "structured" | "docx-mammoth" | "docx-structured-
473
588
  interface ExtractMarkdownRoutingInfo {
474
589
  readonly detectedFormat: DetectedBinaryFormat;
475
590
  /** Specialized binary pipeline invoked first on Node (`none` if not applicable). */
476
- readonly specializedPipeline: "docx" | "pdf" | "none";
591
+ readonly specializedPipeline: "docx" | "pdf" | "html" | "csv" | "spreadsheet" | "none";
477
592
  /** Final Markdown came from {@link ExtractMarkdownOptions.structuredFallback} (or the same snapshot via `resolveStructured`). */
478
593
  readonly usedStructuredFallback: boolean;
594
+ /** Set when {@link ExtractMarkdownRoutingInfo.detectedFormat} is `unknown` and MIME/name suggest a category. */
595
+ readonly mediaHint?: ExtractMarkdownMediaHint;
596
+ /**
597
+ * One-line trace for operators: strategy, format, pipeline, fallback flag, optional media hint.
598
+ * Prefix: `[docmind-markdown:extractMarkdown:routing]`.
599
+ */
600
+ readonly routingSummary: string;
479
601
  }
480
602
  interface ExtractMarkdownResult {
481
603
  readonly markdown: string;
@@ -490,19 +612,24 @@ interface ExtractMarkdownResult {
490
612
  }
491
613
  /** Type guard: file-like `{ data: … }` input for {@link extractMarkdown}. */
492
614
  declare function isExtractMarkdownFileInput(value: unknown): value is ExtractMarkdownFileInput;
493
- /** Detect PDF / OOXML zip (DOCX) from magic bytes and optional filename / MIME. */
615
+ /**
616
+ * Detect specialized binary/text formats from magic bytes and optional filename / MIME.
617
+ * OOXML ZIP containers are disambiguated so `.xlsx` is not treated as WordprocessingML.
618
+ */
494
619
  declare function detectBinaryFormat(data: Buffer | Uint8Array | ArrayBuffer, filename?: string, mimeType?: string): DetectedBinaryFormat;
495
620
  /**
496
621
  * Produces Markdown from a {@link StructuredDocumentResult}, raw file bytes, or a filesystem `path` (Node),
497
- * picking DOCX / PDF specialized pipelines when possible and falling back to {@link convertStructuredToMarkdown}.
622
+ * routing to specialized converters when possible and falling back to {@link convertStructuredToMarkdown}.
498
623
  *
499
- * - **Structured input** — always uses the structured serializer (image/OCR/text/PDF/DOCX blocks already normalized).
500
- * - **DOCX bytes / path** — {@link convertDocxToMarkdown} on Node; otherwise warns and uses {@link ExtractMarkdownOptions.structuredFallback} if provided.
501
- * - **PDF bytes / path** — {@link convertPdfToMarkdown} (`@cognipeer/to-markdown` on Node); in non-Node runtimes returns empty Markdown with warnings unless {@link ExtractMarkdownOptions.structuredFallback} supplies content.
624
+ * - **Structured input** — {@link convertStructuredToMarkdown} only (`strategy` `structured`).
625
+ * - **DOCX** — {@link convertDocxToMarkdown} on Node; otherwise warns and uses {@link ExtractMarkdownOptions.structuredFallback} if set.
626
+ * - **PDF** — {@link convertPdfToMarkdown} on Node (Cognipeer); non-Node / failures use `structuredFallback` when provided.
627
+ * - **HTML / CSV / Excel** — {@link convertHtmlToMarkdown}, {@link convertCsvToMarkdown}, {@link convertSpreadsheetToMarkdown} on Node (Cognipeer family); same fallback rules as PDF.
628
+ * - **Unknown bytes** (e.g. raw images, arbitrary binary) — no specialized route; use a {@link StructuredDocumentResult} (OCR/image adapters) or `structuredFallback`. {@link ExtractMarkdownRoutingInfo.mediaHint} and warnings call this out when MIME/filename suggest `image/*` or `text/*`.
502
629
  *
503
- * Any use of `structuredFallback` (or the same snapshot through `resolveStructured`) adds a tagged line in {@link ExtractMarkdownResult.warnings}
504
- * and sets {@link ExtractMarkdownRoutingInfo.usedStructuredFallback} when {@link ExtractMarkdownResult.routing} is present.
630
+ * Fallbacks are never silent: tagged lines appear in {@link ExtractMarkdownResult.warnings}, and {@link ExtractMarkdownRoutingInfo.routingSummary}
631
+ * records the final `strategy`, detected format, pipeline, and whether structured fallback was used.
505
632
  */
506
633
  declare function extractMarkdown(input: ExtractMarkdownInput, options?: ExtractMarkdownOptions): Promise<ExtractMarkdownResult>;
507
634
 
508
- export { type ConvertDocxToMarkdownOptions, type ConvertDocxToMarkdownResult, type ConvertPdfToMarkdownOptions, type ConvertPdfToMarkdownResult, type ConvertStructuredToLlmTextOptions, type ConvertStructuredToMarkdownOptions, type DetectedBinaryFormat, type DocxMarkdownMessage, type DocxMarkdownResult, type DocxToMarkdownInput, type DocxToMarkdownSource, type ExtractMarkdownFileInput, type ExtractMarkdownInput, type ExtractMarkdownOptions, type ExtractMarkdownPathInput, type ExtractMarkdownResult, type ExtractMarkdownRoutingInfo, type ExtractMarkdownStrategy, type MammothConvertToHtmlOptions, type MarkdownSection, type OpenDataLoaderPdfConvertOptions, type PdfMarkdownResult, type PdfToMarkdownFallbackReason, type PdfToMarkdownInput, type PdfToMarkdownSource, type RenderLlmTextOptions, type RenderMarkdownOptions, type RenderMarkdownSectionsOptions, type SplitStructuredIntoChunksOptions, type StructuredChunk, type StructuredToLlmTextOptions, type StructuredToMarkdownOptions, type TurndownServiceOptions, convertDocxBufferToMarkdown, convertDocxToMarkdown, convertPdfBufferToMarkdown, convertPdfPathToMarkdown, convertPdfToMarkdown, convertStructuredToLlmText, convertStructuredToMarkdown, detectBinaryFormat, extractLlmContent, extractMarkdown, extractStructuredChunks, isExtractMarkdownFileInput, renderLlmText, renderMarkdown, renderMarkdownSections, splitStructuredIntoChunks, structuredDocumentToLlmText, structuredDocumentToMarkdown };
635
+ export { type ConvertCsvToMarkdownOptions, type ConvertCsvToMarkdownResult, type ConvertDocxToMarkdownOptions, type ConvertDocxToMarkdownResult, type ConvertHtmlToMarkdownOptions, type ConvertHtmlToMarkdownResult, type ConvertPdfToMarkdownOptions, type ConvertPdfToMarkdownResult, type ConvertSpreadsheetToMarkdownOptions, type ConvertSpreadsheetToMarkdownResult, type ConvertStructuredToLlmTextOptions, type ConvertStructuredToMarkdownOptions, type CsvStringInputMode, type CsvToMarkdownFallbackReason, type CsvToMarkdownInput, type CsvToMarkdownSource, type DetectedBinaryFormat, type DocxMarkdownMessage, type DocxMarkdownResult, type DocxToMarkdownInput, type DocxToMarkdownSource, type ExtractMarkdownFileInput, type ExtractMarkdownInput, type ExtractMarkdownMediaHint, type ExtractMarkdownOptions, type ExtractMarkdownPathInput, type ExtractMarkdownResult, type ExtractMarkdownRoutingInfo, type ExtractMarkdownStrategy, type HtmlStringInputMode, type HtmlToMarkdownFallbackReason, type HtmlToMarkdownInput, type HtmlToMarkdownSource, type MammothConvertToHtmlOptions, type MarkdownSection, type OpenDataLoaderPdfConvertOptions, type PdfMarkdownResult, type PdfToMarkdownFallbackReason, type PdfToMarkdownInput, type PdfToMarkdownSource, type RenderLlmTextOptions, type RenderMarkdownOptions, type RenderMarkdownSectionsOptions, type SplitStructuredIntoChunksOptions, type SpreadsheetToMarkdownFallbackReason, type SpreadsheetToMarkdownInput, type SpreadsheetToMarkdownSource, type StructuredChunk, type StructuredToLlmTextOptions, type StructuredToMarkdownOptions, type TurndownServiceOptions, convertCsvToMarkdown, convertDocxBufferToMarkdown, convertDocxToMarkdown, convertHtmlToMarkdown, convertPdfBufferToMarkdown, convertPdfPathToMarkdown, convertPdfToMarkdown, convertSpreadsheetToMarkdown, convertStructuredToLlmText, convertStructuredToMarkdown, detectBinaryFormat, extractLlmContent, extractMarkdown, extractStructuredChunks, isExtractMarkdownFileInput, looksLikeHtmlString, 
renderLlmText, renderMarkdown, renderMarkdownSections, splitStructuredIntoChunks, structuredDocumentToLlmText, structuredDocumentToMarkdown };