npm - @dragon708/docmind-markdown - Versions diffs - 1.2.6 → 1.2.8 - Mend

@dragon708/docmind-markdown 1.2.6 → 1.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11)

package/dist/index.d.ts +139 -12
package/dist/index.js +781 -48
package/node_modules/turndown-plugin-gfm/LICENSE +21 -0
package/node_modules/turndown-plugin-gfm/README.md +50 -0
package/node_modules/turndown-plugin-gfm/dist/turndown-plugin-gfm.js +165 -0
package/node_modules/turndown-plugin-gfm/lib/turndown-plugin-gfm.browser.cjs.js +162 -0
package/node_modules/turndown-plugin-gfm/lib/turndown-plugin-gfm.browser.es.js +154 -0
package/node_modules/turndown-plugin-gfm/lib/turndown-plugin-gfm.cjs.js +162 -0
package/node_modules/turndown-plugin-gfm/lib/turndown-plugin-gfm.es.js +154 -0
package/node_modules/turndown-plugin-gfm/package.json +43 -0
package/package.json +5 -1

package/dist/index.d.ts CHANGED

@@ -423,6 +423,102 @@ declare function convertPdfPathToMarkdown(inputPath: string, options?: ConvertPd
  */
 declare function convertPdfBufferToMarkdown(input: Buffer | Uint8Array | ArrayBuffer, options?: ConvertPdfToMarkdownOptions): Promise<PdfMarkdownResult>;
+interface CognipeerFileMarkdownOptions {
+    readonly resolveStructured?: () => Promise<StructuredDocumentResult>;
+    readonly structuredMarkdown?: ConvertStructuredToMarkdownOptions;
+    /** @default true */
+    readonly cleanMarkdown?: boolean;
+    readonly fileName?: string;
+    readonly forceExtension?: string;
+    readonly url?: string;
+}
+type CognipeerFileMarkdownSource = "cognipeer" | "structured-fallback" | "unsupported-runtime" | "cognipeer-unavailable" | "cognipeer-failed";
+type CognipeerFileFallbackReason = "unsupported-runtime" | "error" | "empty" | "module-not-found";
+interface CognipeerFileMarkdownResult {
+    readonly markdown: string;
+    readonly warnings: readonly string[];
+    readonly source: CognipeerFileMarkdownSource;
+    readonly fallbackReason?: CognipeerFileFallbackReason;
+}
+type CognipeerFileInput = string | Buffer | Uint8Array | ArrayBuffer;
+/**
+ * - **Path** (Node): filesystem path to `.html` / `.htm`
+ * - **Markup string**: HTML source when {@link ConvertHtmlToMarkdownOptions.inputMode} is `"html"`, or in `"auto"` when the string is not an existing file and {@link looksLikeHtmlString} is true
+ * - **Binary**: `Buffer` / `Uint8Array` / `ArrayBuffer` (UTF-8 or document bytes)
+ */
+type HtmlToMarkdownInput = string | Buffer | Uint8Array | ArrayBuffer;
+type HtmlStringInputMode = "auto" | "path" | "html";
+type ConvertHtmlToMarkdownOptions = CognipeerFileMarkdownOptions & {
+    /**
+     * How to interpret a `string` input.
+     * - `auto` (default): if the string is an existing file path on Node, use it as a path; else if it looks like HTML, treat as markup; otherwise pass through as a path for Cognipeer (may error).
+     * - `path`: always a filesystem path.
+     * - `html`: always HTML source (written to a temp `.html` file for Cognipeer).
+     */
+    readonly inputMode?: HtmlStringInputMode;
+};
+type ConvertHtmlToMarkdownResult = CognipeerFileMarkdownResult;
+type HtmlToMarkdownSource = CognipeerFileMarkdownSource;
+type HtmlToMarkdownFallbackReason = CognipeerFileFallbackReason;
+/** Heuristic: treat string as HTML document or fragment (not a path). */
+declare function looksLikeHtmlString(s: string): boolean;
+/**
+ * HTML → Markdown via `@cognipeer/to-markdown` on Node (Turndown-style semantics: headings, lists, links, tables, fenced code).
+ * Optional structured fallback when {@link CognipeerFileMarkdownOptions.resolveStructured} is set or wired from {@link extractMarkdown}.
+ */
+declare function convertHtmlToMarkdown(input: HtmlToMarkdownInput, options?: ConvertHtmlToMarkdownOptions): Promise<ConvertHtmlToMarkdownResult>;
+type CsvToMarkdownInput = string | Buffer | Uint8Array | ArrayBuffer;
+type CsvStringInputMode = "auto" | "path" | "content";
+type ConvertCsvToMarkdownOptions = CognipeerFileMarkdownOptions & {
+    /**
+     * How to interpret a `string` input.
+     * - `auto` (default): existing file path on Node → read file; else if the string looks like CSV text (comma + newline) → treat as body; otherwise pass as path to Cognipeer.
+     * - `path`: filesystem path.
+     * - `content`: raw CSV text.
+     */
+    readonly inputMode?: CsvStringInputMode;
+    /**
+     * When `false`, prepends synthetic `Column 1,…` so the first row of your file becomes table data (Cognipeer always builds a header row).
+     * @default true
+     */
+    readonly includeHeader?: boolean;
+    /** Collapse extra blank lines and trim trailing spaces on each line. @default false */
+    readonly compactMode?: boolean;
+    /** Maximum number of **data** rows after the header row (or after the synthetic header when `includeHeader` is false). */
+    readonly maxRows?: number;
+};
+type ConvertCsvToMarkdownResult = CognipeerFileMarkdownResult;
+type CsvToMarkdownSource = CognipeerFileMarkdownSource;
+type CsvToMarkdownFallbackReason = CognipeerFileFallbackReason;
+/**
+ * CSV → Markdown via `@cognipeer/to-markdown` (GFM-style pipe table). Optional row cap and header semantics; optional structured fallback.
+ */
+declare function convertCsvToMarkdown(input: CsvToMarkdownInput, options?: ConvertCsvToMarkdownOptions): Promise<ConvertCsvToMarkdownResult>;
+type ConvertSpreadsheetToMarkdownOptions = CognipeerFileMarkdownOptions & {
+    /**
+     * When `false`, removes `## sheetName` lines Cognipeer emits before each sheet table.
+     * @default true
+     */
+    readonly includeSheetNames?: boolean;
+    /** Collapse extra blank lines and trim trailing spaces on each line. @default false */
+    readonly compactMode?: boolean;
+    /** Max **data** rows per sheet table (after the header row), post-processed on Cognipeer output. */
+    readonly maxRowsPerSheet?: number;
+};
+type ConvertSpreadsheetToMarkdownResult = CognipeerFileMarkdownResult;
+type SpreadsheetToMarkdownInput = CognipeerFileInput;
+type SpreadsheetToMarkdownSource = CognipeerFileMarkdownSource;
+type SpreadsheetToMarkdownFallbackReason = CognipeerFileFallbackReason;
+/**
+ * Excel (`.xlsx` / `.xls`) path or bytes → Markdown via `@cognipeer/to-markdown` on Node.
+ * Cognipeer emits one `## sheetName` section per worksheet; use {@link ConvertSpreadsheetToMarkdownOptions} to tune presentation.
+ * For buffer inputs, set `forceExtension` / `fileName` when the default `.xlsx` temp name is wrong.
+ */
+declare function convertSpreadsheetToMarkdown(input: SpreadsheetToMarkdownInput, options?: ConvertSpreadsheetToMarkdownOptions): Promise<ConvertSpreadsheetToMarkdownResult>;
 /**
  * Binary file payload for {@link extractMarkdown} when you have bytes (and optional name/MIME hints).
  */
@@ -445,7 +541,7 @@ type ExtractMarkdownInput = StructuredDocumentResult | ExtractMarkdownFileInput
  *
  * Top-level fields match {@link ConvertStructuredToMarkdownOptions} so passing the same object you would pass to
  * {@link convertStructuredToMarkdown} remains valid when `input` is a {@link StructuredDocumentResult}.
- * Additional fields configure DOCX/PDF branches and cross-strategy fallback.
+ * Additional fields configure DOCX, PDF, HTML, CSV, spreadsheet branches and cross-strategy fallback.
  */
 type ExtractMarkdownOptions = ConvertStructuredToMarkdownOptions & {
     /**
@@ -457,15 +553,34 @@ type ExtractMarkdownOptions = ConvertStructuredToMarkdownOptions & {
     readonly docx?: ConvertDocxToMarkdownOptions;
     /** Overrides merged into {@link convertPdfToMarkdown} when the input is identified as `.pdf`. */
     readonly pdf?: ConvertPdfToMarkdownOptions;
+    /**
+     * Overrides merged into {@link convertHtmlToMarkdown} when the input is identified as HTML
+     * (`inputMode`, `resolveStructured`, …). File/byte inputs from {@link extractMarkdown} do not need `inputMode`.
+     */
+    readonly html?: ConvertHtmlToMarkdownOptions;
+    /**
+     * Overrides merged into {@link convertCsvToMarkdown} when the input is identified as CSV
+     * (`maxRows`, `includeHeader`, `compactMode`, `inputMode`, …).
+     */
+    readonly csv?: ConvertCsvToMarkdownOptions;
+    /**
+     * Overrides merged into {@link convertSpreadsheetToMarkdown} for `.xlsx` / `.xls`
+     * (`maxRowsPerSheet`, `includeSheetNames`, `compactMode`, …).
+     */
+    readonly spreadsheet?: ConvertSpreadsheetToMarkdownOptions;
 };
 /** @see {@link detectBinaryFormat} */
-type DetectedBinaryFormat = "docx" | "pdf" | "unknown";
+type DetectedBinaryFormat = "docx" | "pdf" | "html" | "csv" | "spreadsheet" | "unknown";
 /** Which branch produced {@link ExtractMarkdownResult.markdown}. */
 type ExtractMarkdownStrategy = "structured" | "docx-mammoth" | "docx-structured-fallback" | "pdf-cognipeer-specialized"
 /** `@cognipeer/to-markdown` not loadable (missing package, bundler, etc.). */
  | "pdf-cognipeer-unavailable"
 /** Engine failed or returned empty Markdown; no structured fallback or fallback also failed. */
- | "pdf-cognipeer-failed" | "pdf-structured-fallback" | "pdf-unsupported-runtime" | "docx-requires-node" | "path-requires-node" | "binary-unidentified" | "binary-unidentified-structured-fallback";
+ | "pdf-cognipeer-failed" | "pdf-structured-fallback" | "pdf-unsupported-runtime" | "html-cognipeer-specialized" | "html-cognipeer-unavailable" | "html-cognipeer-failed" | "html-structured-fallback" | "html-unsupported-runtime" | "csv-cognipeer-specialized" | "csv-cognipeer-unavailable" | "csv-cognipeer-failed" | "csv-structured-fallback" | "csv-unsupported-runtime" | "spreadsheet-cognipeer-specialized" | "spreadsheet-cognipeer-unavailable" | "spreadsheet-cognipeer-failed" | "spreadsheet-structured-fallback" | "spreadsheet-unsupported-runtime" | "docx-requires-node" | "path-requires-node" | "binary-unidentified" | "binary-unidentified-structured-fallback";
+/**
+ * When bytes are not a known office/HTML/CSV type, MIME/filename can still hint image vs text vs media for messaging.
+ */
+type ExtractMarkdownMediaHint = "image" | "text" | "audio" | "video";
 /**
  * Format detection and fallback bookkeeping for file/path inputs to {@link extractMarkdown}.
  * Omitted when `input` is a {@link StructuredDocumentResult} (already structured).
@@ -473,9 +588,16 @@ type ExtractMarkdownStrategy = "structured" | "docx-mammoth" | "docx-structured-
 interface ExtractMarkdownRoutingInfo {
     readonly detectedFormat: DetectedBinaryFormat;
     /** Specialized binary pipeline invoked first on Node (`none` if not applicable). */
-    readonly specializedPipeline: "docx" | "pdf" | "none";
+    readonly specializedPipeline: "docx" | "pdf" | "html" | "csv" | "spreadsheet" | "none";
     /** Final Markdown came from {@link ExtractMarkdownOptions.structuredFallback} (or the same snapshot via `resolveStructured`). */
     readonly usedStructuredFallback: boolean;
+    /** Set when {@link ExtractMarkdownRoutingInfo.detectedFormat} is `unknown` and MIME/name suggest a category. */
+    readonly mediaHint?: ExtractMarkdownMediaHint;
+    /**
+     * One-line trace for operators: strategy, format, pipeline, fallback flag, optional media hint.
+     * Prefix: `[docmind-markdown:extractMarkdown:routing]`.
+     */
+    readonly routingSummary: string;
 }
 interface ExtractMarkdownResult {
     readonly markdown: string;
@@ -490,19 +612,24 @@ interface ExtractMarkdownResult {
 }
 /** Type guard: file-like `{ data: … }` input for {@link extractMarkdown}. */
 declare function isExtractMarkdownFileInput(value: unknown): value is ExtractMarkdownFileInput;
-/** Detect PDF / OOXML zip (DOCX) from magic bytes and optional filename / MIME. */
+/**
+ * Detect specialized binary/text formats from magic bytes and optional filename / MIME.
+ * OOXML ZIP containers are disambiguated so `.xlsx` is not treated as WordprocessingML.
+ */
 declare function detectBinaryFormat(data: Buffer | Uint8Array | ArrayBuffer, filename?: string, mimeType?: string): DetectedBinaryFormat;
 /**
  * Produces Markdown from a {@link StructuredDocumentResult}, raw file bytes, or a filesystem `path` (Node),
- * picking DOCX / PDF specialized pipelines when possible and falling back to {@link convertStructuredToMarkdown}.
+ * routing to specialized converters when possible and falling back to {@link convertStructuredToMarkdown}.
  *
- * - **Structured input** — always uses the structured serializer (image/OCR/text/PDF/DOCX blocks already normalized).
- * - **DOCX bytes / path** — {@link convertDocxToMarkdown} on Node; otherwise warns and uses {@link ExtractMarkdownOptions.structuredFallback} if provided.
- * - **PDF bytes / path** — {@link convertPdfToMarkdown} (`@cognipeer/to-markdown` on Node); in non-Node runtimes returns empty Markdown with warnings unless {@link ExtractMarkdownOptions.structuredFallback} supplies content.
+ * - **Structured input** — {@link convertStructuredToMarkdown} only (`strategy` `structured`).
+ * - **DOCX** — {@link convertDocxToMarkdown} on Node; otherwise warns and uses {@link ExtractMarkdownOptions.structuredFallback} if set.
+ * - **PDF** — {@link convertPdfToMarkdown} on Node (Cognipeer); non-Node / failures use `structuredFallback` when provided.
+ * - **HTML / CSV / Excel** — {@link convertHtmlToMarkdown}, {@link convertCsvToMarkdown}, {@link convertSpreadsheetToMarkdown} on Node (Cognipeer family); same fallback rules as PDF.
+ * - **Unknown bytes** (e.g. raw images, arbitrary binary) — no specialized route; use a {@link StructuredDocumentResult} (OCR/image adapters) or `structuredFallback`. {@link ExtractMarkdownRoutingInfo.mediaHint} and warnings call this out when MIME/filename suggest `image/*` or `text/*`.
  *
- * Any use of `structuredFallback` (or the same snapshot through `resolveStructured`) adds a tagged line in {@link ExtractMarkdownResult.warnings}
- * and sets {@link ExtractMarkdownRoutingInfo.usedStructuredFallback} when {@link ExtractMarkdownResult.routing} is present.
+ * Fallbacks are never silent: tagged lines appear in {@link ExtractMarkdownResult.warnings}, and {@link ExtractMarkdownRoutingInfo.routingSummary}
+ * records the final `strategy`, detected format, pipeline, and whether structured fallback was used.
  */
 declare function extractMarkdown(input: ExtractMarkdownInput, options?: ExtractMarkdownOptions): Promise<ExtractMarkdownResult>;
-export { type ConvertDocxToMarkdownOptions, type ConvertDocxToMarkdownResult, type ConvertPdfToMarkdownOptions, type ConvertPdfToMarkdownResult, type ConvertStructuredToLlmTextOptions, type ConvertStructuredToMarkdownOptions, type DetectedBinaryFormat, type DocxMarkdownMessage, type DocxMarkdownResult, type DocxToMarkdownInput, type DocxToMarkdownSource, type ExtractMarkdownFileInput, type ExtractMarkdownInput, type ExtractMarkdownOptions, type ExtractMarkdownPathInput, type ExtractMarkdownResult, type ExtractMarkdownRoutingInfo, type ExtractMarkdownStrategy, type MammothConvertToHtmlOptions, type MarkdownSection, type OpenDataLoaderPdfConvertOptions, type PdfMarkdownResult, type PdfToMarkdownFallbackReason, type PdfToMarkdownInput, type PdfToMarkdownSource, type RenderLlmTextOptions, type RenderMarkdownOptions, type RenderMarkdownSectionsOptions, type SplitStructuredIntoChunksOptions, type StructuredChunk, type StructuredToLlmTextOptions, type StructuredToMarkdownOptions, type TurndownServiceOptions, convertDocxBufferToMarkdown, convertDocxToMarkdown, convertPdfBufferToMarkdown, convertPdfPathToMarkdown, convertPdfToMarkdown, convertStructuredToLlmText, convertStructuredToMarkdown, detectBinaryFormat, extractLlmContent, extractMarkdown, extractStructuredChunks, isExtractMarkdownFileInput, renderLlmText, renderMarkdown, renderMarkdownSections, splitStructuredIntoChunks, structuredDocumentToLlmText, structuredDocumentToMarkdown };
+export { type ConvertCsvToMarkdownOptions, type ConvertCsvToMarkdownResult, type ConvertDocxToMarkdownOptions, type ConvertDocxToMarkdownResult, type ConvertHtmlToMarkdownOptions, type ConvertHtmlToMarkdownResult, type ConvertPdfToMarkdownOptions, type ConvertPdfToMarkdownResult, type ConvertSpreadsheetToMarkdownOptions, type ConvertSpreadsheetToMarkdownResult, type ConvertStructuredToLlmTextOptions, type ConvertStructuredToMarkdownOptions, type CsvStringInputMode, type CsvToMarkdownFallbackReason, type CsvToMarkdownInput, type CsvToMarkdownSource, type DetectedBinaryFormat, type DocxMarkdownMessage, type DocxMarkdownResult, type DocxToMarkdownInput, type DocxToMarkdownSource, type ExtractMarkdownFileInput, type ExtractMarkdownInput, type ExtractMarkdownMediaHint, type ExtractMarkdownOptions, type ExtractMarkdownPathInput, type ExtractMarkdownResult, type ExtractMarkdownRoutingInfo, type ExtractMarkdownStrategy, type HtmlStringInputMode, type HtmlToMarkdownFallbackReason, type HtmlToMarkdownInput, type HtmlToMarkdownSource, type MammothConvertToHtmlOptions, type MarkdownSection, type OpenDataLoaderPdfConvertOptions, type PdfMarkdownResult, type PdfToMarkdownFallbackReason, type PdfToMarkdownInput, type PdfToMarkdownSource, type RenderLlmTextOptions, type RenderMarkdownOptions, type RenderMarkdownSectionsOptions, type SplitStructuredIntoChunksOptions, type SpreadsheetToMarkdownFallbackReason, type SpreadsheetToMarkdownInput, type SpreadsheetToMarkdownSource, type StructuredChunk, type StructuredToLlmTextOptions, type StructuredToMarkdownOptions, type TurndownServiceOptions, convertCsvToMarkdown, convertDocxBufferToMarkdown, convertDocxToMarkdown, convertHtmlToMarkdown, convertPdfBufferToMarkdown, convertPdfPathToMarkdown, convertPdfToMarkdown, convertSpreadsheetToMarkdown, convertStructuredToLlmText, convertStructuredToMarkdown, detectBinaryFormat, extractLlmContent, extractMarkdown, extractStructuredChunks, isExtractMarkdownFileInput, looksLikeHtmlString, renderLlmText, renderMarkdown, renderMarkdownSections, splitStructuredIntoChunks, structuredDocumentToLlmText, structuredDocumentToMarkdown };