npm - @memvid/sdk - Versions diffs - 2.0.155 → 2.0.157 - Mend

@memvid/sdk 2.0.155 → 2.0.157

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

package/dist/documents/index.d.ts CHANGED Viewed

@@ -17,7 +17,8 @@
 import type { ParseOptions, ParseResult } from "./types";
 export * from "./types";
 export { parsePdf } from "./pdf";
-export { parseXlsx } from "./xlsx";
+export { parseXlsx, parseXlsxStructured } from "./xlsx";
+export type { XlsxStructuredChunk, XlsxDetectedTable, XlsxStructuredOptions, XlsxStructuredResult, } from "./xlsx";
 export { parsePptx } from "./pptx";
 export { parseDocx } from "./docx";
 /**

package/dist/documents/index.js CHANGED Viewed

@@ -30,7 +30,7 @@ var __exportStar = (this && this.__exportStar) || function(m, exports) {
     for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
 };
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.parseDocx = exports.parsePptx = exports.parseXlsx = exports.parsePdf = void 0;
+exports.parseDocx = exports.parsePptx = exports.parseXlsxStructured = exports.parseXlsx = exports.parsePdf = void 0;
 exports.parse = parse;
 exports.isSupportedFormat = isSupportedFormat;
 exports.getDocumentType = getDocumentType;
@@ -45,6 +45,7 @@ var pdf_2 = require("./pdf");
 Object.defineProperty(exports, "parsePdf", { enumerable: true, get: function () { return pdf_2.parsePdf; } });
 var xlsx_2 = require("./xlsx");
 Object.defineProperty(exports, "parseXlsx", { enumerable: true, get: function () { return xlsx_2.parseXlsx; } });
+Object.defineProperty(exports, "parseXlsxStructured", { enumerable: true, get: function () { return xlsx_2.parseXlsxStructured; } });
 var pptx_2 = require("./pptx");
 Object.defineProperty(exports, "parsePptx", { enumerable: true, get: function () { return pptx_2.parsePptx; } });
 var docx_2 = require("./docx");

package/dist/documents/xlsx.d.ts CHANGED Viewed

@@ -1,6 +1,82 @@
 /**
  * Excel Parser with error handling
- * Uses exceljs - no Rust fallback available for XLSX
+ *
+ * Two strategies:
+ * - `parseXlsx()` — JS-based via exceljs (legacy, per-sheet flat text)
+ * - `parseXlsxStructured()` — Rust-native via memvid-core (high accuracy,
+ *   header-value pairing, merged cell support, semantic chunking)
  */
 import type { ParseOptions, ParseResult } from "./types";
 export declare function parseXlsx(filePath: string, options?: ParseOptions): Promise<ParseResult>;
+/** A single structured chunk from the XLSX extraction pipeline. */
+export interface XlsxStructuredChunk {
+    /** Chunk text with a [Sheet: ...] prefix; Header: Value formatting comes from the native extractor (the JS fallback appends the legacy per-sheet text instead) */
+    text: string;
+    /** Chunk type: "Table", "TableContinuation", "Text", etc. (the JS fallback always emits "Table") */
+    chunkType: string;
+    /** 0-based chunk index */
+    index: number;
+    /** Table/element identifier (not set by the JS fallback) */
+    elementId?: string;
+    /** Header context for continuation chunks (not set by the JS fallback) */
+    context?: string;
+}
+/** A detected table with headers and metadata. NOTE(review): the index base of the row/column fields and the range of `confidence` are not stated here — confirm against memvid-core. */
+export interface XlsxDetectedTable {
+    name: string;
+    sheetName: string;
+    headers: string[];
+    headerRow: number;
+    firstDataRow: number;
+    lastDataRow: number;
+    firstCol: number;
+    lastCol: number;
+    confidence: number;
+    columnTypes: string[];
+}
+/** Options for structured XLSX extraction. Forwarded only to the native extractor; the JS fallback path does not apply them. */
+export interface XlsxStructuredOptions {
+    /** Target chunk size in characters (default: 1200) */
+    maxChars?: number;
+    /** Maximum chunks to produce (default: 500) */
+    maxChunks?: number;
+}
+/** Result of structured XLSX extraction. */
+export interface XlsxStructuredResult {
+    /** Backward-compatible flat text */
+    text: string;
+    /** Semantic chunks with header-value pairing */
+    chunks: XlsxStructuredChunk[];
+    /** Detected tables with metadata (always empty when the JS fallback is used) */
+    tables: XlsxDetectedTable[];
+    /** Extraction diagnostics (the JS fallback adds one warning and reports 0 tables processed/split) */
+    diagnostics: {
+        warnings: string[];
+        tablesProcessed: number;
+        tablesSplit: number;
+    };
+    /** Extraction time in milliseconds (reported as 0 by the JS fallback, which does not measure it) */
+    timingMs: number;
+}
+/**
+ * Parse an XLSX file using the Rust structured extraction pipeline, falling back to a wrapper around `parseXlsx()` output when the native call is unavailable or throws.
+ *
+ * The native pipeline provides much higher search accuracy than `parseXlsx()` by:
+ * - Detecting table boundaries and headers automatically
+ * - Formatting rows as `Header: Value | Header: Value` pairs
+ * - Propagating merged cells
+ * - Detecting number formats (dates, currency, percentages)
+ * - Never splitting rows across chunk boundaries
+ *
+ * @example
+ * ```typescript
+ * const result = await parseXlsxStructured("./proforma.xlsx");
+ * console.log(`${result.tables.length} tables, ${result.chunks.length} chunks`);
+ *
+ * // Ingest chunks into memvid for high-accuracy search
+ * for (const chunk of result.chunks) {
+ *   await mem.put({ text: chunk.text, title: `XLSX chunk ${chunk.index}` });
+ * }
+ * ```
+ */
+export declare function parseXlsxStructured(filePath: string, options?: XlsxStructuredOptions): Promise<XlsxStructuredResult>;

package/dist/documents/xlsx.js CHANGED Viewed

@@ -1,7 +1,11 @@
 "use strict";
 /**
  * Excel Parser with error handling
- * Uses exceljs - no Rust fallback available for XLSX
+ *
+ * Two strategies:
+ * - `parseXlsx()` — JS-based via exceljs (legacy, per-sheet flat text)
+ * - `parseXlsxStructured()` — Rust-native via memvid-core (high accuracy,
+ *   header-value pairing, merged cell support, semantic chunking)
  */
 var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
     if (k2 === undefined) k2 = k;
@@ -38,6 +42,7 @@ var __importStar = (this && this.__importStar) || (function () {
 })();
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.parseXlsx = parseXlsx;
+exports.parseXlsxStructured = parseXlsxStructured;
 /** Extract the display value from an exceljs cell */
 function getCellValue(cellValue) {
     if (cellValue == null || cellValue === "")
@@ -115,3 +120,57 @@ async function parseXlsx(filePath, options) {
             `Ensure the file is a valid .xlsx/.xls file.`);
     }
 }
+/**
+ * Parse an XLSX file using the Rust structured extraction pipeline, falling back to a wrapper around `parseXlsx()` output when the native call is unavailable or throws.
+ *
+ * The native pipeline provides much higher search accuracy than `parseXlsx()` by:
+ * - Detecting table boundaries and headers automatically
+ * - Formatting rows as `Header: Value | Header: Value` pairs
+ * - Propagating merged cells
+ * - Detecting number formats (dates, currency, percentages)
+ * - Never splitting rows across chunk boundaries
+ *
+ * @example
+ * ```typescript
+ * const result = await parseXlsxStructured("./proforma.xlsx");
+ * console.log(`${result.tables.length} tables, ${result.chunks.length} chunks`);
+ *
+ * // Ingest chunks into memvid for high-accuracy search
+ * for (const chunk of result.chunks) {
+ *   await mem.put({ text: chunk.text, title: `XLSX chunk ${chunk.index}` });
+ * }
+ * ```
+ */
+async function parseXlsxStructured(filePath, options) {
+    try {
+        // Try native Rust extraction first; the require sits inside the try so a failed load is caught below
+        const native = require("../../index.js");
+        if (typeof native.parseXlsxStructured === "function") {
+            const nativeOpts = options
+                ? { maxChars: options.maxChars, maxChunks: options.maxChunks }
+                : undefined;
+            return await native.parseXlsxStructured(filePath, nativeOpts);
+        }
+    }
+    catch {
+        // NOTE(review): this swallows a failed require AND any rejection from the native call itself, then falls through to the JS fallback (whose warning still says "unavailable")
+    }
+    // Fallback: use parseXlsx and wrap in structured format; `options` (maxChars/maxChunks) are not applied on this path
+    const legacy = await parseXlsx(filePath);
+    const chunks = legacy.items.map((item, i) => ({
+        text: `[Sheet: ${item.name || `Sheet${item.number}`}]\n${item.text}`,
+        chunkType: "Table",
+        index: i,
+    }));
+    return {
+        text: legacy.items.map((item) => item.text).join("\n\n"),
+        chunks,
+        tables: [],
+        diagnostics: {
+            warnings: ["Native structured extraction unavailable; using JS fallback"],
+            tablesProcessed: 0,
+            tablesSplit: 0,
+        },
+        timingMs: 0, // not measured on the fallback path
+    };
+}

package/dist/image-ingest.d.ts ADDED Viewed

@@ -0,0 +1,250 @@
+/**
+ * High-performance batch image ingestion for Memvid SDK (Node.js).
+ *
+ * Uses OCR to extract text from images, then ingests into a .mv2 memory file.
+ * docTR (via Python) provides the highest accuracy (85.3%); Tesseract.js is available as an optional dependency.
+ *
+ * @example
+ * ```typescript
+ * import { ImageIngestor } from '@memvid/sdk';
+ *
+ * // First install tesseract.js: npm install tesseract.js
+ * const ingestor = new ImageIngestor({
+ *   ocrProvider: 'tesseract',
+ *   workers: 4,
+ * });
+ *
+ * const result = await ingestor.ingestDirectory(
+ *   './construction_drawings/',
+ *   './project.mv2',
+ *   {
+ *     patterns: ['*.png', '*.jpg'],
+ *     onProgress: (done, total) => console.log(`${done}/${total}`),
+ *   }
+ * );
+ *
+ * console.log(`Processed ${result.totalImages} images`);
+ * await ingestor.terminate();
+ * ```
+ *
+ * For highest accuracy (85.3%), use docTR via Python:
+ * ```typescript
+ * // Requires: pip install python-doctr[torch]
+ * const ingestor = new ImageIngestor({ ocrProvider: 'doctr' });
+ * ```
+ */
+import { OCRProviderType } from './ocr';
+/**
+ * Options for image ingestion.
+ */
+export interface ImageIngestOptions {
+    /** Minimum OCR confidence threshold (0-1). Default: 0.3 */
+    minConfidence?: number;
+    /** Use fallback OCR on low confidence. Default: true */
+    fallbackOcr?: boolean;
+    /** Images to process per batch. Default: 10 */
+    batchSize?: number;
+    /** Metadata to attach to all ingested frames */
+    metadata?: Record<string, unknown>;
+    /** Label for ingested frames. Default: 'image-extract' */
+    label?: string;
+}
+/**
+ * Options for directory ingestion.
+ */
+export interface DirectoryIngestOptions extends ImageIngestOptions {
+    /** Glob patterns for files to include. Default: ['*.png', '*.jpg', '*.jpeg', '*.tiff'] */
+    patterns?: string[];
+    /** Search subdirectories. Default: true */
+    recursive?: boolean;
+    /** Progress callback */
+    onProgress?: (completed: number, total: number) => void;
+}
+/**
+ * Options for array-based ingestion.
+ */
+export interface ImagesIngestOptions extends ImageIngestOptions {
+    /** Progress callback */
+    onProgress?: (completed: number, total: number) => void;
+}
+/**
+ * Result from batch image ingestion.
+ */
+export interface ImageIngestResult {
+    /** Total images processed */
+    totalImages: number;
+    /** Successfully ingested images */
+    successful: number;
+    /** Failed images */
+    failed: number;
+    /** Total chunks/frames created */
+    totalChunks: number;
+    /** Processing time in seconds */
+    elapsedSeconds: number;
+    /** Output file size in bytes */
+    outputSizeBytes: number;
+    /** Errors encountered */
+    errors: Array<{
+        path: string;
+        error: string;
+    }>;
+    /** Images processed per second */
+    imagesPerSecond: number;
+    /** Output size in MB */
+    outputSizeMb: number;
+}
+/**
+ * Constructor options for ImageIngestor.
+ */
+export interface ImageIngestorOptions {
+    /** OCR provider: 'tesseract', 'doctr', or 'easyocr'. Default: 'tesseract' */
+    ocrProvider?: OCRProviderType;
+    /** Number of parallel workers. Default: CPU count */
+    workers?: number;
+    /** Python path for doctr/easyocr providers */
+    pythonPath?: string;
+}
+/**
+ * High-performance batch image ingestor for Memvid.
+ *
+ * Combines OCR text extraction with parallel processing for fast, accurate
+ * ingestion of large image collections.
+ *
+ * OCR Accuracy (tested on construction drawings):
+ *   - docTR (Python): 85.3% - BEST
+ *   - EasyOCR (Python): 79.4%
+ *   - Tesseract.js: ~50-60%
+ *
+ * @example
+ * ```typescript
+ * const ingestor = new ImageIngestor({
+ *   ocrProvider: 'doctr',
+ *   workers: 8,
+ * });
+ *
+ * const result = await ingestor.ingestDirectory('./drawings/', './output.mv2');
+ * console.log(`Processed ${result.totalImages} images in ${result.elapsedSeconds}s`);
+ *
+ * await ingestor.terminate();
+ * ```
+ */
+export declare class ImageIngestor {
+    private _ocr;
+    private _fallbackOcr;
+    private _workers;
+    private _ocrType;
+    constructor(options?: ImageIngestorOptions);
+    /** Primary OCR provider name */
+    get ocrName(): string;
+    /** Number of parallel workers */
+    get workers(): number;
+    /**
+     * Ingest multiple images into a .mv2 file.
+     *
+     * @param paths - Array of image file paths
+     * @param outputPath - Output .mv2 file path
+     * @param options - Ingestion options
+     * @returns Promise resolving to ingestion result
+     *
+     * @example
+     * ```typescript
+     * const result = await ingestor.ingestImages(
+     *   ['img1.png', 'img2.png'],
+     *   './output.mv2',
+     *   { onProgress: (d, t) => console.log(`${d}/${t}`) }
+     * );
+     * ```
+     */
+    ingestImages(paths: string[], outputPath: string, options?: ImagesIngestOptions): Promise<ImageIngestResult>;
+    /**
+     * Ingest all matching images from a directory.
+     *
+     * @param directory - Source directory path
+     * @param outputPath - Output .mv2 file path
+     * @param options - Directory ingestion options
+     * @returns Promise resolving to ingestion result
+     *
+     * @example
+     * ```typescript
+     * const result = await ingestor.ingestDirectory(
+     *   './construction_drawings/',
+     *   './project.mv2',
+     *   {
+     *     patterns: ['*.png', '*.jpg'],
+     *     recursive: true,
+     *     onProgress: (d, t) => console.log(`${d}/${t}`),
+     *   }
+     * );
+     * ```
+     */
+    ingestDirectory(directory: string, outputPath: string, options?: DirectoryIngestOptions): Promise<ImageIngestResult>;
+    /**
+     * Extract text from a single image with fallback support.
+     */
+    private _extractText;
+    /**
+     * Clean up OCR worker resources.
+     *
+     * Call this when done using the ingestor to free memory.
+     */
+    terminate(): Promise<void>;
+}
+/**
+ * Convenience function for quick image ingestion.
+ *
+ * Creates an ImageIngestor, processes images, and cleans up automatically.
+ *
+ * @param paths - Array of image file paths
+ * @param outputPath - Output .mv2 file path
+ * @param options - Ingestion options
+ * @returns Promise resolving to ingestion result
+ *
+ * @example
+ * ```typescript
+ * import { ingestImages } from '@memvid/sdk';
+ *
+ * const result = await ingestImages(
+ *   ['img1.png', 'img2.png'],
+ *   './output.mv2',
+ *   {
+ *     ocrProvider: 'doctr',
+ *     onProgress: (d, t) => console.log(`${d}/${t}`),
+ *   }
+ * );
+ * ```
+ */
+export declare function ingestImages(paths: string[], outputPath: string, options?: ImagesIngestOptions & {
+    ocrProvider?: OCRProviderType;
+    workers?: number;
+    pythonPath?: string;
+}): Promise<ImageIngestResult>;
+/**
+ * Convenience function for quick directory ingestion.
+ *
+ * Creates an ImageIngestor, processes directory, and cleans up automatically.
+ *
+ * @param directory - Source directory path
+ * @param outputPath - Output .mv2 file path
+ * @param options - Directory ingestion options
+ * @returns Promise resolving to ingestion result
+ *
+ * @example
+ * ```typescript
+ * import { ingestDirectory } from '@memvid/sdk';
+ *
+ * const result = await ingestDirectory(
+ *   './construction_drawings/',
+ *   './project.mv2',
+ *   {
+ *     ocrProvider: 'doctr',
+ *     patterns: ['*.png', '*.jpg'],
+ *     onProgress: (d, t) => console.log(`${d}/${t}`),
+ *   }
+ * );
+ * ```
+ */
+export declare function ingestDirectory(directory: string, outputPath: string, options?: DirectoryIngestOptions & {
+    ocrProvider?: OCRProviderType;
+    workers?: number;
+    pythonPath?: string;
+}): Promise<ImageIngestResult>;