npm - @memvid/sdk - Versions diffs - 2.0.156 → 2.0.157 - Mend

@memvid/sdk 2.0.156 → 2.0.157

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/dist/documents/index.d.ts CHANGED Viewed

@@ -17,7 +17,8 @@
 import type { ParseOptions, ParseResult } from "./types";
 export * from "./types";
 export { parsePdf } from "./pdf";
-export { parseXlsx } from "./xlsx";
+export { parseXlsx, parseXlsxStructured } from "./xlsx";
+export type { XlsxStructuredChunk, XlsxDetectedTable, XlsxStructuredOptions, XlsxStructuredResult, } from "./xlsx";
 export { parsePptx } from "./pptx";
 export { parseDocx } from "./docx";
 /**

package/dist/documents/index.js CHANGED Viewed

@@ -30,7 +30,7 @@ var __exportStar = (this && this.__exportStar) || function(m, exports) {
     for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
 };
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.parseDocx = exports.parsePptx = exports.parseXlsx = exports.parsePdf = void 0;
+exports.parseDocx = exports.parsePptx = exports.parseXlsxStructured = exports.parseXlsx = exports.parsePdf = void 0;
 exports.parse = parse;
 exports.isSupportedFormat = isSupportedFormat;
 exports.getDocumentType = getDocumentType;
@@ -45,6 +45,7 @@ var pdf_2 = require("./pdf");
 Object.defineProperty(exports, "parsePdf", { enumerable: true, get: function () { return pdf_2.parsePdf; } });
 var xlsx_2 = require("./xlsx");
 Object.defineProperty(exports, "parseXlsx", { enumerable: true, get: function () { return xlsx_2.parseXlsx; } });
+Object.defineProperty(exports, "parseXlsxStructured", { enumerable: true, get: function () { return xlsx_2.parseXlsxStructured; } });
 var pptx_2 = require("./pptx");
 Object.defineProperty(exports, "parsePptx", { enumerable: true, get: function () { return pptx_2.parsePptx; } });
 var docx_2 = require("./docx");

package/dist/documents/xlsx.d.ts CHANGED Viewed

@@ -1,6 +1,82 @@
 /**
  * Excel Parser with error handling
- * Uses exceljs - no Rust fallback available for XLSX
+ *
+ * Two strategies:
+ * - `parseXlsx()` — JS-based via exceljs (legacy, per-sheet flat text)
+ * - `parseXlsxStructured()` — Rust-native via memvid-core (high accuracy,
+ *   header-value pairing, merged cell support, semantic chunking)
  */
 import type { ParseOptions, ParseResult } from "./types";
 export declare function parseXlsx(filePath: string, options?: ParseOptions): Promise<ParseResult>;
+/** A single structured chunk from the XLSX extraction pipeline (native result, or one wrapped legacy item in the JS fallback). */
+export interface XlsxStructuredChunk {
+    /** Chunk text with [Sheet:] prefix and Header: Value formatting; the JS fallback emits `[Sheet: <name>]`, a newline, then the legacy item text */
+    text: string;
+    /** Chunk type: "Table", "TableContinuation", "Text", etc. The JS fallback always emits "Table". */
+    chunkType: string;
+    /** 0-based chunk index (in the JS fallback: position of the item in the legacy `items` array) */
+    index: number;
+    /** Table/element identifier; not set by the JS fallback */
+    elementId?: string;
+    /** Header context for continuation chunks; not set by the JS fallback */
+    context?: string;
+}
+/** A detected table with headers and metadata. Only the native extractor reports tables; the JS fallback returns `tables: []`. */
+export interface XlsxDetectedTable {
+    name: string; // table name as reported by the native extractor
+    sheetName: string; // presumably the worksheet containing the table — TODO confirm against memvid-core
+    headers: string[]; // presumably the header cell texts, in column order — TODO confirm
+    headerRow: number; // NOTE(review): 0- vs 1-based row indexing is not specified here — confirm
+    firstDataRow: number; // NOTE(review): same indexing base as headerRow, presumably — confirm
+    lastDataRow: number; // NOTE(review): inclusive vs exclusive bound is not specified here — confirm
+    firstCol: number; // NOTE(review): 0- vs 1-based column indexing is not specified here — confirm
+    lastCol: number; // NOTE(review): inclusive vs exclusive bound is not specified here — confirm
+    confidence: number; // NOTE(review): scale/range of the detection confidence is not specified here — confirm
+    columnTypes: string[]; // presumably one entry per column; the set of possible values is not specified here
+}
+/** Options for structured XLSX extraction. Forwarded to the native extractor only; the JS fallback calls `parseXlsx(filePath)` without them. */
+export interface XlsxStructuredOptions {
+    /** Target chunk size in characters (default: 1200); the default is applied by the native side, not by this wrapper */
+    maxChars?: number;
+    /** Maximum chunks to produce (default: 500); the default is applied by the native side, not by this wrapper */
+    maxChunks?: number;
+}
+/** Result of structured XLSX extraction (returned as-is from the native extractor, or built by the JS fallback). */
+export interface XlsxStructuredResult {
+    /** Backward-compatible flat text (JS fallback: legacy item texts joined with a blank line) */
+    text: string;
+    /** Semantic chunks with header-value pairing (JS fallback: one chunk per legacy item, no header-value pairing added) */
+    chunks: XlsxStructuredChunk[];
+    /** Detected tables with metadata (always empty in the JS fallback) */
+    tables: XlsxDetectedTable[];
+    /** Extraction diagnostics */
+    diagnostics: {
+        warnings: string[]; // the JS fallback adds a notice that native structured extraction was unavailable
+        tablesProcessed: number; // 0 in the JS fallback
+        tablesSplit: number; // 0 in the JS fallback
+    };
+    /** Extraction time in milliseconds */
+    timingMs: number;
+}
+/**
+ * Parse an XLSX file using the Rust structured extraction pipeline.
+ *
+ * This provides much higher search accuracy than `parseXlsx()` by:
+ * - Detecting table boundaries and headers automatically
+ * - Formatting rows as `Header: Value | Header: Value` pairs
+ * - Propagating merged cells
+ * - Detecting number formats (dates, currency, percentages)
+ * - Never splitting rows across chunk boundaries
+ *
+ * If the native extractor is not used, the implementation falls back to
+ * `parseXlsx()` and wraps each legacy item in a single "Table" chunk. In that
+ * case `tables` is empty, `options` are not applied, and
+ * `diagnostics.warnings` contains a notice about the fallback.
+ *
+ * @param filePath - Path to the workbook; passed unchanged to the native extractor or to `parseXlsx()`.
+ * @param options - Chunking limits; forwarded to the native extractor only.
+ * @returns The native structured result, or the wrapped legacy result when the fallback runs.
+ *
+ * @example
+ * ```typescript
+ * const result = await parseXlsxStructured("./proforma.xlsx");
+ * console.log(`${result.tables.length} tables, ${result.chunks.length} chunks`);
+ *
+ * // Ingest chunks into memvid for high-accuracy search
+ * for (const chunk of result.chunks) {
+ *   await mem.put({ text: chunk.text, title: `XLSX chunk ${chunk.index}` });
+ * }
+ * ```
+ */
+export declare function parseXlsxStructured(filePath: string, options?: XlsxStructuredOptions): Promise<XlsxStructuredResult>;

package/dist/documents/xlsx.js CHANGED Viewed

@@ -1,7 +1,11 @@
 "use strict";
 /**
  * Excel Parser with error handling
- * Uses exceljs - no Rust fallback available for XLSX
+ *
+ * Two strategies:
+ * - `parseXlsx()` — JS-based via exceljs (legacy, per-sheet flat text)
+ * - `parseXlsxStructured()` — Rust-native via memvid-core (high accuracy,
+ *   header-value pairing, merged cell support, semantic chunking)
  */
 var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
     if (k2 === undefined) k2 = k;
@@ -38,6 +42,7 @@ var __importStar = (this && this.__importStar) || (function () {
 })();
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.parseXlsx = parseXlsx;
+exports.parseXlsxStructured = parseXlsxStructured;
 /** Extract the display value from an exceljs cell */
 function getCellValue(cellValue) {
     if (cellValue == null || cellValue === "")
@@ -115,3 +120,57 @@ async function parseXlsx(filePath, options) {
             `Ensure the file is a valid .xlsx/.xls file.`);
     }
 }
+/**
+ * Parse an XLSX file using the Rust structured extraction pipeline.
+ *
+ * This provides much higher search accuracy than `parseXlsx()` by:
+ * - Detecting table boundaries and headers automatically
+ * - Formatting rows as `Header: Value | Header: Value` pairs
+ * - Propagating merged cells
+ * - Detecting number formats (dates, currency, percentages)
+ * - Never splitting rows across chunk boundaries
+ *
+ * @example
+ * ```typescript
+ * const result = await parseXlsxStructured("./proforma.xlsx");
+ * console.log(`${result.tables.length} tables, ${result.chunks.length} chunks`);
+ *
+ * // Ingest chunks into memvid for high-accuracy search
+ * for (const chunk of result.chunks) {
+ *   await mem.put({ text: chunk.text, title: `XLSX chunk ${chunk.index}` });
+ * }
+ * ```
+ */
+async function parseXlsxStructured(filePath, options) {
+    try {
+        // Try native Rust extraction first
+        const native = require("../../index.js");
+        if (typeof native.parseXlsxStructured === "function") {
+            const nativeOpts = options
+                ? { maxChars: options.maxChars, maxChunks: options.maxChunks }
+                : undefined;
+            return await native.parseXlsxStructured(filePath, nativeOpts);
+        }
+    }
+    catch {
+        // Native not available — fall through to JS fallback
+    }
+    // Fallback: use parseXlsx and wrap in structured format
+    const legacy = await parseXlsx(filePath);
+    const chunks = legacy.items.map((item, i) => ({
+        text: `[Sheet: ${item.name || `Sheet${item.number}`}]\n${item.text}`,
+        chunkType: "Table",
+        index: i,
+    }));
+    return {
+        text: legacy.items.map((item) => item.text).join("\n\n"),
+        chunks,
+        tables: [],
+        diagnostics: {
+            warnings: ["Native structured extraction unavailable; using JS fallback"],
+            tablesProcessed: 0,
+            tablesSplit: 0,
+        },
+        timingMs: 0,
+    };
+}

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@memvid/sdk",
-  "version": "2.0.156",
+  "version": "2.0.157",
   "description": "Single-file AI memory system for Node.js. Store, search, and query documents with built-in RAG.",
   "main": "./dist/index.js",
   "types": "./dist/index.d.ts",
@@ -41,11 +41,11 @@
     "node": ">=18"
   },
   "optionalDependencies": {
-    "@memvid/sdk-darwin-arm64": "2.0.156",
-    "@memvid/sdk-darwin-x64": "2.0.156",
-    "@memvid/sdk-linux-x64-gnu": "2.0.156",
-    "@memvid/sdk-linux-arm64-gnu": "2.0.156",
-    "@memvid/sdk-win32-x64-msvc": "2.0.156"
+    "@memvid/sdk-darwin-arm64": "2.0.157",
+    "@memvid/sdk-darwin-x64": "2.0.157",
+    "@memvid/sdk-linux-x64-gnu": "2.0.157",
+    "@memvid/sdk-linux-arm64-gnu": "2.0.157",
+    "@memvid/sdk-win32-x64-msvc": "2.0.157"
   },
   "peerDependencies": {
     "@langchain/core": ">=0.3.0",
@@ -89,7 +89,6 @@
     "langchain": ">=0.3.0",
     "llamaindex": ">=0.12.0",
     "officeparser": "^6.0.2",
-    "unpdf": "^1.4.0",
-    "xlsx": "^0.18.5"
+    "unpdf": "^1.4.0"
   }
 }