@memvid/sdk 2.0.155 → 2.0.157

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,7 +17,8 @@
17
17
  import type { ParseOptions, ParseResult } from "./types";
18
18
  export * from "./types";
19
19
  export { parsePdf } from "./pdf";
20
- export { parseXlsx } from "./xlsx";
20
+ export { parseXlsx, parseXlsxStructured } from "./xlsx";
21
+ export type { XlsxStructuredChunk, XlsxDetectedTable, XlsxStructuredOptions, XlsxStructuredResult, } from "./xlsx";
21
22
  export { parsePptx } from "./pptx";
22
23
  export { parseDocx } from "./docx";
23
24
  /**
@@ -30,7 +30,7 @@ var __exportStar = (this && this.__exportStar) || function(m, exports) {
30
30
  for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
31
31
  };
32
32
  Object.defineProperty(exports, "__esModule", { value: true });
33
- exports.parseDocx = exports.parsePptx = exports.parseXlsx = exports.parsePdf = void 0;
33
+ exports.parseDocx = exports.parsePptx = exports.parseXlsxStructured = exports.parseXlsx = exports.parsePdf = void 0;
34
34
  exports.parse = parse;
35
35
  exports.isSupportedFormat = isSupportedFormat;
36
36
  exports.getDocumentType = getDocumentType;
@@ -45,6 +45,7 @@ var pdf_2 = require("./pdf");
45
45
  Object.defineProperty(exports, "parsePdf", { enumerable: true, get: function () { return pdf_2.parsePdf; } });
46
46
  var xlsx_2 = require("./xlsx");
47
47
  Object.defineProperty(exports, "parseXlsx", { enumerable: true, get: function () { return xlsx_2.parseXlsx; } });
48
+ Object.defineProperty(exports, "parseXlsxStructured", { enumerable: true, get: function () { return xlsx_2.parseXlsxStructured; } });
48
49
  var pptx_2 = require("./pptx");
49
50
  Object.defineProperty(exports, "parsePptx", { enumerable: true, get: function () { return pptx_2.parsePptx; } });
50
51
  var docx_2 = require("./docx");
@@ -1,6 +1,82 @@
1
1
  /**
2
2
  * Excel Parser with error handling
3
- * Uses exceljs - no Rust fallback available for XLSX
3
+ *
4
+ * Two strategies:
5
+ * - `parseXlsx()` — JS-based via exceljs (legacy, per-sheet flat text)
6
+ * - `parseXlsxStructured()` — Rust-native via memvid-core (high accuracy,
7
+ * header-value pairing, merged cell support, semantic chunking)
4
8
  */
5
9
  import type { ParseOptions, ParseResult } from "./types";
6
10
  export declare function parseXlsx(filePath: string, options?: ParseOptions): Promise<ParseResult>;
11
+ /** A single structured chunk from the XLSX extraction pipeline. */
12
+ export interface XlsxStructuredChunk {
13
+ /** Chunk text with [Sheet:] prefix and Header: Value formatting */
14
+ text: string;
15
+ /** Chunk type: "Table", "TableContinuation", "Text", etc. */
16
+ chunkType: string;
17
+ /** 0-based chunk index */
18
+ index: number;
19
+ /** Table/element identifier */
20
+ elementId?: string;
21
+ /** Header context for continuation chunks */
22
+ context?: string;
23
+ }
24
+ /** A detected table with headers and metadata. */
25
+ export interface XlsxDetectedTable {
26
+ name: string;
27
+ sheetName: string;
28
+ headers: string[];
29
+ headerRow: number;
30
+ firstDataRow: number;
31
+ lastDataRow: number;
32
+ firstCol: number;
33
+ lastCol: number;
34
+ confidence: number;
35
+ columnTypes: string[];
36
+ }
37
+ /** Options for structured XLSX extraction. */
38
+ export interface XlsxStructuredOptions {
39
+ /** Target chunk size in characters (default: 1200) */
40
+ maxChars?: number;
41
+ /** Maximum chunks to produce (default: 500) */
42
+ maxChunks?: number;
43
+ }
44
+ /** Result of structured XLSX extraction. */
45
+ export interface XlsxStructuredResult {
46
+ /** Backward-compatible flat text */
47
+ text: string;
48
+ /** Semantic chunks with header-value pairing */
49
+ chunks: XlsxStructuredChunk[];
50
+ /** Detected tables with metadata */
51
+ tables: XlsxDetectedTable[];
52
+ /** Extraction diagnostics */
53
+ diagnostics: {
54
+ warnings: string[];
55
+ tablesProcessed: number;
56
+ tablesSplit: number;
57
+ };
58
+ /** Extraction time in milliseconds */
59
+ timingMs: number;
60
+ }
61
+ /**
62
+ * Parse an XLSX file using the Rust structured extraction pipeline.
63
+ *
64
+ * This provides much higher search accuracy than `parseXlsx()` by:
65
+ * - Detecting table boundaries and headers automatically
66
+ * - Formatting rows as `Header: Value | Header: Value` pairs
67
+ * - Propagating merged cells
68
+ * - Detecting number formats (dates, currency, percentages)
69
+ * - Never splitting rows across chunk boundaries
70
+ *
+ * If the native binding cannot be loaded, does not expose
+ * `parseXlsxStructured`, or throws, the implementation falls back to
+ * `parseXlsx()`: it returns one "Table" chunk per parsed item (each
+ * prefixed with `[Sheet: ...]`), an empty `tables` array, zeroed table
+ * counters, and a diagnostics warning stating that the JS fallback was used.
+ * The fallback does not apply `maxChars` / `maxChunks`.
+ *
+ * @param filePath - Path of the workbook to parse
+ * @param options - Optional chunking limits; forwarded to the native extractor only
+ * @returns The structured extraction result (native or fallback)
+ *
71
+ * @example
72
+ * ```typescript
73
+ * const result = await parseXlsxStructured("./proforma.xlsx");
74
+ * console.log(`${result.tables.length} tables, ${result.chunks.length} chunks`);
75
+ *
76
+ * // Ingest chunks into memvid for high-accuracy search
77
+ * for (const chunk of result.chunks) {
78
+ * await mem.put({ text: chunk.text, title: `XLSX chunk ${chunk.index}` });
79
+ * }
80
+ * ```
81
+ */
82
+ export declare function parseXlsxStructured(filePath: string, options?: XlsxStructuredOptions): Promise<XlsxStructuredResult>;
@@ -1,7 +1,11 @@
1
1
  "use strict";
2
2
  /**
3
3
  * Excel Parser with error handling
4
- * Uses exceljs - no Rust fallback available for XLSX
4
+ *
5
+ * Two strategies:
6
+ * - `parseXlsx()` — JS-based via exceljs (legacy, per-sheet flat text)
7
+ * - `parseXlsxStructured()` — Rust-native via memvid-core (high accuracy,
8
+ * header-value pairing, merged cell support, semantic chunking)
5
9
  */
6
10
  var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
7
11
  if (k2 === undefined) k2 = k;
@@ -38,6 +42,7 @@ var __importStar = (this && this.__importStar) || (function () {
38
42
  })();
39
43
  Object.defineProperty(exports, "__esModule", { value: true });
40
44
  exports.parseXlsx = parseXlsx;
45
+ exports.parseXlsxStructured = parseXlsxStructured;
41
46
  /** Extract the display value from an exceljs cell */
42
47
  function getCellValue(cellValue) {
43
48
  if (cellValue == null || cellValue === "")
@@ -115,3 +120,57 @@ async function parseXlsx(filePath, options) {
115
120
  `Ensure the file is a valid .xlsx/.xls file.`);
116
121
  }
117
122
  }
123
/**
 * Parse an XLSX file using the Rust structured extraction pipeline.
 *
 * This provides much higher search accuracy than `parseXlsx()` by:
 * - Detecting table boundaries and headers automatically
 * - Formatting rows as `Header: Value | Header: Value` pairs
 * - Propagating merged cells
 * - Detecting number formats (dates, currency, percentages)
 * - Never splitting rows across chunk boundaries
 *
 * Fallback behaviour: when the native binding cannot be loaded, does not
 * expose `parseXlsxStructured`, or throws, the workbook is parsed with
 * `parseXlsx()` instead and wrapped in the structured result shape (one
 * "Table" chunk per parsed item, no detected tables). The fallback does not
 * apply `maxChars` / `maxChunks`. Any error raised by the native path is
 * reported in `diagnostics.warnings` rather than being silently discarded.
 *
 * @param filePath - Path of the workbook to parse
 * @param options - Optional `{ maxChars, maxChunks }`; forwarded to the native extractor only
 * @returns The structured extraction result (native or fallback)
 * @throws Any error raised by `parseXlsx()` when the fallback path also fails
 *
 * @example
 * ```typescript
 * const result = await parseXlsxStructured("./proforma.xlsx");
 * console.log(`${result.tables.length} tables, ${result.chunks.length} chunks`);
 *
 * // Ingest chunks into memvid for high-accuracy search
 * for (const chunk of result.chunks) {
 *   await mem.put({ text: chunk.text, title: `XLSX chunk ${chunk.index}` });
 * }
 * ```
 */
async function parseXlsxStructured(filePath, options) {
    const startedAt = Date.now();
    // Message of whatever went wrong on the native path, if anything did.
    let nativeError;
    try {
        // Try native Rust extraction first
        const native = require("../../index.js");
        if (typeof native.parseXlsxStructured === "function") {
            const nativeOpts = options
                ? { maxChars: options.maxChars, maxChunks: options.maxChunks }
                : undefined;
            return await native.parseXlsxStructured(filePath, nativeOpts);
        }
    }
    catch (err) {
        // Keep the best-effort fallback, but remember why the native path
        // failed so the caller can see it in the diagnostics.
        nativeError = err instanceof Error ? err.message : String(err);
    }
    // Fallback: use parseXlsx and wrap in structured format
    const legacy = await parseXlsx(filePath);
    const chunks = legacy.items.map((item, i) => ({
        text: `[Sheet: ${item.name || `Sheet${item.number}`}]\n${item.text}`,
        chunkType: "Table",
        index: i,
    }));
    const warnings = ["Native structured extraction unavailable; using JS fallback"];
    if (nativeError) {
        warnings.push(`Native structured extraction error: ${nativeError}`);
    }
    return {
        text: legacy.items.map((item) => item.text).join("\n\n"),
        chunks,
        tables: [],
        diagnostics: {
            warnings,
            tablesProcessed: 0,
            tablesSplit: 0,
        },
        // Wall-clock time for the whole call, including the failed native attempt.
        timingMs: Date.now() - startedAt,
    };
}
@@ -0,0 +1,250 @@
1
+ /**
2
+ * High-performance batch image ingestion for Memvid SDK (Node.js).
3
+ *
4
+ * Uses OCR to extract text from images, then ingests into a .mv2 memory file.
5
+ * docTR (via Python) provides the highest accuracy (85.3%); Tesseract.js is available as an optional dependency.
6
+ *
7
+ * @example
8
+ * ```typescript
9
+ * import { ImageIngestor } from '@memvid/sdk';
10
+ *
11
+ * // First install tesseract.js: npm install tesseract.js
12
+ * const ingestor = new ImageIngestor({
13
+ * ocrProvider: 'tesseract',
14
+ * workers: 4,
15
+ * });
16
+ *
17
+ * const result = await ingestor.ingestDirectory(
18
+ * './construction_drawings/',
19
+ * './project.mv2',
20
+ * {
21
+ * patterns: ['*.png', '*.jpg'],
22
+ * onProgress: (done, total) => console.log(`${done}/${total}`),
23
+ * }
24
+ * );
25
+ *
26
+ * console.log(`Processed ${result.totalImages} images`);
27
+ * await ingestor.terminate();
28
+ * ```
29
+ *
30
+ * For highest accuracy (85.3%), use docTR via Python:
31
+ * ```typescript
32
+ * // Requires: pip install python-doctr[torch]
33
+ * const ingestor = new ImageIngestor({ ocrProvider: 'doctr' });
34
+ * ```
35
+ */
36
+ import { OCRProviderType } from './ocr';
37
+ /**
38
+ * Options for image ingestion.
39
+ */
40
+ export interface ImageIngestOptions {
41
+ /** Minimum OCR confidence threshold (0-1). Default: 0.3 */
42
+ minConfidence?: number;
43
+ /** Use fallback OCR on low confidence. Default: true */
44
+ fallbackOcr?: boolean;
45
+ /** Images to process per batch. Default: 10 */
46
+ batchSize?: number;
47
+ /** Metadata to attach to all ingested frames */
48
+ metadata?: Record<string, unknown>;
49
+ /** Label for ingested frames. Default: 'image-extract' */
50
+ label?: string;
51
+ }
52
+ /**
53
+ * Options for directory ingestion.
54
+ */
55
+ export interface DirectoryIngestOptions extends ImageIngestOptions {
56
+ /** Glob patterns for files to include. Default: ['*.png', '*.jpg', '*.jpeg', '*.tiff'] */
57
+ patterns?: string[];
58
+ /** Search subdirectories. Default: true */
59
+ recursive?: boolean;
60
+ /** Progress callback */
61
+ onProgress?: (completed: number, total: number) => void;
62
+ }
63
+ /**
64
+ * Options for array-based ingestion.
65
+ */
66
+ export interface ImagesIngestOptions extends ImageIngestOptions {
67
+ /** Progress callback */
68
+ onProgress?: (completed: number, total: number) => void;
69
+ }
70
+ /**
71
+ * Result from batch image ingestion.
72
+ */
73
+ export interface ImageIngestResult {
74
+ /** Total images processed */
75
+ totalImages: number;
76
+ /** Successfully ingested images */
77
+ successful: number;
78
+ /** Failed images */
79
+ failed: number;
80
+ /** Total chunks/frames created */
81
+ totalChunks: number;
82
+ /** Processing time in seconds */
83
+ elapsedSeconds: number;
84
+ /** Output file size in bytes */
85
+ outputSizeBytes: number;
86
+ /** Errors encountered */
87
+ errors: Array<{
88
+ path: string;
89
+ error: string;
90
+ }>;
91
+ /** Images processed per second */
92
+ imagesPerSecond: number;
93
+ /** Output size in MB */
94
+ outputSizeMb: number;
95
+ }
96
+ /**
97
+ * Constructor options for ImageIngestor.
98
+ */
99
+ export interface ImageIngestorOptions {
100
+ /** OCR provider: 'tesseract', 'doctr', or 'easyocr'. Default: 'tesseract' */
101
+ ocrProvider?: OCRProviderType;
102
+ /** Number of parallel workers. Default: CPU count */
103
+ workers?: number;
104
+ /** Python path for doctr/easyocr providers */
105
+ pythonPath?: string;
106
+ }
107
+ /**
108
+ * High-performance batch image ingestor for Memvid.
109
+ *
110
+ * Combines OCR text extraction with parallel processing for fast, accurate
111
+ * ingestion of large image collections.
112
+ *
113
+ * OCR Accuracy (tested on construction drawings):
114
+ * - docTR (Python): 85.3% - BEST
115
+ * - EasyOCR (Python): 79.4%
116
+ * - Tesseract.js: ~50-60%
117
+ *
118
+ * @example
119
+ * ```typescript
120
+ * const ingestor = new ImageIngestor({
121
+ * ocrProvider: 'doctr',
122
+ * workers: 8,
123
+ * });
124
+ *
125
+ * const result = await ingestor.ingestDirectory('./drawings/', './output.mv2');
126
+ * console.log(`Processed ${result.totalImages} images in ${result.elapsedSeconds}s`);
127
+ *
128
+ * await ingestor.terminate();
129
+ * ```
130
+ */
131
+ export declare class ImageIngestor {
132
+ private _ocr;
133
+ private _fallbackOcr;
134
+ private _workers;
135
+ private _ocrType;
136
+ constructor(options?: ImageIngestorOptions);
137
+ /** Primary OCR provider name */
138
+ get ocrName(): string;
139
+ /** Number of parallel workers */
140
+ get workers(): number;
141
+ /**
142
+ * Ingest multiple images into a .mv2 file.
143
+ *
144
+ * @param paths - Array of image file paths
145
+ * @param outputPath - Output .mv2 file path
146
+ * @param options - Ingestion options
147
+ * @returns Promise resolving to ingestion result
148
+ *
149
+ * @example
150
+ * ```typescript
151
+ * const result = await ingestor.ingestImages(
152
+ * ['img1.png', 'img2.png'],
153
+ * './output.mv2',
154
+ * { onProgress: (d, t) => console.log(`${d}/${t}`) }
155
+ * );
156
+ * ```
157
+ */
158
+ ingestImages(paths: string[], outputPath: string, options?: ImagesIngestOptions): Promise<ImageIngestResult>;
159
+ /**
160
+ * Ingest all matching images from a directory.
161
+ *
162
+ * @param directory - Source directory path
163
+ * @param outputPath - Output .mv2 file path
164
+ * @param options - Directory ingestion options
165
+ * @returns Promise resolving to ingestion result
166
+ *
167
+ * @example
168
+ * ```typescript
169
+ * const result = await ingestor.ingestDirectory(
170
+ * './construction_drawings/',
171
+ * './project.mv2',
172
+ * {
173
+ * patterns: ['*.png', '*.jpg'],
174
+ * recursive: true,
175
+ * onProgress: (d, t) => console.log(`${d}/${t}`),
176
+ * }
177
+ * );
178
+ * ```
179
+ */
180
+ ingestDirectory(directory: string, outputPath: string, options?: DirectoryIngestOptions): Promise<ImageIngestResult>;
181
+ /**
182
+ * Extract text from a single image with fallback support.
183
+ */
184
+ private _extractText;
185
+ /**
186
+ * Clean up OCR worker resources.
187
+ *
188
+ * Call this when done using the ingestor to free memory.
189
+ */
190
+ terminate(): Promise<void>;
191
+ }
192
+ /**
193
+ * Convenience function for quick image ingestion.
194
+ *
195
+ * Creates an ImageIngestor, processes images, and cleans up automatically.
196
+ *
197
+ * @param paths - Array of image file paths
198
+ * @param outputPath - Output .mv2 file path
199
+ * @param options - Ingestion options
200
+ * @returns Promise resolving to ingestion result
201
+ *
202
+ * @example
203
+ * ```typescript
204
+ * import { ingestImages } from '@memvid/sdk';
205
+ *
206
+ * const result = await ingestImages(
207
+ * ['img1.png', 'img2.png'],
208
+ * './output.mv2',
209
+ * {
210
+ * ocrProvider: 'doctr',
211
+ * onProgress: (d, t) => console.log(`${d}/${t}`),
212
+ * }
213
+ * );
214
+ * ```
215
+ */
216
+ export declare function ingestImages(paths: string[], outputPath: string, options?: ImagesIngestOptions & {
217
+ ocrProvider?: OCRProviderType;
218
+ workers?: number;
219
+ pythonPath?: string;
220
+ }): Promise<ImageIngestResult>;
221
+ /**
222
+ * Convenience function for quick directory ingestion.
223
+ *
224
+ * Creates an ImageIngestor, processes directory, and cleans up automatically.
225
+ *
226
+ * @param directory - Source directory path
227
+ * @param outputPath - Output .mv2 file path
228
+ * @param options - Directory ingestion options
229
+ * @returns Promise resolving to ingestion result
230
+ *
231
+ * @example
232
+ * ```typescript
233
+ * import { ingestDirectory } from '@memvid/sdk';
234
+ *
235
+ * const result = await ingestDirectory(
236
+ * './construction_drawings/',
237
+ * './project.mv2',
238
+ * {
239
+ * ocrProvider: 'doctr',
240
+ * patterns: ['*.png', '*.jpg'],
241
+ * onProgress: (d, t) => console.log(`${d}/${t}`),
242
+ * }
243
+ * );
244
+ * ```
245
+ */
246
+ export declare function ingestDirectory(directory: string, outputPath: string, options?: DirectoryIngestOptions & {
247
+ ocrProvider?: OCRProviderType;
248
+ workers?: number;
249
+ pythonPath?: string;
250
+ }): Promise<ImageIngestResult>;