@memvid/sdk 2.0.147 → 2.0.149

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/analytics.js CHANGED
@@ -134,6 +134,8 @@ function trackEvent(event) {
134
134
  // Silently ignore errors
135
135
  });
136
136
  }, FLUSH_INTERVAL_MS);
137
+ // Don't prevent process exit while waiting for flush
138
+ flushTimer.unref();
137
139
  }
138
140
  // Force flush if queue is getting large
139
141
  if (eventQueue.length >= MAX_BATCH_SIZE) {
@@ -209,9 +211,5 @@ async function flush() {
209
211
  }
210
212
  await flushEvents();
211
213
  }
212
- // Flush on process exit
213
- if (typeof process !== "undefined") {
214
- process.on("beforeExit", () => {
215
- flush().catch(() => { });
216
- });
217
- }
214
+ // Note: No beforeExit handler needed - timer.unref() allows process to exit
215
+ // Events may be lost on quick CLI exits, but that's acceptable for UX
@@ -0,0 +1,6 @@
1
+ /**
2
+ * Word Document Parser with error handling
3
+ * Uses officeparser v6 - no Rust fallback available for DOCX
4
+ */
5
+ import type { ParseOptions, ParseResult } from "./types";
6
+ export declare function parseDocx(filePath: string, _options?: ParseOptions): Promise<ParseResult>;
@@ -0,0 +1,71 @@
1
+ "use strict";
2
+ /**
3
+ * Word Document Parser with error handling
4
+ * Uses officeparser v6 - no Rust fallback available for DOCX
5
+ */
6
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
7
+ if (k2 === undefined) k2 = k;
8
+ var desc = Object.getOwnPropertyDescriptor(m, k);
9
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
10
+ desc = { enumerable: true, get: function() { return m[k]; } };
11
+ }
12
+ Object.defineProperty(o, k2, desc);
13
+ }) : (function(o, m, k, k2) {
14
+ if (k2 === undefined) k2 = k;
15
+ o[k2] = m[k];
16
+ }));
17
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
18
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
19
+ }) : function(o, v) {
20
+ o["default"] = v;
21
+ });
22
+ var __importStar = (this && this.__importStar) || (function () {
23
+ var ownKeys = function(o) {
24
+ ownKeys = Object.getOwnPropertyNames || function (o) {
25
+ var ar = [];
26
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
27
+ return ar;
28
+ };
29
+ return ownKeys(o);
30
+ };
31
+ return function (mod) {
32
+ if (mod && mod.__esModule) return mod;
33
+ var result = {};
34
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
35
+ __setModuleDefault(result, mod);
36
+ return result;
37
+ };
38
+ })();
39
+ Object.defineProperty(exports, "__esModule", { value: true });
40
+ exports.parseDocx = parseDocx;
41
+ async function parseDocx(filePath, _options) {
42
+ const filename = filePath.split("/").pop() || filePath;
43
+ try {
44
+ const officeParser = await Promise.resolve().then(() => __importStar(require("officeparser")));
45
+ const ast = await officeParser.parseOffice(filePath);
46
+ // Get full text content
47
+ const content = ast.toText();
48
+ const items = [];
49
+ if (content && content.trim().length > 0) {
50
+ items.push({
51
+ number: 1,
52
+ text: content,
53
+ });
54
+ }
55
+ else {
56
+ console.warn(`[memvid] No text content found in ${filename}`);
57
+ }
58
+ return {
59
+ type: "docx",
60
+ filename,
61
+ totalItems: 1,
62
+ items,
63
+ };
64
+ }
65
+ catch (err) {
66
+ // For DOCX, we don't have a Rust fallback, so throw a descriptive error
67
+ const message = err instanceof Error ? err.message : String(err);
68
+ throw new Error(`Failed to parse Word file "${filename}": ${message}. ` +
69
+ `Ensure the file is a valid .docx/.doc file.`);
70
+ }
71
+ }
@@ -0,0 +1,44 @@
1
+ /**
2
+ * Document Parsing Module
3
+ *
4
+ * Auto-detects file type and parses PDF, XLSX, PPTX, DOCX documents.
5
+ * Includes fallback support for PDF using Rust core.
6
+ *
7
+ * @example
8
+ * ```typescript
9
+ * import { parse } from "@memvid/sdk/documents";
10
+ *
11
+ * const result = await parse("./report.pdf");
12
+ * if (result) {
13
+ * console.log(result.items.length); // number of pages
14
+ * }
15
+ * ```
16
+ */
17
+ import type { ParseOptions, ParseResult } from "./types";
18
+ export * from "./types";
19
+ export { parsePdf } from "./pdf";
20
+ export { parseXlsx } from "./xlsx";
21
+ export { parsePptx } from "./pptx";
22
+ export { parseDocx } from "./docx";
23
+ /**
24
+ * Parse a document file with automatic format detection.
25
+ *
26
+ * Supported formats:
27
+ * - PDF (.pdf) - per-page extraction (with Rust fallback)
28
+ * - Excel (.xlsx, .xls) - per-sheet extraction
29
+ * - PowerPoint (.pptx, .ppt) - per-slide extraction
30
+ * - Word (.docx, .doc) - full document extraction
31
+ *
32
+ * @param filePath - Path to the document file
33
+ * @param options - Parsing options
34
+ * @returns ParseResult with items, or null for PDF if JS parser failed (signals Rust fallback)
35
+ */
36
+ export declare function parse(filePath: string, options?: ParseOptions): Promise<ParseResult | null>;
37
+ /**
38
+ * Check if a file extension is supported for document parsing.
39
+ */
40
+ export declare function isSupportedFormat(filePath: string): boolean;
41
+ /**
42
+ * Get the document type from a file path.
43
+ */
44
+ export declare function getDocumentType(filePath: string): string | null;
@@ -0,0 +1,114 @@
1
+ "use strict";
2
+ /**
3
+ * Document Parsing Module
4
+ *
5
+ * Auto-detects file type and parses PDF, XLSX, PPTX, DOCX documents.
6
+ * Includes fallback support for PDF using Rust core.
7
+ *
8
+ * @example
9
+ * ```typescript
10
+ * import { parse } from "@memvid/sdk/documents";
11
+ *
12
+ * const result = await parse("./report.pdf");
13
+ * if (result) {
14
+ * console.log(result.items.length); // number of pages
15
+ * }
16
+ * ```
17
+ */
18
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
19
+ if (k2 === undefined) k2 = k;
20
+ var desc = Object.getOwnPropertyDescriptor(m, k);
21
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
22
+ desc = { enumerable: true, get: function() { return m[k]; } };
23
+ }
24
+ Object.defineProperty(o, k2, desc);
25
+ }) : (function(o, m, k, k2) {
26
+ if (k2 === undefined) k2 = k;
27
+ o[k2] = m[k];
28
+ }));
29
+ var __exportStar = (this && this.__exportStar) || function(m, exports) {
30
+ for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
31
+ };
32
+ Object.defineProperty(exports, "__esModule", { value: true });
33
+ exports.parseDocx = exports.parsePptx = exports.parseXlsx = exports.parsePdf = void 0;
34
+ exports.parse = parse;
35
+ exports.isSupportedFormat = isSupportedFormat;
36
+ exports.getDocumentType = getDocumentType;
37
+ const fs_1 = require("fs");
38
+ const path_1 = require("path");
39
+ const pdf_1 = require("./pdf");
40
+ const xlsx_1 = require("./xlsx");
41
+ const pptx_1 = require("./pptx");
42
+ const docx_1 = require("./docx");
43
+ __exportStar(require("./types"), exports);
44
+ var pdf_2 = require("./pdf");
45
+ Object.defineProperty(exports, "parsePdf", { enumerable: true, get: function () { return pdf_2.parsePdf; } });
46
+ var xlsx_2 = require("./xlsx");
47
+ Object.defineProperty(exports, "parseXlsx", { enumerable: true, get: function () { return xlsx_2.parseXlsx; } });
48
+ var pptx_2 = require("./pptx");
49
+ Object.defineProperty(exports, "parsePptx", { enumerable: true, get: function () { return pptx_2.parsePptx; } });
50
+ var docx_2 = require("./docx");
51
+ Object.defineProperty(exports, "parseDocx", { enumerable: true, get: function () { return docx_2.parseDocx; } });
52
/**
 * Parse a document, choosing the parser from the file extension.
 *
 * The extension is compared case-insensitively:
 * - .pdf          -> parsePdf (may resolve to null, signalling the Rust fallback)
 * - .xlsx, .xls   -> parseXlsx
 * - .pptx, .ppt   -> parsePptx
 * - .docx, .doc   -> parseDocx
 *
 * @param filePath - Path to the document file
 * @param options - Parsing options, forwarded unchanged to the chosen parser
 * @returns Whatever the chosen parser resolves to
 * @throws Error if the path does not exist or the extension is not supported
 */
async function parse(filePath, options) {
    const fileExists = (0, fs_1.existsSync)(filePath);
    if (!fileExists) {
        throw new Error(`File not found: ${filePath}`);
    }
    const extension = (0, path_1.extname)(filePath).toLowerCase();
    if (extension === ".pdf") {
        // PDF parser returns null if unpdf fails, signaling to use Rust fallback
        return (0, pdf_1.parsePdf)(filePath, options);
    }
    if (extension === ".xlsx" || extension === ".xls") {
        return (0, xlsx_1.parseXlsx)(filePath, options);
    }
    if (extension === ".pptx" || extension === ".ppt") {
        return (0, pptx_1.parsePptx)(filePath, options);
    }
    if (extension === ".docx" || extension === ".doc") {
        return (0, docx_1.parseDocx)(filePath, options);
    }
    throw new Error(`Unsupported file format: ${extension}. Supported: .pdf, .xlsx, .xls, .pptx, .ppt, .docx, .doc`);
}
87
/**
 * Report whether the path's extension (case-insensitive) is one the
 * document parsers accept.
 *
 * @param filePath - Path or file name to inspect
 * @returns true for .pdf, .xlsx, .xls, .pptx, .ppt, .docx and .doc
 */
function isSupportedFormat(filePath) {
    const supportedExtensions = new Set([
        ".pdf",
        ".xlsx",
        ".xls",
        ".pptx",
        ".ppt",
        ".docx",
        ".doc",
    ]);
    const extension = (0, path_1.extname)(filePath).toLowerCase();
    return supportedExtensions.has(extension);
}
94
/**
 * Map a path's extension (case-insensitive) to its document type.
 *
 * Legacy extensions share the type of their modern counterpart
 * (.xls -> "xlsx", .ppt -> "pptx", .doc -> "docx").
 *
 * @param filePath - Path or file name to inspect
 * @returns "pdf", "xlsx", "pptx" or "docx", or null for any other extension
 */
function getDocumentType(filePath) {
    const typeByExtension = new Map([
        [".pdf", "pdf"],
        [".xlsx", "xlsx"],
        [".xls", "xlsx"],
        [".pptx", "pptx"],
        [".ppt", "pptx"],
        [".docx", "docx"],
        [".doc", "docx"],
    ]);
    const extension = (0, path_1.extname)(filePath).toLowerCase();
    const documentType = typeByExtension.get(extension);
    return documentType === undefined ? null : documentType;
}
@@ -0,0 +1,7 @@
1
+ /**
2
+ * PDF Parser with fallback support
3
+ * Primary: unpdf (Mozilla pdf.js)
4
+ * Fallback: Returns null to signal using Rust core's pdf_extract
5
+ */
6
+ import type { ParseOptions, ParseResult } from "./types";
7
+ export declare function parsePdf(filePath: string, options?: ParseOptions): Promise<ParseResult | null>;
@@ -0,0 +1,75 @@
1
+ "use strict";
2
+ /**
3
+ * PDF Parser with fallback support
4
+ * Primary: unpdf (Mozilla pdf.js)
5
+ * Fallback: Returns null to signal using Rust core's pdf_extract
6
+ */
7
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
8
+ if (k2 === undefined) k2 = k;
9
+ var desc = Object.getOwnPropertyDescriptor(m, k);
10
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
11
+ desc = { enumerable: true, get: function() { return m[k]; } };
12
+ }
13
+ Object.defineProperty(o, k2, desc);
14
+ }) : (function(o, m, k, k2) {
15
+ if (k2 === undefined) k2 = k;
16
+ o[k2] = m[k];
17
+ }));
18
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
19
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
20
+ }) : function(o, v) {
21
+ o["default"] = v;
22
+ });
23
+ var __importStar = (this && this.__importStar) || (function () {
24
+ var ownKeys = function(o) {
25
+ ownKeys = Object.getOwnPropertyNames || function (o) {
26
+ var ar = [];
27
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
28
+ return ar;
29
+ };
30
+ return ownKeys(o);
31
+ };
32
+ return function (mod) {
33
+ if (mod && mod.__esModule) return mod;
34
+ var result = {};
35
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
36
+ __setModuleDefault(result, mod);
37
+ return result;
38
+ };
39
+ })();
40
+ Object.defineProperty(exports, "__esModule", { value: true });
41
+ exports.parsePdf = parsePdf;
42
+ const fs_1 = require("fs");
43
+ async function parsePdf(filePath, options) {
44
+ const filename = filePath.split("/").pop() || filePath;
45
+ try {
46
+ const { extractText } = await Promise.resolve().then(() => __importStar(require("unpdf")));
47
+ const buffer = (0, fs_1.readFileSync)(filePath);
48
+ const { text, totalPages } = await extractText(new Uint8Array(buffer), {
49
+ mergePages: false,
50
+ });
51
+ const textArray = text;
52
+ const maxItems = options?.maxItems || textArray.length;
53
+ const items = [];
54
+ for (let i = 0; i < Math.min(textArray.length, maxItems); i++) {
55
+ const pageText = textArray[i];
56
+ if (pageText && pageText.trim().length > 0) {
57
+ items.push({
58
+ number: i + 1,
59
+ text: pageText,
60
+ });
61
+ }
62
+ }
63
+ return {
64
+ type: "pdf",
65
+ filename,
66
+ totalItems: totalPages,
67
+ items,
68
+ };
69
+ }
70
+ catch (err) {
71
+ // Log warning and return null to signal fallback to Rust core
72
+ console.warn(`[memvid] unpdf failed for ${filename}, using Rust fallback: ${err instanceof Error ? err.message : String(err)}`);
73
+ return null;
74
+ }
75
+ }
@@ -0,0 +1,6 @@
1
+ /**
2
+ * PowerPoint Parser with error handling
3
+ * Uses officeparser v6 - no Rust fallback available for PPTX
4
+ */
5
+ import type { ParseOptions, ParseResult } from "./types";
6
+ export declare function parsePptx(filePath: string, options?: ParseOptions): Promise<ParseResult>;
@@ -0,0 +1,89 @@
1
+ "use strict";
2
+ /**
3
+ * PowerPoint Parser with error handling
4
+ * Uses officeparser v6 - no Rust fallback available for PPTX
5
+ */
6
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
7
+ if (k2 === undefined) k2 = k;
8
+ var desc = Object.getOwnPropertyDescriptor(m, k);
9
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
10
+ desc = { enumerable: true, get: function() { return m[k]; } };
11
+ }
12
+ Object.defineProperty(o, k2, desc);
13
+ }) : (function(o, m, k, k2) {
14
+ if (k2 === undefined) k2 = k;
15
+ o[k2] = m[k];
16
+ }));
17
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
18
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
19
+ }) : function(o, v) {
20
+ o["default"] = v;
21
+ });
22
+ var __importStar = (this && this.__importStar) || (function () {
23
+ var ownKeys = function(o) {
24
+ ownKeys = Object.getOwnPropertyNames || function (o) {
25
+ var ar = [];
26
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
27
+ return ar;
28
+ };
29
+ return ownKeys(o);
30
+ };
31
+ return function (mod) {
32
+ if (mod && mod.__esModule) return mod;
33
+ var result = {};
34
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
35
+ __setModuleDefault(result, mod);
36
+ return result;
37
+ };
38
+ })();
39
+ Object.defineProperty(exports, "__esModule", { value: true });
40
+ exports.parsePptx = parsePptx;
41
+ async function parsePptx(filePath, options) {
42
+ const filename = filePath.split("/").pop() || filePath;
43
+ try {
44
+ const officeParser = await Promise.resolve().then(() => __importStar(require("officeparser")));
45
+ const ast = await officeParser.parseOffice(filePath, {
46
+ ignoreNotes: false, // Include speaker notes
47
+ });
48
+ const items = [];
49
+ // Get full text content
50
+ const content = ast.toText();
51
+ if (!content || content.trim().length === 0) {
52
+ console.warn(`[memvid] No text content found in ${filename}`);
53
+ return {
54
+ type: "pptx",
55
+ filename,
56
+ totalItems: 0,
57
+ items: [],
58
+ };
59
+ }
60
+ // Split content into slides (officeparser separates with double newlines)
61
+ const slides = content.split(/\n\s*\n/).filter((s) => s.trim());
62
+ const maxItems = options?.maxItems || slides.length;
63
+ for (let i = 0; i < Math.min(slides.length, maxItems); i++) {
64
+ const slideText = slides[i].trim();
65
+ if (slideText.length > 0) {
66
+ // Try to extract title (first line)
67
+ const lines = slideText.split("\n");
68
+ const title = lines[0]?.trim();
69
+ items.push({
70
+ number: i + 1,
71
+ title: title || undefined,
72
+ text: slideText,
73
+ });
74
+ }
75
+ }
76
+ return {
77
+ type: "pptx",
78
+ filename,
79
+ totalItems: items.length,
80
+ items,
81
+ };
82
+ }
83
+ catch (err) {
84
+ // For PPTX, we don't have a Rust fallback, so throw a descriptive error
85
+ const message = err instanceof Error ? err.message : String(err);
86
+ throw new Error(`Failed to parse PowerPoint file "${filename}": ${message}. ` +
87
+ `Ensure the file is a valid .pptx/.ppt file.`);
88
+ }
89
+ }
@@ -0,0 +1,61 @@
1
+ /**
2
+ * Document Parsing Types
3
+ */
4
+ export interface ParseOptions {
5
+ /** Enable OCR for scanned pages (default: false) */
6
+ ocr?: boolean;
7
+ /** Extract tables as structured arrays (default: true) */
8
+ extractTables?: boolean;
9
+ /** Limit pages/sheets/slides to process */
10
+ maxItems?: number;
11
+ }
12
+ export interface DocumentItem {
13
+ /** 1-based item number (page/slide) or 0-based (sheet) */
14
+ number: number;
15
+ /** Sheet name for XLSX */
16
+ name?: string;
17
+ /** Extracted text content */
18
+ text: string;
19
+ /** Slide title for PPTX */
20
+ title?: string;
21
+ }
22
+ export interface ParseResult {
23
+ /** Detected document type */
24
+ type: "pdf" | "xlsx" | "pptx" | "docx";
25
+ /** Original filename */
26
+ filename: string;
27
+ /** Total items (pages/sheets/slides) */
28
+ totalItems: number;
29
+ /** Parsed items */
30
+ items: DocumentItem[];
31
+ }
32
+ export interface PutFileOptions {
33
+ /** Label for all frames (default: "document") */
34
+ label?: string;
35
+ /** Additional metadata to attach */
36
+ metadata?: Record<string, unknown>;
37
+ }
38
+ export interface PutFileResult {
39
+ /** Number of frames added */
40
+ framesAdded: number;
41
+ /** Document type detected */
42
+ type: string;
43
+ /** Original filename */
44
+ filename: string;
45
+ }
46
+ export interface PutFilesOptions extends PutFileOptions {
47
+ /** File extensions to process (default: all supported) */
48
+ extensions?: string[];
49
+ }
50
+ export interface PutFilesResult {
51
+ /** Number of files processed */
52
+ filesProcessed: number;
53
+ /** Total frames added */
54
+ framesAdded: number;
55
+ /** Results per file */
56
+ files: Array<{
57
+ filename: string;
58
+ framesAdded: number;
59
+ type: string;
60
+ }>;
61
+ }
@@ -0,0 +1,5 @@
1
+ "use strict";
2
+ /**
3
+ * Document Parsing Types
4
+ */
5
+ Object.defineProperty(exports, "__esModule", { value: true });
@@ -0,0 +1,6 @@
1
+ /**
2
+ * Excel Parser with error handling
3
+ * Uses exceljs - no Rust fallback available for XLSX
4
+ */
5
+ import type { ParseOptions, ParseResult } from "./types";
6
+ export declare function parseXlsx(filePath: string, options?: ParseOptions): Promise<ParseResult>;
@@ -0,0 +1,117 @@
1
+ "use strict";
2
+ /**
3
+ * Excel Parser with error handling
4
+ * Uses exceljs - no Rust fallback available for XLSX
5
+ */
6
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
7
+ if (k2 === undefined) k2 = k;
8
+ var desc = Object.getOwnPropertyDescriptor(m, k);
9
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
10
+ desc = { enumerable: true, get: function() { return m[k]; } };
11
+ }
12
+ Object.defineProperty(o, k2, desc);
13
+ }) : (function(o, m, k, k2) {
14
+ if (k2 === undefined) k2 = k;
15
+ o[k2] = m[k];
16
+ }));
17
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
18
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
19
+ }) : function(o, v) {
20
+ o["default"] = v;
21
+ });
22
+ var __importStar = (this && this.__importStar) || (function () {
23
+ var ownKeys = function(o) {
24
+ ownKeys = Object.getOwnPropertyNames || function (o) {
25
+ var ar = [];
26
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
27
+ return ar;
28
+ };
29
+ return ownKeys(o);
30
+ };
31
+ return function (mod) {
32
+ if (mod && mod.__esModule) return mod;
33
+ var result = {};
34
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
35
+ __setModuleDefault(result, mod);
36
+ return result;
37
+ };
38
+ })();
39
+ Object.defineProperty(exports, "__esModule", { value: true });
40
+ exports.parseXlsx = parseXlsx;
41
/**
 * Convert an exceljs cell value into its display text.
 *
 * Handled shapes:
 * - null / undefined / ""  -> null (cell is skipped)
 * - primitives             -> String(value)
 * - Date                   -> "YYYY-MM-DD" (UTC); an invalid Date -> null
 * - { formula, result }    -> the result, rendered by these same rules
 * - { richText: [...] }    -> the concatenated text runs
 * - { error }              -> the error code, with exactly one leading "#"
 * - { text, hyperlink }    -> the link's display text
 * - any other object       -> null, to avoid emitting "[object Object]"
 *
 * @param cellValue - Raw `cell.value` as returned by exceljs
 * @returns The display string, or null when the cell has nothing to show
 */
function getCellValue(cellValue) {
    if (cellValue == null || cellValue === "")
        return null;
    if (typeof cellValue !== "object")
        return String(cellValue);
    if (cellValue instanceof Date) {
        // toISOString() throws a RangeError on an invalid Date, which would
        // abort parsing of the whole workbook; skip such cells instead.
        return Number.isNaN(cellValue.getTime())
            ? null
            : cellValue.toISOString().split("T")[0];
    }
    const obj = cellValue;
    // Formula cell: recurse so Date / error / rich-text results are rendered
    // like plain cells instead of via String(object).
    if ("result" in obj && obj.result != null) {
        return getCellValue(obj.result);
    }
    // Rich text - array of text runs
    if ("richText" in obj && Array.isArray(obj.richText)) {
        return obj.richText.map((r) => r?.text || "").join("");
    }
    // Error value: the code may already start with "#"; avoid doubling it
    if ("error" in obj) {
        const code = String(obj.error);
        return code.startsWith("#") ? code : `#${code}`;
    }
    // Hyperlink cell: keep the visible text (which may itself be rich text)
    if ("hyperlink" in obj && obj.text != null) {
        return getCellValue(obj.text);
    }
    // Unknown object - skip to avoid [object Object]
    return null;
}
69
+ async function parseXlsx(filePath, options) {
70
+ const filename = filePath.split("/").pop() || filePath;
71
+ try {
72
+ const ExcelJS = await Promise.resolve().then(() => __importStar(require("exceljs")));
73
+ const workbook = new ExcelJS.default.Workbook();
74
+ await workbook.xlsx.readFile(filePath);
75
+ const maxItems = options?.maxItems || workbook.worksheets.length;
76
+ const items = [];
77
+ for (let i = 0; i < Math.min(workbook.worksheets.length, maxItems); i++) {
78
+ const worksheet = workbook.worksheets[i];
79
+ const textLines = [];
80
+ worksheet.eachRow((row) => {
81
+ const values = [];
82
+ row.eachCell((cell) => {
83
+ const value = getCellValue(cell.value);
84
+ if (value != null) {
85
+ values.push(value);
86
+ }
87
+ });
88
+ if (values.length > 0) {
89
+ textLines.push(values.join(" | "));
90
+ }
91
+ });
92
+ if (textLines.length > 0) {
93
+ items.push({
94
+ number: i,
95
+ name: worksheet.name,
96
+ text: textLines.join("\n"),
97
+ });
98
+ }
99
+ }
100
+ // If no sheets with content, return empty but valid result
101
+ if (items.length === 0) {
102
+ console.warn(`[memvid] No content found in ${filename}`);
103
+ }
104
+ return {
105
+ type: "xlsx",
106
+ filename,
107
+ totalItems: workbook.worksheets.length,
108
+ items,
109
+ };
110
+ }
111
+ catch (err) {
112
+ // For XLSX, we don't have a Rust fallback, so throw a descriptive error
113
+ const message = err instanceof Error ? err.message : String(err);
114
+ throw new Error(`Failed to parse Excel file "${filename}": ${message}. ` +
115
+ `Ensure the file is a valid .xlsx/.xls file.`);
116
+ }
117
+ }
package/dist/index.js CHANGED
@@ -900,6 +900,7 @@ function normaliseAskOptions(opts) {
900
900
  minRelevancy: opts.minRelevancy,
901
901
  maxK: opts.maxK,
902
902
  adaptiveStrategy: opts.adaptiveStrategy,
903
+ showChunks: opts.showChunks,
903
904
  };
904
905
  if (payload.k == null &&
905
906
  payload.mode == null &&
@@ -917,7 +918,8 @@ function normaliseAskOptions(opts) {
917
918
  payload.adaptive == null &&
918
919
  payload.minRelevancy == null &&
919
920
  payload.maxK == null &&
920
- payload.adaptiveStrategy == null) {
921
+ payload.adaptiveStrategy == null &&
922
+ payload.showChunks == null) {
921
923
  return undefined;
922
924
  }
923
925
  return payload;
@@ -1690,6 +1692,13 @@ class MemvidImpl {
1690
1692
  async seal() {
1691
1693
  return wrapAsync(() => this.core.seal());
1692
1694
  }
1695
+ /**
1696
+ * Rebuild the time index. Call this after using putMany() if you need
1697
+ * time-based queries (like ask() with temporal context).
1698
+ */
1699
+ async rebuildTimeIndex() {
1700
+ await this.doctor({ rebuildTimeIndex: true, quiet: true });
1701
+ }
1693
1702
  async enableLex() {
1694
1703
  return wrapAsync(() => this.core.enableLex());
1695
1704
  }
@@ -1839,6 +1848,182 @@ class MemvidImpl {
1839
1848
  async addMemoryCards(cards) {
1840
1849
  return wrapAsync(() => this.core.addMemoryCards(cards));
1841
1850
  }
1851
+ // ─────────────────────────────────────────────────────────────────────────
1852
+ // Document Parsing & Ingestion
1853
+ // ─────────────────────────────────────────────────────────────────────────
1854
+ /**
1855
+ * Ingest a document file with automatic parsing and per-page/sheet/slide storage.
1856
+ *
1857
+ * Supported formats:
1858
+ * - PDF (.pdf) - stores each page as a separate frame
1859
+ * - Excel (.xlsx, .xls) - stores each sheet as a separate frame
1860
+ * - PowerPoint (.pptx, .ppt) - stores each slide as a separate frame
1861
+ * - Word (.docx, .doc) - stores the document as a single frame
1862
+ *
1863
+ * @example
1864
+ * ```typescript
1865
+ * const mv = await create('memory.mv2');
1866
+ * await mv.putFile('./report.pdf');
1867
+ * await mv.putFile('./data.xlsx');
1868
+ * await mv.putFile('./slides.pptx');
1869
+ * ```
1870
+ *
1871
+ * @param filePath - Path to the document file
1872
+ * @param options - Optional label and metadata
1873
+ * @returns Result with framesAdded count
1874
+ */
1875
+ async putFile(filePath, options) {
1876
+ const { parse, getDocumentType } = await Promise.resolve().then(() => __importStar(require("./documents/index.js")));
1877
+ const { basename } = await Promise.resolve().then(() => __importStar(require("path")));
1878
+ const filename = basename(filePath);
1879
+ const docType = getDocumentType(filePath);
1880
+ const label = options?.label || "document";
1881
+ const baseMetadata = options?.metadata || {};
1882
+ const embedder = options?.embedder;
1883
+ // Try JS-based parsing first
1884
+ const result = await parse(filePath);
1885
+ // PDF fallback: if parse() returns null, use Rust core's built-in pdf_extract
1886
+ if (result === null && docType === "pdf") {
1887
+ console.log(`[memvid] Using Rust pdf_extract for ${filename}`);
1888
+ // For PDF fallback with external embedder, we can't easily extract text first
1889
+ // so we fall back to the internal embedding model
1890
+ await this.put({
1891
+ file: filePath,
1892
+ label,
1893
+ metadata: {
1894
+ ...baseMetadata,
1895
+ doc_name: filename,
1896
+ doc_type: "pdf",
1897
+ fallback: "rust_pdf_extract",
1898
+ },
1899
+ enableEmbedding: embedder ? true : options?.enableEmbedding,
1900
+ embeddingModel: embedder ? undefined : options?.embeddingModel,
1901
+ vectorCompression: options?.vectorCompression,
1902
+ autoTag: options?.autoTag,
1903
+ extractDates: options?.extractDates,
1904
+ });
1905
+ (0, analytics_1.trackCommand)(this.filename, "putFile", true);
1906
+ return { framesAdded: 1, type: "pdf", filename };
1907
+ }
1908
+ // If result is null and not PDF, something went wrong
1909
+ if (result === null) {
1910
+ throw new Error(`Failed to parse document: ${filename}`);
1911
+ }
1912
+ // Build items for batch processing with putMany (6x faster than individual put())
1913
+ const items = [];
1914
+ for (const item of result.items) {
1915
+ let title;
1916
+ let metadata;
1917
+ if (result.type === "pdf") {
1918
+ title = `${result.filename} [Page ${item.number}]`;
1919
+ metadata = {
1920
+ ...baseMetadata,
1921
+ doc_name: result.filename,
1922
+ doc_type: result.type,
1923
+ page_number: item.number,
1924
+ total_pages: result.totalItems,
1925
+ };
1926
+ }
1927
+ else if (result.type === "xlsx") {
1928
+ title = `${result.filename} [Sheet: ${item.name}]`;
1929
+ metadata = {
1930
+ ...baseMetadata,
1931
+ doc_name: result.filename,
1932
+ doc_type: result.type,
1933
+ sheet_name: item.name,
1934
+ sheet_index: item.number,
1935
+ total_sheets: result.totalItems,
1936
+ };
1937
+ }
1938
+ else if (result.type === "pptx") {
1939
+ title = `${result.filename} [Slide ${item.number}]`;
1940
+ metadata = {
1941
+ ...baseMetadata,
1942
+ doc_name: result.filename,
1943
+ doc_type: result.type,
1944
+ slide_number: item.number,
1945
+ slide_title: item.title,
1946
+ total_slides: result.totalItems,
1947
+ };
1948
+ }
1949
+ else {
1950
+ // docx
1951
+ title = result.filename;
1952
+ metadata = {
1953
+ ...baseMetadata,
1954
+ doc_name: result.filename,
1955
+ doc_type: result.type,
1956
+ };
1957
+ }
1958
+ items.push({
1959
+ title,
1960
+ labels: label ? [label] : undefined,
1961
+ text: item.text,
1962
+ metadata,
1963
+ });
1964
+ }
1965
+ // Use putMany for fast batch ingestion
1966
+ // Note: Call rebuildTimeIndex() after seal() if using ask() with temporal queries
1967
+ await this.putMany(items, {
1968
+ embedder,
1969
+ enableEmbedding: embedder ? undefined : options?.enableEmbedding,
1970
+ embeddingModel: embedder ? undefined : options?.embeddingModel,
1971
+ });
1972
+ (0, analytics_1.trackCommand)(this.filename, "putFile", true);
1973
+ return { framesAdded: items.length, type: result.type, filename: result.filename };
1974
+ }
1975
+ /**
1976
+ * Ingest multiple document files from a directory.
1977
+ *
1978
+ * @example
1979
+ * ```typescript
1980
+ * const mv = await create('memory.mv2');
1981
+ * const result = await mv.putFiles('./documents/');
1982
+ * console.log(`Processed ${result.filesProcessed} files, ${result.framesAdded} frames`);
1983
+ * ```
1984
+ *
1985
+ * @param dirPath - Path to directory containing documents
1986
+ * @param options - Optional label, extensions filter, and metadata
1987
+ * @returns Result with filesProcessed and framesAdded counts
1988
+ */
1989
+ async putFiles(dirPath, options) {
1990
+ const { readdirSync } = await Promise.resolve().then(() => __importStar(require("fs")));
1991
+ const { join } = await Promise.resolve().then(() => __importStar(require("path")));
1992
+ const extensions = options?.extensions || [
1993
+ ".pdf",
1994
+ ".xlsx",
1995
+ ".xls",
1996
+ ".pptx",
1997
+ ".ppt",
1998
+ ".docx",
1999
+ ".doc",
2000
+ ];
2001
+ const files = readdirSync(dirPath).filter((f) => extensions.some((ext) => f.toLowerCase().endsWith(ext)));
2002
+ let filesProcessed = 0;
2003
+ let framesAdded = 0;
2004
+ const results = [];
2005
+ for (const file of files) {
2006
+ const result = await this.putFile(join(dirPath, file), {
2007
+ label: options?.label,
2008
+ metadata: options?.metadata,
2009
+ enableEmbedding: options?.enableEmbedding,
2010
+ embeddingModel: options?.embeddingModel,
2011
+ embedder: options?.embedder,
2012
+ vectorCompression: options?.vectorCompression,
2013
+ autoTag: options?.autoTag,
2014
+ extractDates: options?.extractDates,
2015
+ });
2016
+ filesProcessed++;
2017
+ framesAdded += result.framesAdded;
2018
+ results.push({
2019
+ filename: result.filename,
2020
+ framesAdded: result.framesAdded,
2021
+ type: result.type,
2022
+ });
2023
+ }
2024
+ (0, analytics_1.trackCommand)(this.filename, "putFiles", true);
2025
+ return { filesProcessed, framesAdded, files: results };
2026
+ }
1842
2027
  }
1843
2028
  const useImpl = (async (kind, filename, apiKeyOrOptions, options) => {
1844
2029
  const { apiKey, options: resolvedOptions } = splitUseArgs(apiKeyOrOptions, options);
@@ -1887,6 +2072,7 @@ useImpl.doctor = async (path, options) => {
1887
2072
  rebuildVecIndex: options.rebuildVecIndex,
1888
2073
  vacuum: options.vacuum,
1889
2074
  dryRun: options.dryRun,
2075
+ quiet: options.quiet,
1890
2076
  };
1891
2077
  return wrapAsync(() => addon.doctorMemvid(path, nativeOptions));
1892
2078
  };
package/dist/types.d.ts CHANGED
@@ -103,6 +103,8 @@ export interface AskInput {
103
103
  minRelevancy?: number;
104
104
  maxK?: number;
105
105
  adaptiveStrategy?: "relative" | "absolute" | "cliff" | "elbow" | "combined";
106
+ /** Include full chunk content in response (default: false) */
107
+ showChunks?: boolean;
106
108
  }
107
109
  export interface TimelineInput {
108
110
  limit?: number;
@@ -245,6 +247,7 @@ export interface NativeAskOptions {
245
247
  minRelevancy?: number;
246
248
  maxK?: number;
247
249
  adaptiveStrategy?: string;
250
+ showChunks?: boolean;
248
251
  }
249
252
  export interface NativeTimelineOptions {
250
253
  limit?: number;
@@ -693,6 +696,7 @@ export interface UseDoctorOptions {
693
696
  rebuildVecIndex?: boolean;
694
697
  vacuum?: boolean;
695
698
  dryRun?: boolean;
699
+ quiet?: boolean;
696
700
  }
697
701
  export interface LockOptions {
698
702
  /** Output file path (default: input.mv2e) */
@@ -819,6 +823,7 @@ export type NativeDoctorOptions = {
819
823
  rebuildVecIndex?: boolean;
820
824
  vacuum?: boolean;
821
825
  dryRun?: boolean;
826
+ quiet?: boolean;
822
827
  };
823
828
  export type NativeCapsuleOptions = {
824
829
  output?: string;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@memvid/sdk",
3
- "version": "2.0.147",
3
+ "version": "2.0.149",
4
4
  "description": "Single-file AI memory system for Node.js. Store, search, and query documents with built-in RAG.",
5
5
  "main": "./dist/index.js",
6
6
  "types": "./dist/index.d.ts",
@@ -41,10 +41,10 @@
41
41
  "node": ">=18"
42
42
  },
43
43
  "optionalDependencies": {
44
- "@memvid/sdk-darwin-arm64": "2.0.147",
45
- "@memvid/sdk-darwin-x64": "2.0.147",
46
- "@memvid/sdk-linux-x64-gnu": "2.0.147",
47
- "@memvid/sdk-win32-x64-msvc": "2.0.147"
44
+ "@memvid/sdk-darwin-arm64": "2.0.149",
45
+ "@memvid/sdk-darwin-x64": "2.0.149",
46
+ "@memvid/sdk-linux-x64-gnu": "2.0.149",
47
+ "@memvid/sdk-win32-x64-msvc": "2.0.149"
48
48
  },
49
49
  "peerDependencies": {
50
50
  "@langchain/core": ">=0.3.0",
@@ -76,6 +76,9 @@
76
76
  "typescript": "^5.4.0"
77
77
  },
78
78
  "dependencies": {
79
+ "unpdf": "^1.4.0",
80
+ "exceljs": "^4.4.0",
81
+ "officeparser": "^6.0.2",
79
82
  "@ai-sdk/openai": "^1.0.0",
80
83
  "@google/generative-ai": "^0.24.0",
81
84
  "@langchain/langgraph": ">=0.2.0",