@memvid/sdk 2.0.156 → 2.0.157
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -17,7 +17,8 @@
|
|
|
17
17
|
import type { ParseOptions, ParseResult } from "./types";
|
|
18
18
|
export * from "./types";
|
|
19
19
|
export { parsePdf } from "./pdf";
|
|
20
|
-
export { parseXlsx } from "./xlsx";
|
|
20
|
+
export { parseXlsx, parseXlsxStructured } from "./xlsx";
|
|
21
|
+
export type { XlsxStructuredChunk, XlsxDetectedTable, XlsxStructuredOptions, XlsxStructuredResult, } from "./xlsx";
|
|
21
22
|
export { parsePptx } from "./pptx";
|
|
22
23
|
export { parseDocx } from "./docx";
|
|
23
24
|
/**
|
package/dist/documents/index.js
CHANGED
|
@@ -30,7 +30,7 @@ var __exportStar = (this && this.__exportStar) || function(m, exports) {
|
|
|
30
30
|
for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
|
|
31
31
|
};
|
|
32
32
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
33
|
-
exports.parseDocx = exports.parsePptx = exports.parseXlsx = exports.parsePdf = void 0;
|
|
33
|
+
exports.parseDocx = exports.parsePptx = exports.parseXlsxStructured = exports.parseXlsx = exports.parsePdf = void 0;
|
|
34
34
|
exports.parse = parse;
|
|
35
35
|
exports.isSupportedFormat = isSupportedFormat;
|
|
36
36
|
exports.getDocumentType = getDocumentType;
|
|
@@ -45,6 +45,7 @@ var pdf_2 = require("./pdf");
|
|
|
45
45
|
Object.defineProperty(exports, "parsePdf", { enumerable: true, get: function () { return pdf_2.parsePdf; } });
|
|
46
46
|
var xlsx_2 = require("./xlsx");
|
|
47
47
|
Object.defineProperty(exports, "parseXlsx", { enumerable: true, get: function () { return xlsx_2.parseXlsx; } });
|
|
48
|
+
Object.defineProperty(exports, "parseXlsxStructured", { enumerable: true, get: function () { return xlsx_2.parseXlsxStructured; } });
|
|
48
49
|
var pptx_2 = require("./pptx");
|
|
49
50
|
Object.defineProperty(exports, "parsePptx", { enumerable: true, get: function () { return pptx_2.parsePptx; } });
|
|
50
51
|
var docx_2 = require("./docx");
|
package/dist/documents/xlsx.d.ts
CHANGED
|
@@ -1,6 +1,82 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Excel Parser with error handling
|
|
3
|
-
*
|
|
3
|
+
*
|
|
4
|
+
* Two strategies:
|
|
5
|
+
* - `parseXlsx()` — JS-based via exceljs (legacy, per-sheet flat text)
|
|
6
|
+
* - `parseXlsxStructured()` — Rust-native via memvid-core (high accuracy,
|
|
7
|
+
* header-value pairing, merged cell support, semantic chunking)
|
|
4
8
|
*/
|
|
5
9
|
import type { ParseOptions, ParseResult } from "./types";
|
|
6
10
|
export declare function parseXlsx(filePath: string, options?: ParseOptions): Promise<ParseResult>;
|
|
11
|
+
/** A single structured chunk from the XLSX extraction pipeline. */
|
|
12
|
+
export interface XlsxStructuredChunk {
|
|
13
|
+
/** Chunk text with [Sheet:] prefix and Header: Value formatting */
|
|
14
|
+
text: string;
|
|
15
|
+
/** Chunk type: "Table", "TableContinuation", "Text", etc. */
|
|
16
|
+
chunkType: string;
|
|
17
|
+
/** 0-based chunk index */
|
|
18
|
+
index: number;
|
|
19
|
+
/** Table/element identifier */
|
|
20
|
+
elementId?: string;
|
|
21
|
+
/** Header context for continuation chunks */
|
|
22
|
+
context?: string;
|
|
23
|
+
}
|
|
24
|
+
/** A detected table with headers and metadata. */
|
|
25
|
+
export interface XlsxDetectedTable {
|
|
26
|
+
name: string;
|
|
27
|
+
sheetName: string;
|
|
28
|
+
headers: string[];
|
|
29
|
+
headerRow: number;
|
|
30
|
+
firstDataRow: number;
|
|
31
|
+
lastDataRow: number;
|
|
32
|
+
firstCol: number;
|
|
33
|
+
lastCol: number;
|
|
34
|
+
confidence: number;
|
|
35
|
+
columnTypes: string[];
|
|
36
|
+
}
|
|
37
|
+
/** Options for structured XLSX extraction. */
|
|
38
|
+
export interface XlsxStructuredOptions {
|
|
39
|
+
/** Target chunk size in characters (default: 1200) */
|
|
40
|
+
maxChars?: number;
|
|
41
|
+
/** Maximum chunks to produce (default: 500) */
|
|
42
|
+
maxChunks?: number;
|
|
43
|
+
}
|
|
44
|
+
/** Result of structured XLSX extraction. */
|
|
45
|
+
export interface XlsxStructuredResult {
|
|
46
|
+
/** Backward-compatible flat text */
|
|
47
|
+
text: string;
|
|
48
|
+
/** Semantic chunks with header-value pairing */
|
|
49
|
+
chunks: XlsxStructuredChunk[];
|
|
50
|
+
/** Detected tables with metadata */
|
|
51
|
+
tables: XlsxDetectedTable[];
|
|
52
|
+
/** Extraction diagnostics */
|
|
53
|
+
diagnostics: {
|
|
54
|
+
warnings: string[];
|
|
55
|
+
tablesProcessed: number;
|
|
56
|
+
tablesSplit: number;
|
|
57
|
+
};
|
|
58
|
+
/** Extraction time in milliseconds */
|
|
59
|
+
timingMs: number;
|
|
60
|
+
}
|
|
61
|
+
/**
|
|
62
|
+
* Parse an XLSX file using the Rust structured extraction pipeline.
|
|
63
|
+
*
|
|
64
|
+
* This provides much higher search accuracy than `parseXlsx()` by:
|
|
65
|
+
* - Detecting table boundaries and headers automatically
|
|
66
|
+
* - Formatting rows as `Header: Value | Header: Value` pairs
|
|
67
|
+
* - Propagating merged cells
|
|
68
|
+
* - Detecting number formats (dates, currency, percentages)
|
|
69
|
+
* - Never splitting rows across chunk boundaries
|
|
70
|
+
*
|
|
71
|
+
* @example
|
|
72
|
+
* ```typescript
|
|
73
|
+
* const result = await parseXlsxStructured("./proforma.xlsx");
|
|
74
|
+
* console.log(`${result.tables.length} tables, ${result.chunks.length} chunks`);
|
|
75
|
+
*
|
|
76
|
+
* // Ingest chunks into memvid for high-accuracy search
|
|
77
|
+
* for (const chunk of result.chunks) {
|
|
78
|
+
* await mem.put({ text: chunk.text, title: `XLSX chunk ${chunk.index}` });
|
|
79
|
+
* }
|
|
80
|
+
* ```
|
|
81
|
+
*/
|
|
82
|
+
export declare function parseXlsxStructured(filePath: string, options?: XlsxStructuredOptions): Promise<XlsxStructuredResult>;
|
package/dist/documents/xlsx.js
CHANGED
|
@@ -1,7 +1,11 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
/**
|
|
3
3
|
* Excel Parser with error handling
|
|
4
|
-
*
|
|
4
|
+
*
|
|
5
|
+
* Two strategies:
|
|
6
|
+
* - `parseXlsx()` — JS-based via exceljs (legacy, per-sheet flat text)
|
|
7
|
+
* - `parseXlsxStructured()` — Rust-native via memvid-core (high accuracy,
|
|
8
|
+
* header-value pairing, merged cell support, semantic chunking)
|
|
5
9
|
*/
|
|
6
10
|
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
7
11
|
if (k2 === undefined) k2 = k;
|
|
@@ -38,6 +42,7 @@ var __importStar = (this && this.__importStar) || (function () {
|
|
|
38
42
|
})();
|
|
39
43
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
40
44
|
exports.parseXlsx = parseXlsx;
|
|
45
|
+
exports.parseXlsxStructured = parseXlsxStructured;
|
|
41
46
|
/** Extract the display value from an exceljs cell */
|
|
42
47
|
function getCellValue(cellValue) {
|
|
43
48
|
if (cellValue == null || cellValue === "")
|
|
@@ -115,3 +120,57 @@ async function parseXlsx(filePath, options) {
|
|
|
115
120
|
`Ensure the file is a valid .xlsx/.xls file.`);
|
|
116
121
|
}
|
|
117
122
|
}
|
|
123
|
+
/**
 * Parse an XLSX file using the Rust structured extraction pipeline.
 *
 * This provides much higher search accuracy than `parseXlsx()` by:
 * - Detecting table boundaries and headers automatically
 * - Formatting rows as `Header: Value | Header: Value` pairs
 * - Propagating merged cells
 * - Detecting number formats (dates, currency, percentages)
 * - Never splitting rows across chunk boundaries
 *
 * If the native binding cannot be loaded, does not expose
 * `parseXlsxStructured`, or throws, the function falls back to `parseXlsx()`
 * and wraps each returned item in a single "Table" chunk. On the fallback
 * path `tables` is empty, `diagnostics.warnings` explains why the fallback
 * was used (including the native error message, if any), and `timingMs`
 * is the measured wall-clock time of the whole call.
 *
 * @param filePath Path to the workbook, passed unchanged to the native
 *   binding or to `parseXlsx()`.
 * @param options Optional limits. Both fields are forwarded to the native
 *   binding. On the fallback path only a positive integer `maxChunks` is
 *   honoured (the chunk list is truncated); `maxChars` has no effect there.
 * @returns The structured extraction result.
 * @throws Whatever `parseXlsx()` throws when the fallback itself fails.
 *
 * @example
 * ```typescript
 * const result = await parseXlsxStructured("./proforma.xlsx");
 * console.log(`${result.tables.length} tables, ${result.chunks.length} chunks`);
 *
 * // Ingest chunks into memvid for high-accuracy search
 * for (const chunk of result.chunks) {
 *   await mem.put({ text: chunk.text, title: `XLSX chunk ${chunk.index}` });
 * }
 * ```
 */
async function parseXlsxStructured(filePath, options) {
    const startedAt = Date.now();
    const warnings = ["Native structured extraction unavailable; using JS fallback"];
    try {
        // Try native Rust extraction first
        const native = require("../../index.js");
        if (typeof native.parseXlsxStructured === "function") {
            const nativeOpts = options
                ? { maxChars: options.maxChars, maxChunks: options.maxChunks }
                : undefined;
            return await native.parseXlsxStructured(filePath, nativeOpts);
        }
    }
    catch (err) {
        // Keep the best-effort fallback, but do not hide why the native path
        // failed: a load failure and a parse failure look identical otherwise.
        const reason = err instanceof Error ? err.message : String(err);
        warnings.push(`Native structured extraction failed: ${reason}`);
    }
    // Fallback: use parseXlsx and wrap in structured format
    const legacy = await parseXlsx(filePath);
    let chunks = legacy.items.map((item, i) => ({
        text: `[Sheet: ${item.name || `Sheet${item.number}`}]\n${item.text}`,
        chunkType: "Table",
        index: i,
    }));
    // Honour an explicit chunk limit; leave the list untouched when the
    // caller gave none so the default fallback output is unchanged.
    const maxChunks = options ? options.maxChunks : undefined;
    if (Number.isInteger(maxChunks) && maxChunks > 0 && chunks.length > maxChunks) {
        warnings.push(`Chunk list truncated from ${chunks.length} to ${maxChunks} (maxChunks)`);
        chunks = chunks.slice(0, maxChunks);
    }
    return {
        text: legacy.items.map((item) => item.text).join("\n\n"),
        chunks,
        tables: [],
        diagnostics: {
            warnings,
            tablesProcessed: 0,
            tablesSplit: 0,
        },
        timingMs: Date.now() - startedAt,
    };
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@memvid/sdk",
|
|
3
|
-
"version": "2.0.156",
|
|
3
|
+
"version": "2.0.157",
|
|
4
4
|
"description": "Single-file AI memory system for Node.js. Store, search, and query documents with built-in RAG.",
|
|
5
5
|
"main": "./dist/index.js",
|
|
6
6
|
"types": "./dist/index.d.ts",
|
|
@@ -41,11 +41,11 @@
|
|
|
41
41
|
"node": ">=18"
|
|
42
42
|
},
|
|
43
43
|
"optionalDependencies": {
|
|
44
|
-
"@memvid/sdk-darwin-arm64": "2.0.156",
|
|
45
|
-
"@memvid/sdk-darwin-x64": "2.0.156",
|
|
46
|
-
"@memvid/sdk-linux-x64-gnu": "2.0.156",
|
|
47
|
-
"@memvid/sdk-linux-arm64-gnu": "2.0.156",
|
|
48
|
-
"@memvid/sdk-win32-x64-msvc": "2.0.156"
|
|
44
|
+
"@memvid/sdk-darwin-arm64": "2.0.157",
|
|
45
|
+
"@memvid/sdk-darwin-x64": "2.0.157",
|
|
46
|
+
"@memvid/sdk-linux-x64-gnu": "2.0.157",
|
|
47
|
+
"@memvid/sdk-linux-arm64-gnu": "2.0.157",
|
|
48
|
+
"@memvid/sdk-win32-x64-msvc": "2.0.157"
|
|
49
49
|
},
|
|
50
50
|
"peerDependencies": {
|
|
51
51
|
"@langchain/core": ">=0.3.0",
|
|
@@ -89,7 +89,6 @@
|
|
|
89
89
|
"langchain": ">=0.3.0",
|
|
90
90
|
"llamaindex": ">=0.12.0",
|
|
91
91
|
"officeparser": "^6.0.2",
|
|
92
|
-
"unpdf": "^1.4.0",
|
|
93
|
-
"xlsx": "^0.18.5"
|
|
92
|
+
"unpdf": "^1.4.0"
|
|
94
93
|
}
|
|
95
94
|
}
|