@memvid/sdk 2.0.154 → 2.0.156
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/embeddings.d.ts +14 -8
- package/dist/embeddings.js +25 -21
- package/dist/image-ingest.d.ts +250 -0
- package/dist/image-ingest.js +411 -0
- package/dist/index.d.ts +23 -2
- package/dist/index.js +197 -4
- package/dist/ocr.d.ts +302 -0
- package/dist/ocr.js +778 -0
- package/dist/types.d.ts +36 -0
- package/package.json +7 -6
package/dist/embeddings.d.ts
CHANGED
@@ -98,9 +98,13 @@ export interface OpenAIEmbeddingsConfig {
     apiKey?: string;
     /** Model to use. Default: 'text-embedding-3-small' */
     model?: string;
-    /** Max number of texts to embed in a single API call. Default: 2048 */
+    /** Max number of texts to embed in a single API call. Default: 2048 (OpenAI hard limit) */
     batchSize?: number;
-    /** Max tokens per
+    /** Max tokens per individual input text (OpenAI limit is 8191). Default: 8000 (with safety margin).
+     * Note: this is a per-INPUT limit, not a per-batch total. Each input in a batch
+     * must individually be under this limit, but the batch total can be much higher. */
+    maxTokensPerInput?: number;
+    /** @deprecated Use maxTokensPerInput instead */
     maxTokensPerBatch?: number;
 }
 /**
@@ -120,7 +124,7 @@ export declare class OpenAIEmbeddings implements EmbeddingProvider {
     private readonly _apiKey;
     private readonly _model;
     private readonly _batchSize;
-    private readonly
+    private readonly _maxTokensPerInput;
     constructor(config?: OpenAIEmbeddingsConfig);
     get dimension(): number;
     get modelName(): string;
@@ -132,15 +136,17 @@ export declare class OpenAIEmbeddings implements EmbeddingProvider {
      */
     private estimateTokens;
     /**
-     * Truncate text to fit within token limit.
+     * Truncate a single input text to fit within the per-input token limit.
      * Preserves beginning of text as it typically contains the most important context.
-     * Uses conservative
+     * Uses conservative 2.0 chars/token for truncation to handle data-heavy content
+     * (spreadsheets, numbers, cell refs) where tokenization is denser than prose.
      */
     private truncateToTokenLimit;
     /**
-     * Split texts into batches respecting
-     *
-     *
+     * Split texts into batches respecting:
+     * 1. Per-input token limit (8,192 for text-embedding-3-small) — truncate oversized inputs
+     * 2. Per-request token limit (300K for most tiers) — split into multiple requests
+     * 3. Per-request input count (2,048 max inputs per request)
      */
     private createTokenAwareBatches;
     embedDocuments(texts: string[]): Promise<number[][]>;
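The practical effect of this interface change is that maxTokensPerBatch survives only as a deprecated alias for the new per-input option. A minimal usage sketch, assuming the constructor and embedDocuments behave as declared above; the import path and API key wiring are illustrative, not taken from the package:

// Sketch only: OpenAIEmbeddings lives in dist/embeddings; the root import is an assumption.
import { OpenAIEmbeddings } from '@memvid/sdk';

const embeddings = new OpenAIEmbeddings({
    apiKey: process.env.OPENAI_API_KEY,
    model: 'text-embedding-3-small',
    batchSize: 2048,           // max inputs per request (OpenAI hard limit)
    maxTokensPerInput: 8000,   // per-INPUT cap; replaces the deprecated maxTokensPerBatch
});

// Oversized inputs are truncated individually; the batch total can be much larger.
const vectors = await embeddings.embedDocuments(['first chunk', 'second chunk']);
console.log(vectors.length, vectors[0].length); // 2, and 1536 dims for text-embedding-3-small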
package/dist/embeddings.js
CHANGED
@@ -115,8 +115,9 @@ class OpenAIEmbeddings {
         }
         this._model = config.model || 'text-embedding-3-small';
         this._batchSize = config.batchSize || 2048;
-        // OpenAI's limit is 8,192 tokens
-
+        // OpenAI's limit is 8,192 tokens PER INPUT (not per batch).
+        // You can send up to 2048 inputs per request regardless of total tokens.
+        this._maxTokensPerInput = config.maxTokensPerInput || config.maxTokensPerBatch || 8000;
     }
     get dimension() {
         return exports.MODEL_DIMENSIONS[this._model] || 1536;
@@ -136,48 +137,51 @@ class OpenAIEmbeddings {
         return Math.ceil(text.length / 3.5);
     }
     /**
-     * Truncate text to fit within token limit.
+     * Truncate a single input text to fit within the per-input token limit.
      * Preserves beginning of text as it typically contains the most important context.
-     * Uses conservative
+     * Uses conservative 2.0 chars/token for truncation to handle data-heavy content
+     * (spreadsheets, numbers, cell refs) where tokenization is denser than prose.
      */
     truncateToTokenLimit(text) {
-
-
-        //
-        const maxChars = Math.floor(
+        const maxTokens = Math.min(this._maxTokensPerInput, 7800);
+        // Use 2.0 chars/token for safe truncation — handles spreadsheet data,
+        // numbers, and special characters which tokenize at ~2.2 chars/token
+        const maxChars = Math.floor(maxTokens * 2.0);
         if (text.length <= maxChars) {
             return text;
         }
         return text.slice(0, maxChars);
     }
     /**
-     * Split texts into batches respecting
-     *
-     *
+     * Split texts into batches respecting:
+     * 1. Per-input token limit (8,192 for text-embedding-3-small) — truncate oversized inputs
+     * 2. Per-request token limit (300K for most tiers) — split into multiple requests
+     * 3. Per-request input count (2,048 max inputs per request)
      */
     createTokenAwareBatches(texts) {
+        // OpenAI enforces a per-request total token limit (typically 300K).
+        // Use 250K as a safe default to account for token estimation inaccuracy.
+        const MAX_TOKENS_PER_REQUEST = 250000;
         const batches = [];
         let currentBatch = [];
-        let
+        let currentBatchTokens = 0;
         for (let text of texts) {
+            // Truncate individual texts that exceed the per-input token limit
             let textTokens = this.estimateTokens(text);
-
-            if (textTokens > this._maxTokensPerBatch) {
+            if (textTokens > this._maxTokensPerInput) {
                 text = this.truncateToTokenLimit(text);
                 textTokens = this.estimateTokens(text);
            }
-            const
+            const wouldExceedRequestTokens = (currentBatchTokens + textTokens) > MAX_TOKENS_PER_REQUEST;
             const wouldExceedCount = currentBatch.length >= this._batchSize;
-            if (
-
-                batches.push(currentBatch);
-            }
+            if ((wouldExceedRequestTokens || wouldExceedCount) && currentBatch.length > 0) {
+                batches.push(currentBatch);
                 currentBatch = [text];
-
+                currentBatchTokens = textTokens;
             }
             else {
                 currentBatch.push(text);
-
+                currentBatchTokens += textTokens;
             }
         }
         if (currentBatch.length > 0) {
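The three limits in createTokenAwareBatches interact in a fixed order: truncate any single input over the per-input cap, then close the current batch whenever adding the next text would exceed either the request token budget or the input count. A standalone sketch of those rules, mirroring the token-estimation and truncation arithmetic shown in the diff above (this is illustrative, not the SDK's internal code):

// estimateTokens and the 2.0 chars/token truncation follow the implementation above.
const estimateTokens = (text: string): number => Math.ceil(text.length / 3.5);
const truncate = (text: string, maxTokens = 7800): string =>
    text.slice(0, Math.floor(maxTokens * 2.0));

const MAX_TOKENS_PER_REQUEST = 250_000; // safety margin under OpenAI's ~300K per-request cap
const MAX_INPUTS_PER_REQUEST = 2_048;

function batchTexts(texts: string[]): string[][] {
    const batches: string[][] = [];
    let current: string[] = [];
    let currentTokens = 0;
    for (let text of texts) {
        if (estimateTokens(text) > 8_000) text = truncate(text); // per-input limit
        const tokens = estimateTokens(text);
        const overTokens = currentTokens + tokens > MAX_TOKENS_PER_REQUEST;
        const overCount = current.length >= MAX_INPUTS_PER_REQUEST;
        if ((overTokens || overCount) && current.length > 0) {
            batches.push(current);
            current = [text];
            currentTokens = tokens;
        } else {
            current.push(text);
            currentTokens += tokens;
        }
    }
    if (current.length > 0) batches.push(current);
    return batches;
}

// Example: a 20,000-character spreadsheet dump estimates to ~5,715 tokens and stays intact;
// a 40,000-character one estimates to ~11,429 tokens and is cut to 15,600 characters.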
package/dist/image-ingest.d.ts
ADDED
@@ -0,0 +1,250 @@
/**
 * High-performance batch image ingestion for Memvid SDK (Node.js).
 *
 * Uses OCR to extract text from images, then ingests into a .mv2 memory file.
 * docTR (via Python) provides highest accuracy (85.3%), Tesseract.js is available as optional dependency.
 *
 * @example
 * ```typescript
 * import { ImageIngestor } from '@memvid/sdk';
 *
 * // First install tesseract.js: npm install tesseract.js
 * const ingestor = new ImageIngestor({
 *   ocrProvider: 'tesseract',
 *   workers: 4,
 * });
 *
 * const result = await ingestor.ingestDirectory(
 *   './construction_drawings/',
 *   './project.mv2',
 *   {
 *     patterns: ['*.png', '*.jpg'],
 *     onProgress: (done, total) => console.log(`${done}/${total}`),
 *   }
 * );
 *
 * console.log(`Processed ${result.totalImages} images`);
 * await ingestor.terminate();
 * ```
 *
 * For highest accuracy (85.3%), use docTR via Python:
 * ```typescript
 * // Requires: pip install python-doctr[torch]
 * const ingestor = new ImageIngestor({ ocrProvider: 'doctr' });
 * ```
 */
import { OCRProviderType } from './ocr';
/**
 * Options for image ingestion.
 */
export interface ImageIngestOptions {
    /** Minimum OCR confidence threshold (0-1). Default: 0.3 */
    minConfidence?: number;
    /** Use fallback OCR on low confidence. Default: true */
    fallbackOcr?: boolean;
    /** Images to process per batch. Default: 10 */
    batchSize?: number;
    /** Metadata to attach to all ingested frames */
    metadata?: Record<string, unknown>;
    /** Label for ingested frames. Default: 'image-extract' */
    label?: string;
}
/**
 * Options for directory ingestion.
 */
export interface DirectoryIngestOptions extends ImageIngestOptions {
    /** Glob patterns for files to include. Default: ['*.png', '*.jpg', '*.jpeg', '*.tiff'] */
    patterns?: string[];
    /** Search subdirectories. Default: true */
    recursive?: boolean;
    /** Progress callback */
    onProgress?: (completed: number, total: number) => void;
}
/**
 * Options for array-based ingestion.
 */
export interface ImagesIngestOptions extends ImageIngestOptions {
    /** Progress callback */
    onProgress?: (completed: number, total: number) => void;
}
/**
 * Result from batch image ingestion.
 */
export interface ImageIngestResult {
    /** Total images processed */
    totalImages: number;
    /** Successfully ingested images */
    successful: number;
    /** Failed images */
    failed: number;
    /** Total chunks/frames created */
    totalChunks: number;
    /** Processing time in seconds */
    elapsedSeconds: number;
    /** Output file size in bytes */
    outputSizeBytes: number;
    /** Errors encountered */
    errors: Array<{
        path: string;
        error: string;
    }>;
    /** Images processed per second */
    imagesPerSecond: number;
    /** Output size in MB */
    outputSizeMb: number;
}
/**
 * Constructor options for ImageIngestor.
 */
export interface ImageIngestorOptions {
    /** OCR provider: 'tesseract', 'doctr', or 'easyocr'. Default: 'tesseract' */
    ocrProvider?: OCRProviderType;
    /** Number of parallel workers. Default: CPU count */
    workers?: number;
    /** Python path for doctr/easyocr providers */
    pythonPath?: string;
}
/**
 * High-performance batch image ingestor for Memvid.
 *
 * Combines OCR text extraction with parallel processing for fast, accurate
 * ingestion of large image collections.
 *
 * OCR Accuracy (tested on construction drawings):
 * - docTR (Python): 85.3% - BEST
 * - EasyOCR (Python): 79.4%
 * - Tesseract.js: ~50-60%
 *
 * @example
 * ```typescript
 * const ingestor = new ImageIngestor({
 *   ocrProvider: 'doctr',
 *   workers: 8,
 * });
 *
 * const result = await ingestor.ingestDirectory('./drawings/', './output.mv2');
 * console.log(`Processed ${result.totalImages} images in ${result.elapsedSeconds}s`);
 *
 * await ingestor.terminate();
 * ```
 */
export declare class ImageIngestor {
    private _ocr;
    private _fallbackOcr;
    private _workers;
    private _ocrType;
    constructor(options?: ImageIngestorOptions);
    /** Primary OCR provider name */
    get ocrName(): string;
    /** Number of parallel workers */
    get workers(): number;
    /**
     * Ingest multiple images into a .mv2 file.
     *
     * @param paths - Array of image file paths
     * @param outputPath - Output .mv2 file path
     * @param options - Ingestion options
     * @returns Promise resolving to ingestion result
     *
     * @example
     * ```typescript
     * const result = await ingestor.ingestImages(
     *   ['img1.png', 'img2.png'],
     *   './output.mv2',
     *   { onProgress: (d, t) => console.log(`${d}/${t}`) }
     * );
     * ```
     */
    ingestImages(paths: string[], outputPath: string, options?: ImagesIngestOptions): Promise<ImageIngestResult>;
    /**
     * Ingest all matching images from a directory.
     *
     * @param directory - Source directory path
     * @param outputPath - Output .mv2 file path
     * @param options - Directory ingestion options
     * @returns Promise resolving to ingestion result
     *
     * @example
     * ```typescript
     * const result = await ingestor.ingestDirectory(
     *   './construction_drawings/',
     *   './project.mv2',
     *   {
     *     patterns: ['*.png', '*.jpg'],
     *     recursive: true,
     *     onProgress: (d, t) => console.log(`${d}/${t}`),
     *   }
     * );
     * ```
     */
    ingestDirectory(directory: string, outputPath: string, options?: DirectoryIngestOptions): Promise<ImageIngestResult>;
    /**
     * Extract text from a single image with fallback support.
     */
    private _extractText;
    /**
     * Clean up OCR worker resources.
     *
     * Call this when done using the ingestor to free memory.
     */
    terminate(): Promise<void>;
}
/**
 * Convenience function for quick image ingestion.
 *
 * Creates an ImageIngestor, processes images, and cleans up automatically.
 *
 * @param paths - Array of image file paths
 * @param outputPath - Output .mv2 file path
 * @param options - Ingestion options
 * @returns Promise resolving to ingestion result
 *
 * @example
 * ```typescript
 * import { ingestImages } from 'memvid-sdk';
 *
 * const result = await ingestImages(
 *   ['img1.png', 'img2.png'],
 *   './output.mv2',
 *   {
 *     ocrProvider: 'doctr',
 *     onProgress: (d, t) => console.log(`${d}/${t}`),
 *   }
 * );
 * ```
 */
export declare function ingestImages(paths: string[], outputPath: string, options?: ImagesIngestOptions & {
    ocrProvider?: OCRProviderType;
    workers?: number;
    pythonPath?: string;
}): Promise<ImageIngestResult>;
/**
 * Convenience function for quick directory ingestion.
 *
 * Creates an ImageIngestor, processes directory, and cleans up automatically.
 *
 * @param directory - Source directory path
 * @param outputPath - Output .mv2 file path
 * @param options - Directory ingestion options
 * @returns Promise resolving to ingestion result
 *
 * @example
 * ```typescript
 * import { ingestDirectory } from 'memvid-sdk';
 *
 * const result = await ingestDirectory(
 *   './construction_drawings/',
 *   './project.mv2',
 *   {
 *     ocrProvider: 'doctr',
 *     patterns: ['*.png', '*.jpg'],
 *     onProgress: (d, t) => console.log(`${d}/${t}`),
 *   }
 * );
 * ```
 */
export declare function ingestDirectory(directory: string, outputPath: string, options?: DirectoryIngestOptions & {
    ocrProvider?: OCRProviderType;
    workers?: number;
    pythonPath?: string;
}): Promise<ImageIngestResult>;
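The declarations above also spell out the result object every ingest call resolves to, which the built-in examples do not consume. A short consumer sketch, assuming the root import shown in this file's own JSDoc ('memvid-sdk') and purely illustrative paths:

// Sketch: reading the ImageIngestResult fields declared above.
// Adjust the import specifier to '@memvid/sdk' if that is how the root package re-exports it.
import { ingestDirectory } from 'memvid-sdk';

const result = await ingestDirectory('./scans/', './scans.mv2', {
    ocrProvider: 'tesseract',   // 'doctr' / 'easyocr' require a Python install
    patterns: ['*.png', '*.jpg'],
    recursive: true,
    minConfidence: 0.3,
    onProgress: (done, total) => console.log(`${done}/${total}`),
});

console.log(`${result.successful}/${result.totalImages} images, ${result.totalChunks} chunks`);
console.log(`${result.imagesPerSecond.toFixed(1)} img/s, ${result.outputSizeMb.toFixed(1)} MB output`);
for (const { path, error } of result.errors) {
    console.warn(`failed: ${path}: ${error}`);
}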