@memvid/sdk 2.0.154 → 2.0.156

@@ -98,9 +98,13 @@ export interface OpenAIEmbeddingsConfig {
  apiKey?: string;
  /** Model to use. Default: 'text-embedding-3-small' */
  model?: string;
- /** Max number of texts to embed in a single API call. Default: 2048 */
+ /** Max number of texts to embed in a single API call. Default: 2048 (OpenAI hard limit) */
  batchSize?: number;
- /** Max tokens per batch (OpenAI limit is 8191). Default: 8000 (with safety margin) */
+ /** Max tokens per individual input text (OpenAI limit is 8191). Default: 8000 (with safety margin).
+ * Note: this is a per-INPUT limit, not a per-batch total. Each input in a batch
+ * must individually be under this limit, but the batch total can be much higher. */
+ maxTokensPerInput?: number;
+ /** @deprecated Use maxTokensPerInput instead */
  maxTokensPerBatch?: number;
  }
  /**
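
For consumers of this change, a minimal sketch of the renamed option. It assumes `OpenAIEmbeddings` is exported from the package root; the diff itself only shows the type declarations:

```typescript
import { OpenAIEmbeddings } from '@memvid/sdk';

// maxTokensPerInput replaces maxTokensPerBatch; the old name still works
// via the deprecated fallback shown in the constructor hunk below.
const embeddings = new OpenAIEmbeddings({
  apiKey: process.env.OPENAI_API_KEY,
  model: 'text-embedding-3-small',
  batchSize: 2048,         // max inputs per request (OpenAI hard limit)
  maxTokensPerInput: 8000, // each input is truncated to stay under this
});
```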
@@ -120,7 +124,7 @@ export declare class OpenAIEmbeddings implements EmbeddingProvider {
  private readonly _apiKey;
  private readonly _model;
  private readonly _batchSize;
- private readonly _maxTokensPerBatch;
+ private readonly _maxTokensPerInput;
  constructor(config?: OpenAIEmbeddingsConfig);
  get dimension(): number;
  get modelName(): string;
@@ -132,15 +136,17 @@ export declare class OpenAIEmbeddings implements EmbeddingProvider {
  */
  private estimateTokens;
  /**
- * Truncate text to fit within token limit.
+ * Truncate a single input text to fit within the per-input token limit.
  * Preserves beginning of text as it typically contains the most important context.
- * Uses conservative 3.0 chars/token for truncation to handle mixed content safely.
+ * Uses conservative 2.0 chars/token for truncation to handle data-heavy content
+ * (spreadsheets, numbers, cell refs) where tokenization is denser than prose.
  */
  private truncateToTokenLimit;
  /**
- * Split texts into batches respecting both document count and token limits.
- * This prevents OpenAI API errors when total tokens exceed 8,192.
- * Automatically truncates individual texts that exceed the token limit.
+ * Split texts into batches respecting:
+ * 1. Per-input token limit (8,192 for text-embedding-3-small): truncate oversized inputs
+ * 2. Per-request token limit (300K for most tiers): split into multiple requests
+ * 3. Per-request input count (2,048 max inputs per request)
  */
  private createTokenAwareBatches;
  embedDocuments(texts: string[]): Promise<number[][]>;
@@ -115,8 +115,9 @@ class OpenAIEmbeddings {
  }
  this._model = config.model || 'text-embedding-3-small';
  this._batchSize = config.batchSize || 2048;
- // OpenAI's limit is 8,192 tokens. Use 8,000 as default for max throughput.
- this._maxTokensPerBatch = config.maxTokensPerBatch || 8000;
+ // OpenAI's limit is 8,192 tokens PER INPUT (not per batch).
+ // Up to 2048 inputs per request, subject to the per-request total token cap handled in createTokenAwareBatches.
+ this._maxTokensPerInput = config.maxTokensPerInput || config.maxTokensPerBatch || 8000;
  }
  get dimension() {
  return exports.MODEL_DIMENSIONS[this._model] || 1536;
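
Because the constructor falls through the `||` chain, configs written against 2.0.154 keep working. A hypothetical legacy call, for illustration only (reusing the import from the earlier sketch):

```typescript
// Legacy config using the deprecated name still resolves in 2.0.156:
const legacy = new OpenAIEmbeddings({ maxTokensPerBatch: 6000 });
// _maxTokensPerInput = undefined || 6000 || 8000, i.e. 6000
```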
@@ -136,48 +137,51 @@ class OpenAIEmbeddings {
  return Math.ceil(text.length / 3.5);
  }
  /**
- * Truncate text to fit within token limit.
+ * Truncate a single input text to fit within the per-input token limit.
  * Preserves beginning of text as it typically contains the most important context.
- * Uses conservative 3.0 chars/token for truncation to handle mixed content safely.
+ * Uses conservative 2.0 chars/token for truncation to handle data-heavy content
+ * (spreadsheets, numbers, cell refs) where tokenization is denser than prose.
  */
  truncateToTokenLimit(text) {
- // Use conservative limit for truncation: 7800 tokens max for single text
- const maxTokensForSingleText = Math.min(this._maxTokensPerBatch, 7800);
- // Use 3.0 chars/token for safe truncation
- const maxChars = Math.floor(maxTokensForSingleText * 3.0);
+ const maxTokens = Math.min(this._maxTokensPerInput, 7800);
+ // Use 2.0 chars/token for safe truncation: handles spreadsheet data,
+ // numbers, and special characters, which tokenize at ~2.2 chars/token
+ const maxChars = Math.floor(maxTokens * 2.0);
  if (text.length <= maxChars) {
  return text;
  }
  return text.slice(0, maxChars);
  }
  /**
- * Split texts into batches respecting both document count and token limits.
- * This prevents OpenAI API errors when total tokens exceed 8,192.
- * Automatically truncates individual texts that exceed the token limit.
+ * Split texts into batches respecting:
+ * 1. Per-input token limit (8,192 for text-embedding-3-small): truncate oversized inputs
+ * 2. Per-request token limit (300K for most tiers): split into multiple requests
+ * 3. Per-request input count (2,048 max inputs per request)
  */
  createTokenAwareBatches(texts) {
+ // OpenAI enforces a per-request total token limit (typically 300K).
+ // Use 250K as a safe default to account for token estimation inaccuracy.
+ const MAX_TOKENS_PER_REQUEST = 250000;
  const batches = [];
  let currentBatch = [];
- let currentTokens = 0;
+ let currentBatchTokens = 0;
  for (let text of texts) {
+ // Truncate individual texts that exceed the per-input token limit
  let textTokens = this.estimateTokens(text);
- // Truncate if single text exceeds token limit
- if (textTokens > this._maxTokensPerBatch) {
+ if (textTokens > this._maxTokensPerInput) {
  text = this.truncateToTokenLimit(text);
  textTokens = this.estimateTokens(text);
  }
- const wouldExceedTokens = (currentTokens + textTokens) > this._maxTokensPerBatch;
+ const wouldExceedRequestTokens = (currentBatchTokens + textTokens) > MAX_TOKENS_PER_REQUEST;
  const wouldExceedCount = currentBatch.length >= this._batchSize;
- if (wouldExceedTokens || wouldExceedCount) {
- if (currentBatch.length > 0) {
- batches.push(currentBatch);
- }
+ if ((wouldExceedRequestTokens || wouldExceedCount) && currentBatch.length > 0) {
+ batches.push(currentBatch);
  currentBatch = [text];
- currentTokens = textTokens;
+ currentBatchTokens = textTokens;
  }
  else {
  currentBatch.push(text);
- currentTokens += textTokens;
+ currentBatchTokens += textTokens;
  }
  }
  if (currentBatch.length > 0) {
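
For readers who want to trace the new batching rules end to end, a self-contained sketch. Names and thresholds mirror the diff above; this is illustrative, not the package's exported API:

```typescript
// Same 3.5 chars/token heuristic as estimateTokens in the diff.
const estimateTokens = (text: string): number => Math.ceil(text.length / 3.5);

function batchForEmbedding(
  texts: string[],
  maxTokensPerInput = 8000,      // per-input cap; oversized inputs get truncated
  batchSize = 2048,              // max inputs per request
  maxTokensPerRequest = 250_000, // safety margin under the ~300K request cap
): string[][] {
  const batches: string[][] = [];
  let current: string[] = [];
  let currentTokens = 0;
  for (let text of texts) {
    if (estimateTokens(text) > maxTokensPerInput) {
      // Same conservative 2.0 chars/token truncation as the diff
      text = text.slice(0, Math.min(maxTokensPerInput, 7800) * 2);
    }
    const tokens = estimateTokens(text);
    const full =
      current.length >= batchSize ||
      currentTokens + tokens > maxTokensPerRequest;
    if (full && current.length > 0) {
      batches.push(current);
      current = [text];
      currentTokens = tokens;
    } else {
      current.push(text);
      currentTokens += tokens;
    }
  }
  if (current.length > 0) batches.push(current);
  return batches;
}
```

With the defaults, 5,000 short texts split into three batches (2,048 + 2,048 + 904), driven by the input-count cap rather than the old per-batch token cap.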
@@ -0,0 +1,250 @@
+ /**
+ * High-performance batch image ingestion for Memvid SDK (Node.js).
+ *
+ * Uses OCR to extract text from images, then ingests into a .mv2 memory file.
+ * docTR (via Python) provides the highest accuracy (85.3%); Tesseract.js is available as an optional dependency.
+ *
+ * @example
+ * ```typescript
+ * import { ImageIngestor } from '@memvid/sdk';
+ *
+ * // First install tesseract.js: npm install tesseract.js
+ * const ingestor = new ImageIngestor({
+ * ocrProvider: 'tesseract',
+ * workers: 4,
+ * });
+ *
+ * const result = await ingestor.ingestDirectory(
+ * './construction_drawings/',
+ * './project.mv2',
+ * {
+ * patterns: ['*.png', '*.jpg'],
+ * onProgress: (done, total) => console.log(`${done}/${total}`),
+ * }
+ * );
+ *
+ * console.log(`Processed ${result.totalImages} images`);
+ * await ingestor.terminate();
+ * ```
+ *
+ * For highest accuracy (85.3%), use docTR via Python:
+ * ```typescript
+ * // Requires: pip install python-doctr[torch]
+ * const ingestor = new ImageIngestor({ ocrProvider: 'doctr' });
+ * ```
+ */
+ import { OCRProviderType } from './ocr';
+ /**
+ * Options for image ingestion.
+ */
+ export interface ImageIngestOptions {
+ /** Minimum OCR confidence threshold (0-1). Default: 0.3 */
+ minConfidence?: number;
+ /** Use fallback OCR on low confidence. Default: true */
+ fallbackOcr?: boolean;
+ /** Images to process per batch. Default: 10 */
+ batchSize?: number;
+ /** Metadata to attach to all ingested frames */
+ metadata?: Record<string, unknown>;
+ /** Label for ingested frames. Default: 'image-extract' */
+ label?: string;
+ }
+ /**
+ * Options for directory ingestion.
+ */
+ export interface DirectoryIngestOptions extends ImageIngestOptions {
+ /** Glob patterns for files to include. Default: ['*.png', '*.jpg', '*.jpeg', '*.tiff'] */
+ patterns?: string[];
+ /** Search subdirectories. Default: true */
+ recursive?: boolean;
+ /** Progress callback */
+ onProgress?: (completed: number, total: number) => void;
+ }
+ /**
+ * Options for array-based ingestion.
+ */
+ export interface ImagesIngestOptions extends ImageIngestOptions {
+ /** Progress callback */
+ onProgress?: (completed: number, total: number) => void;
+ }
+ /**
+ * Result from batch image ingestion.
+ */
+ export interface ImageIngestResult {
+ /** Total images processed */
+ totalImages: number;
+ /** Successfully ingested images */
+ successful: number;
+ /** Failed images */
+ failed: number;
+ /** Total chunks/frames created */
+ totalChunks: number;
+ /** Processing time in seconds */
+ elapsedSeconds: number;
+ /** Output file size in bytes */
+ outputSizeBytes: number;
+ /** Errors encountered */
+ errors: Array<{
+ path: string;
+ error: string;
+ }>;
+ /** Images processed per second */
+ imagesPerSecond: number;
+ /** Output size in MB */
+ outputSizeMb: number;
+ }
+ /**
+ * Constructor options for ImageIngestor.
+ */
+ export interface ImageIngestorOptions {
+ /** OCR provider: 'tesseract', 'doctr', or 'easyocr'. Default: 'tesseract' */
+ ocrProvider?: OCRProviderType;
+ /** Number of parallel workers. Default: CPU count */
+ workers?: number;
+ /** Python path for doctr/easyocr providers */
+ pythonPath?: string;
+ }
+ /**
+ * High-performance batch image ingestor for Memvid.
+ *
+ * Combines OCR text extraction with parallel processing for fast, accurate
+ * ingestion of large image collections.
+ *
+ * OCR Accuracy (tested on construction drawings):
+ * - docTR (Python): 85.3% - BEST
+ * - EasyOCR (Python): 79.4%
+ * - Tesseract.js: ~50-60%
+ *
+ * @example
+ * ```typescript
+ * const ingestor = new ImageIngestor({
+ * ocrProvider: 'doctr',
+ * workers: 8,
+ * });
+ *
+ * const result = await ingestor.ingestDirectory('./drawings/', './output.mv2');
+ * console.log(`Processed ${result.totalImages} images in ${result.elapsedSeconds}s`);
+ *
+ * await ingestor.terminate();
+ * ```
+ */
+ export declare class ImageIngestor {
+ private _ocr;
+ private _fallbackOcr;
+ private _workers;
+ private _ocrType;
+ constructor(options?: ImageIngestorOptions);
+ /** Primary OCR provider name */
+ get ocrName(): string;
+ /** Number of parallel workers */
+ get workers(): number;
+ /**
+ * Ingest multiple images into a .mv2 file.
+ *
+ * @param paths - Array of image file paths
+ * @param outputPath - Output .mv2 file path
+ * @param options - Ingestion options
+ * @returns Promise resolving to ingestion result
+ *
+ * @example
+ * ```typescript
+ * const result = await ingestor.ingestImages(
+ * ['img1.png', 'img2.png'],
+ * './output.mv2',
+ * { onProgress: (d, t) => console.log(`${d}/${t}`) }
+ * );
+ * ```
+ */
+ ingestImages(paths: string[], outputPath: string, options?: ImagesIngestOptions): Promise<ImageIngestResult>;
+ /**
+ * Ingest all matching images from a directory.
+ *
+ * @param directory - Source directory path
+ * @param outputPath - Output .mv2 file path
+ * @param options - Directory ingestion options
+ * @returns Promise resolving to ingestion result
+ *
+ * @example
+ * ```typescript
+ * const result = await ingestor.ingestDirectory(
+ * './construction_drawings/',
+ * './project.mv2',
+ * {
+ * patterns: ['*.png', '*.jpg'],
+ * recursive: true,
+ * onProgress: (d, t) => console.log(`${d}/${t}`),
+ * }
+ * );
+ * ```
+ */
+ ingestDirectory(directory: string, outputPath: string, options?: DirectoryIngestOptions): Promise<ImageIngestResult>;
+ /**
+ * Extract text from a single image with fallback support.
+ */
+ private _extractText;
+ /**
+ * Clean up OCR worker resources.
+ *
+ * Call this when done using the ingestor to free memory.
+ */
+ terminate(): Promise<void>;
+ }
+ /**
+ * Convenience function for quick image ingestion.
+ *
+ * Creates an ImageIngestor, processes images, and cleans up automatically.
+ *
+ * @param paths - Array of image file paths
+ * @param outputPath - Output .mv2 file path
+ * @param options - Ingestion options
+ * @returns Promise resolving to ingestion result
+ *
+ * @example
+ * ```typescript
204
+ * import { ingestImages } from 'memvid-sdk';
205
+ *
206
+ * const result = await ingestImages(
207
+ * ['img1.png', 'img2.png'],
208
+ * './output.mv2',
209
+ * {
210
+ * ocrProvider: 'doctr',
211
+ * onProgress: (d, t) => console.log(`${d}/${t}`),
212
+ * }
213
+ * );
214
+ * ```
215
+ */
216
+ export declare function ingestImages(paths: string[], outputPath: string, options?: ImagesIngestOptions & {
217
+ ocrProvider?: OCRProviderType;
218
+ workers?: number;
219
+ pythonPath?: string;
220
+ }): Promise<ImageIngestResult>;
221
+ /**
222
+ * Convenience function for quick directory ingestion.
223
+ *
224
+ * Creates an ImageIngestor, processes directory, and cleans up automatically.
225
+ *
226
+ * @param directory - Source directory path
227
+ * @param outputPath - Output .mv2 file path
228
+ * @param options - Directory ingestion options
229
+ * @returns Promise resolving to ingestion result
230
+ *
231
+ * @example
232
+ * ```typescript
+ * import { ingestDirectory } from '@memvid/sdk';
+ *
+ * const result = await ingestDirectory(
+ * './construction_drawings/',
+ * './project.mv2',
+ * {
+ * ocrProvider: 'doctr',
+ * patterns: ['*.png', '*.jpg'],
+ * onProgress: (d, t) => console.log(`${d}/${t}`),
+ * }
+ * );
+ * ```
+ */
+ export declare function ingestDirectory(directory: string, outputPath: string, options?: DirectoryIngestOptions & {
+ ocrProvider?: OCRProviderType;
+ workers?: number;
+ pythonPath?: string;
+ }): Promise<ImageIngestResult>;
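
A consolidated usage sketch of the new module, combining the file's own examples with the result fields it declares. The `./scans/` paths are placeholders, and `ImageIngestor` is imported from the package root as in the file's header example:

```typescript
import { ImageIngestor } from '@memvid/sdk';

const ingestor = new ImageIngestor({ ocrProvider: 'tesseract', workers: 4 });
try {
  const result = await ingestor.ingestDirectory('./scans/', './scans.mv2');
  console.log(
    `${result.successful}/${result.totalImages} ok, ` +
    `${result.imagesPerSecond.toFixed(1)} img/s, ${result.outputSizeMb.toFixed(1)} MB`,
  );
  // Per-file failures are collected rather than thrown:
  for (const { path, error } of result.errors) {
    console.warn(`OCR/ingest failed for ${path}: ${error}`);
  }
} finally {
  // Always release OCR workers, even if ingestion throws
  await ingestor.terminate();
}
```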