@memvid/sdk 2.0.154 → 2.0.156

@@ -98,9 +98,13 @@ export interface OpenAIEmbeddingsConfig {
  apiKey?: string;
  /** Model to use. Default: 'text-embedding-3-small' */
  model?: string;
- /** Max number of texts to embed in a single API call. Default: 2048 */
+ /** Max number of texts to embed in a single API call. Default: 2048 (OpenAI hard limit) */
  batchSize?: number;
- /** Max tokens per batch (OpenAI limit is 8191). Default: 8000 (with safety margin) */
+ /** Max tokens per individual input text (OpenAI limit is 8191). Default: 8000 (with safety margin).
+ * Note: this is a per-INPUT limit, not a per-batch total. Each input in a batch
+ * must individually be under this limit, but the batch total can be much higher. */
+ maxTokensPerInput?: number;
+ /** @deprecated Use maxTokensPerInput instead */
  maxTokensPerBatch?: number;
  }
  /**
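
For consumers of this change, a minimal sketch of the renamed option. It assumes `OpenAIEmbeddings` is exported from the package root; the diff itself only shows the type declarations:

```typescript
import { OpenAIEmbeddings } from '@memvid/sdk';

// maxTokensPerInput replaces maxTokensPerBatch; the old name still works
// via the deprecated fallback shown in the constructor hunk below.
const embeddings = new OpenAIEmbeddings({
  apiKey: process.env.OPENAI_API_KEY,
  model: 'text-embedding-3-small',
  batchSize: 2048,         // max inputs per request (OpenAI hard limit)
  maxTokensPerInput: 8000, // each input is truncated to stay under this
});
```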
@@ -120,7 +124,7 @@ export declare class OpenAIEmbeddings implements EmbeddingProvider {
  private readonly _apiKey;
  private readonly _model;
  private readonly _batchSize;
- private readonly _maxTokensPerBatch;
+ private readonly _maxTokensPerInput;
  constructor(config?: OpenAIEmbeddingsConfig);
  get dimension(): number;
  get modelName(): string;
@@ -132,15 +136,17 @@ export declare class OpenAIEmbeddings implements EmbeddingProvider {
  */
  private estimateTokens;
  /**
- * Truncate text to fit within token limit.
+ * Truncate a single input text to fit within the per-input token limit.
  * Preserves beginning of text as it typically contains the most important context.
- * Uses conservative 3.0 chars/token for truncation to handle mixed content safely.
+ * Uses conservative 2.0 chars/token for truncation to handle data-heavy content
+ * (spreadsheets, numbers, cell refs) where tokenization is denser than prose.
  */
  private truncateToTokenLimit;
  /**
- * Split texts into batches respecting both document count and token limits.
- * This prevents OpenAI API errors when total tokens exceed 8,192.
- * Automatically truncates individual texts that exceed the token limit.
+ * Split texts into batches respecting:
+ * 1. Per-input token limit (8,192 for text-embedding-3-small): truncate oversized inputs
+ * 2. Per-request token limit (300K for most tiers): split into multiple requests
+ * 3. Per-request input count (2,048 max inputs per request)
  */
  private createTokenAwareBatches;
  embedDocuments(texts: string[]): Promise<number[][]>;
@@ -115,8 +115,9 @@ class OpenAIEmbeddings {
  }
  this._model = config.model || 'text-embedding-3-small';
  this._batchSize = config.batchSize || 2048;
- // OpenAI's limit is 8,192 tokens. Use 8,000 as default for max throughput.
- this._maxTokensPerBatch = config.maxTokensPerBatch || 8000;
+ // OpenAI's limit is 8,192 tokens PER INPUT (not per batch).
+ // Up to 2048 inputs per request, subject to the per-request total token cap handled in createTokenAwareBatches.
+ this._maxTokensPerInput = config.maxTokensPerInput || config.maxTokensPerBatch || 8000;
  }
  get dimension() {
  return exports.MODEL_DIMENSIONS[this._model] || 1536;
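
Because the constructor falls through the `||` chain, configs written against 2.0.154 keep working. A hypothetical legacy call, for illustration only (reusing the import from the earlier sketch):

```typescript
// Legacy config using the deprecated name still resolves in 2.0.156:
const legacy = new OpenAIEmbeddings({ maxTokensPerBatch: 6000 });
// _maxTokensPerInput = undefined || 6000 || 8000, i.e. 6000
```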
@@ -136,48 +137,51 @@ class OpenAIEmbeddings {
  return Math.ceil(text.length / 3.5);
  }
  /**
- * Truncate text to fit within token limit.
+ * Truncate a single input text to fit within the per-input token limit.
  * Preserves beginning of text as it typically contains the most important context.
- * Uses conservative 3.0 chars/token for truncation to handle mixed content safely.
+ * Uses conservative 2.0 chars/token for truncation to handle data-heavy content
+ * (spreadsheets, numbers, cell refs) where tokenization is denser than prose.
  */
  truncateToTokenLimit(text) {
- // Use conservative limit for truncation: 7800 tokens max for single text
- const maxTokensForSingleText = Math.min(this._maxTokensPerBatch, 7800);
- // Use 3.0 chars/token for safe truncation
- const maxChars = Math.floor(maxTokensForSingleText * 3.0);
+ const maxTokens = Math.min(this._maxTokensPerInput, 7800);
+ // Use 2.0 chars/token for safe truncation: handles spreadsheet data,
+ // numbers, and special characters, which tokenize at ~2.2 chars/token
+ const maxChars = Math.floor(maxTokens * 2.0);
  if (text.length <= maxChars) {
  return text;
  }
  return text.slice(0, maxChars);
  }
  /**
- * Split texts into batches respecting both document count and token limits.
- * This prevents OpenAI API errors when total tokens exceed 8,192.
- * Automatically truncates individual texts that exceed the token limit.
+ * Split texts into batches respecting:
+ * 1. Per-input token limit (8,192 for text-embedding-3-small): truncate oversized inputs
+ * 2. Per-request token limit (300K for most tiers): split into multiple requests
+ * 3. Per-request input count (2,048 max inputs per request)
  */
  createTokenAwareBatches(texts) {
+ // OpenAI enforces a per-request total token limit (typically 300K).
+ // Use 250K as a safe default to account for token estimation inaccuracy.
+ const MAX_TOKENS_PER_REQUEST = 250000;
  const batches = [];
  let currentBatch = [];
- let currentTokens = 0;
+ let currentBatchTokens = 0;
  for (let text of texts) {
+ // Truncate individual texts that exceed the per-input token limit
  let textTokens = this.estimateTokens(text);
- // Truncate if single text exceeds token limit
- if (textTokens > this._maxTokensPerBatch) {
+ if (textTokens > this._maxTokensPerInput) {
  text = this.truncateToTokenLimit(text);
  textTokens = this.estimateTokens(text);
  }
- const wouldExceedTokens = (currentTokens + textTokens) > this._maxTokensPerBatch;
+ const wouldExceedRequestTokens = (currentBatchTokens + textTokens) > MAX_TOKENS_PER_REQUEST;
  const wouldExceedCount = currentBatch.length >= this._batchSize;
- if (wouldExceedTokens || wouldExceedCount) {
- if (currentBatch.length > 0) {
- batches.push(currentBatch);
- }
+ if ((wouldExceedRequestTokens || wouldExceedCount) && currentBatch.length > 0) {
+ batches.push(currentBatch);
  currentBatch = [text];
- currentTokens = textTokens;
+ currentBatchTokens = textTokens;
  }
  else {
  currentBatch.push(text);
- currentTokens += textTokens;
+ currentBatchTokens += textTokens;
  }
  }
  if (currentBatch.length > 0) {
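
For readers who want to trace the new batching rules end to end, a self-contained sketch. Names and thresholds mirror the diff above; this is illustrative, not the package's exported API:

```typescript
// Same 3.5 chars/token heuristic as estimateTokens in the diff.
const estimateTokens = (text: string): number => Math.ceil(text.length / 3.5);

function batchForEmbedding(
  texts: string[],
  maxTokensPerInput = 8000,      // per-input cap; oversized inputs get truncated
  batchSize = 2048,              // max inputs per request
  maxTokensPerRequest = 250_000, // safety margin under the ~300K request cap
): string[][] {
  const batches: string[][] = [];
  let current: string[] = [];
  let currentTokens = 0;
  for (let text of texts) {
    if (estimateTokens(text) > maxTokensPerInput) {
      // Same conservative 2.0 chars/token truncation as the diff
      text = text.slice(0, Math.min(maxTokensPerInput, 7800) * 2);
    }
    const tokens = estimateTokens(text);
    const full =
      current.length >= batchSize ||
      currentTokens + tokens > maxTokensPerRequest;
    if (full && current.length > 0) {
      batches.push(current);
      current = [text];
      currentTokens = tokens;
    } else {
      current.push(text);
      currentTokens += tokens;
    }
  }
  if (current.length > 0) batches.push(current);
  return batches;
}
```

With the defaults, 5,000 short texts split into three batches (2,048 + 2,048 + 904), driven by the input-count cap rather than the old per-batch token cap.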
@@ -0,0 +1,250 @@
+ /**
+ * High-performance batch image ingestion for Memvid SDK (Node.js).
+ *
+ * Uses OCR to extract text from images, then ingests into a .mv2 memory file.
+ * docTR (via Python) provides the highest accuracy (85.3%); Tesseract.js is available as an optional dependency.
+ *
+ * @example
+ * ```typescript
+ * import { ImageIngestor } from '@memvid/sdk';
+ *
+ * // First install tesseract.js: npm install tesseract.js
+ * const ingestor = new ImageIngestor({
+ * ocrProvider: 'tesseract',
+ * workers: 4,
+ * });
+ *
+ * const result = await ingestor.ingestDirectory(
+ * './construction_drawings/',
+ * './project.mv2',
+ * {
+ * patterns: ['*.png', '*.jpg'],
+ * onProgress: (done, total) => console.log(`${done}/${total}`),
+ * }
+ * );
+ *
+ * console.log(`Processed ${result.totalImages} images`);
+ * await ingestor.terminate();
+ * ```
+ *
+ * For highest accuracy (85.3%), use docTR via Python:
+ * ```typescript
+ * // Requires: pip install python-doctr[torch]
+ * const ingestor = new ImageIngestor({ ocrProvider: 'doctr' });
+ * ```
+ */
+ import { OCRProviderType } from './ocr';
+ /**
+ * Options for image ingestion.
+ */
+ export interface ImageIngestOptions {
+ /** Minimum OCR confidence threshold (0-1). Default: 0.3 */
+ minConfidence?: number;
+ /** Use fallback OCR on low confidence. Default: true */
+ fallbackOcr?: boolean;
+ /** Images to process per batch. Default: 10 */
+ batchSize?: number;
+ /** Metadata to attach to all ingested frames */
+ metadata?: Record<string, unknown>;
+ /** Label for ingested frames. Default: 'image-extract' */
+ label?: string;
+ }
+ /**
+ * Options for directory ingestion.
+ */
+ export interface DirectoryIngestOptions extends ImageIngestOptions {
+ /** Glob patterns for files to include. Default: ['*.png', '*.jpg', '*.jpeg', '*.tiff'] */
+ patterns?: string[];
+ /** Search subdirectories. Default: true */
+ recursive?: boolean;
+ /** Progress callback */
+ onProgress?: (completed: number, total: number) => void;
+ }
+ /**
+ * Options for array-based ingestion.
+ */
+ export interface ImagesIngestOptions extends ImageIngestOptions {
+ /** Progress callback */
+ onProgress?: (completed: number, total: number) => void;
+ }
+ /**
+ * Result from batch image ingestion.
+ */
+ export interface ImageIngestResult {
+ /** Total images processed */
+ totalImages: number;
+ /** Successfully ingested images */
+ successful: number;
+ /** Failed images */
+ failed: number;
+ /** Total chunks/frames created */
+ totalChunks: number;
+ /** Processing time in seconds */
+ elapsedSeconds: number;
+ /** Output file size in bytes */
+ outputSizeBytes: number;
+ /** Errors encountered */
+ errors: Array<{
+ path: string;
+ error: string;
+ }>;
+ /** Images processed per second */
+ imagesPerSecond: number;
+ /** Output size in MB */
+ outputSizeMb: number;
+ }
+ /**
+ * Constructor options for ImageIngestor.
+ */
+ export interface ImageIngestorOptions {
+ /** OCR provider: 'tesseract', 'doctr', or 'easyocr'. Default: 'tesseract' */
+ ocrProvider?: OCRProviderType;
+ /** Number of parallel workers. Default: CPU count */
+ workers?: number;
+ /** Python path for doctr/easyocr providers */
+ pythonPath?: string;
+ }
+ /**
+ * High-performance batch image ingestor for Memvid.
+ *
+ * Combines OCR text extraction with parallel processing for fast, accurate
+ * ingestion of large image collections.
+ *
+ * OCR Accuracy (tested on construction drawings):
+ * - docTR (Python): 85.3% - BEST
+ * - EasyOCR (Python): 79.4%
+ * - Tesseract.js: ~50-60%
+ *
+ * @example
+ * ```typescript
+ * const ingestor = new ImageIngestor({
+ * ocrProvider: 'doctr',
+ * workers: 8,
+ * });
+ *
+ * const result = await ingestor.ingestDirectory('./drawings/', './output.mv2');
+ * console.log(`Processed ${result.totalImages} images in ${result.elapsedSeconds}s`);
+ *
+ * await ingestor.terminate();
+ * ```
+ */
+ export declare class ImageIngestor {
+ private _ocr;
+ private _fallbackOcr;
+ private _workers;
+ private _ocrType;
+ constructor(options?: ImageIngestorOptions);
+ /** Primary OCR provider name */
+ get ocrName(): string;
+ /** Number of parallel workers */
+ get workers(): number;
+ /**
+ * Ingest multiple images into a .mv2 file.
+ *
+ * @param paths - Array of image file paths
+ * @param outputPath - Output .mv2 file path
+ * @param options - Ingestion options
+ * @returns Promise resolving to ingestion result
+ *
+ * @example
+ * ```typescript
+ * const result = await ingestor.ingestImages(
+ * ['img1.png', 'img2.png'],
+ * './output.mv2',
+ * { onProgress: (d, t) => console.log(`${d}/${t}`) }
+ * );
+ * ```
+ */
+ ingestImages(paths: string[], outputPath: string, options?: ImagesIngestOptions): Promise<ImageIngestResult>;
+ /**
+ * Ingest all matching images from a directory.
+ *
+ * @param directory - Source directory path
+ * @param outputPath - Output .mv2 file path
+ * @param options - Directory ingestion options
+ * @returns Promise resolving to ingestion result
+ *
+ * @example
+ * ```typescript
+ * const result = await ingestor.ingestDirectory(
+ * './construction_drawings/',
+ * './project.mv2',
+ * {
+ * patterns: ['*.png', '*.jpg'],
+ * recursive: true,
+ * onProgress: (d, t) => console.log(`${d}/${t}`),
+ * }
+ * );
+ * ```
+ */
+ ingestDirectory(directory: string, outputPath: string, options?: DirectoryIngestOptions): Promise<ImageIngestResult>;
+ /**
+ * Extract text from a single image with fallback support.
+ */
+ private _extractText;
+ /**
+ * Clean up OCR worker resources.
+ *
+ * Call this when done using the ingestor to free memory.
+ */
+ terminate(): Promise<void>;
+ }
+ /**
+ * Convenience function for quick image ingestion.
+ *
+ * Creates an ImageIngestor, processes images, and cleans up automatically.
+ *
+ * @param paths - Array of image file paths
+ * @param outputPath - Output .mv2 file path
+ * @param options - Ingestion options
+ * @returns Promise resolving to ingestion result
+ *
+ * @example
+ * ```typescript
204
+ * import { ingestImages } from 'memvid-sdk';
205
+ *
206
+ * const result = await ingestImages(
207
+ * ['img1.png', 'img2.png'],
208
+ * './output.mv2',
209
+ * {
210
+ * ocrProvider: 'doctr',
211
+ * onProgress: (d, t) => console.log(`${d}/${t}`),
212
+ * }
213
+ * );
214
+ * ```
215
+ */
216
+ export declare function ingestImages(paths: string[], outputPath: string, options?: ImagesIngestOptions & {
217
+ ocrProvider?: OCRProviderType;
218
+ workers?: number;
219
+ pythonPath?: string;
220
+ }): Promise<ImageIngestResult>;
221
+ /**
222
+ * Convenience function for quick directory ingestion.
223
+ *
224
+ * Creates an ImageIngestor, processes directory, and cleans up automatically.
225
+ *
226
+ * @param directory - Source directory path
227
+ * @param outputPath - Output .mv2 file path
228
+ * @param options - Directory ingestion options
229
+ * @returns Promise resolving to ingestion result
230
+ *
231
+ * @example
232
+ * ```typescript
+ * import { ingestDirectory } from '@memvid/sdk';
+ *
+ * const result = await ingestDirectory(
+ * './construction_drawings/',
+ * './project.mv2',
+ * {
+ * ocrProvider: 'doctr',
+ * patterns: ['*.png', '*.jpg'],
+ * onProgress: (d, t) => console.log(`${d}/${t}`),
+ * }
+ * );
+ * ```
+ */
+ export declare function ingestDirectory(directory: string, outputPath: string, options?: DirectoryIngestOptions & {
+ ocrProvider?: OCRProviderType;
+ workers?: number;
+ pythonPath?: string;
+ }): Promise<ImageIngestResult>;
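
A consolidated usage sketch of the new module, combining the file's own examples with the result fields it declares. The `./scans/` paths are placeholders, and `ImageIngestor` is imported from the package root as in the file's header example:

```typescript
import { ImageIngestor } from '@memvid/sdk';

const ingestor = new ImageIngestor({ ocrProvider: 'tesseract', workers: 4 });
try {
  const result = await ingestor.ingestDirectory('./scans/', './scans.mv2');
  console.log(
    `${result.successful}/${result.totalImages} ok, ` +
    `${result.imagesPerSecond.toFixed(1)} img/s, ${result.outputSizeMb.toFixed(1)} MB`,
  );
  // Per-file failures are collected rather than thrown:
  for (const { path, error } of result.errors) {
    console.warn(`OCR/ingest failed for ${path}: ${error}`);
  }
} finally {
  // Always release OCR workers, even if ingestion throws
  await ingestor.terminate();
}
```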