npm - @coreviz/sdk - Versions diffs - 1.0.8 → 1.0.10 - Mend

@coreviz/sdk 1.0.8 → 1.0.10

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (3) hide show

package/dist/coreviz.d.ts CHANGED Viewed

@@ -14,6 +14,7 @@ export interface TagOptions {
     prompt: string;
     options?: string[];
     multiple?: boolean;
+    mode?: 'api' | 'local';
 }
 export interface TagResponse {
     tags: string[];
@@ -35,7 +36,9 @@ export declare class CoreViz {
     describe(image: string, options?: DescribeOptions): Promise<string>;
     edit(image: string, options: EditOptions): Promise<string>;
     tag(image: string, options: TagOptions): Promise<TagResponse>;
+    private tagLocal;
     embed(input: string, options?: EmbedOptions): Promise<EmbedResponse>;
     private embedLocal;
     resize(input: string | File, maxWidth?: number, maxHeight?: number): Promise<string>;
+    similarity(vecA: number[], vecB: number[]): number;
 }

package/dist/coreviz.js CHANGED Viewed

@@ -104,6 +104,10 @@ class CoreViz {
         }
     }
     async tag(image, options) {
+        const mode = options?.mode || 'api';
+        if (mode === 'local') {
+            return this.tagLocal(image, options);
+        }
         try {
             const resizedImage = await (0, resize_1.resize)(image);
             const headers = this.getHeaders();
@@ -134,6 +138,108 @@ class CoreViz {
             throw err instanceof Error ? err : new Error("An unexpected error occurred.");
         }
     }
+    async tagLocal(imageInput, options) {
+        try {
+            // Dynamic import to avoid loading transformers if not used
+            const { AutoProcessor, AutoModelForImageTextToText, RawImage, env } = await Promise.resolve().then(() => __importStar(require('@huggingface/transformers')));
+            // Configure transformers.js for browser usage
+            env.allowRemoteModels = true;
+            const processor = await AutoProcessor.from_pretrained('onnx-community/FastVLM-0.5B-ONNX');
+            const model = await AutoModelForImageTextToText.from_pretrained('onnx-community/FastVLM-0.5B-ONNX', {
+                dtype: {
+                    embed_tokens: "fp16",
+                    vision_encoder: "q4",
+                    decoder_model_merged: "q4",
+                },
+            });
+            let rawImg;
+            if (imageInput.startsWith('http')) {
+                rawImg = await RawImage.fromURL(imageInput);
+            }
+            else if (imageInput.startsWith('data:image')) {
+                const base64Data = imageInput.split(',')[1];
+                const binary = atob(base64Data);
+                const array = new Uint8Array(binary.length);
+                for (let i = 0; i < binary.length; i++) {
+                    array[i] = binary.charCodeAt(i);
+                }
+                rawImg = await RawImage.fromBlob(new Blob([array]));
+            }
+            else {
+                rawImg = await RawImage.read(imageInput);
+            }
+            let systemPrompt = `You are a precise image tagging AI.
+Rules:
+1. Return ONLY a comma-separated list of tags.
+2. DO NOT provide any conversational text, introductions, or explanations.
+3. DO NOT use full sentences.
+4. If options are provided, select strictly from them.
+Example 1:
+What animals are in the image?
+Example Output:
+cat
+Example 2:
+What color cars are visible in the image?
+Output:
+red, blue, green
+Example 3:
+What is the jersey number of the player?
+Output:
+10
+`;
+            let userPrompt = `${options.prompt}`;
+            if (options.options && options.options.length > 0) {
+                userPrompt += `\nSelect from these options: ${options.options.join(', ')}.`;
+            }
+            if (!options.multiple) {
+                userPrompt += `\nReturn a single tag.`;
+            }
+            const messages = [
+                {
+                    role: 'system',
+                    content: systemPrompt,
+                },
+                { role: 'user', content: `<image>\n${userPrompt}` },
+            ];
+            let promptText = processor.apply_chat_template(messages, {
+                add_generation_prompt: true,
+            });
+            if (typeof promptText === 'string') {
+                promptText += options.multiple ? "Tags: " : "Tag: ";
+            }
+            const inputs = await processor(rawImg, promptText, {
+                add_special_tokens: false,
+            });
+            const outputs = await model.generate({
+                ...inputs,
+                max_new_tokens: 120,
+                do_sample: false,
+                repetition_penalty: 1.2,
+            });
+            const decoded = processor.batch_decode(outputs.slice(null, [inputs.input_ids.dims.at(-1), null]), { skip_special_tokens: true });
+            let resultText = decoded[0].trim();
+            // Cleanup potential repetition of priming token
+            resultText = resultText.replace(/^(Tags?:\s*)/i, '');
+            let tags = [];
+            if (options.multiple) {
+                tags = resultText.split(',').map(s => s.trim()).filter(s => s.length > 0);
+            }
+            else {
+                tags = [resultText];
+            }
+            return {
+                tags,
+                raw: { result: resultText }
+            };
+        }
+        catch (err) {
+            console.error(err);
+            throw err instanceof Error ? err : new Error("Local tagging failed: " + String(err));
+        }
+    }
     async embed(input, options) {
         const mode = options?.mode || 'api';
         if (mode === 'local') {
@@ -176,7 +282,6 @@ class CoreViz {
             // Dynamic import to avoid loading transformers if not used
             const { AutoTokenizer, AutoProcessor, CLIPTextModelWithProjection, CLIPVisionModelWithProjection, RawImage } = await Promise.resolve().then(() => __importStar(require('@huggingface/transformers')));
             const MODEL_ID = 'Xenova/clip-vit-large-patch14';
-            console.log(`Loading local model ${MODEL_ID}...`);
             const start = Date.now();
             // Load tokenizer and processor
             const tokenizer = await AutoTokenizer.from_pretrained(MODEL_ID);
@@ -188,7 +293,6 @@ class CoreViz {
             const vision_model = await CLIPVisionModelWithProjection.from_pretrained(MODEL_ID, {
                 dtype: 'q4',
             });
-            console.log(`Model loaded in ${Date.now() - start}ms`);
             // Check if input is likely an image
             const isImage = options?.type === 'image' ||
                 input.startsWith('data:image') ||
@@ -241,5 +345,20 @@ class CoreViz {
     async resize(input, maxWidth, maxHeight) {
         return (0, resize_1.resize)(input, maxWidth, maxHeight);
     }
+    similarity(vecA, vecB) {
+        if (vecA.length !== vecB.length)
+            return 0;
+        let dotProduct = 0;
+        let normA = 0;
+        let normB = 0;
+        for (let i = 0; i < vecA.length; i++) {
+            dotProduct += vecA[i] * vecB[i];
+            normA += vecA[i] * vecA[i];
+            normB += vecB[i] * vecB[i];
+        }
+        if (normA === 0 || normB === 0)
+            return 0;
+        return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
+    }
 }
 exports.CoreViz = CoreViz;

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@coreviz/sdk",
-  "version": "1.0.8",
+  "version": "1.0.10",
   "description": "CoreViz SDK",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",