@coreviz/sdk 1.0.8 → 1.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/coreviz.d.ts CHANGED
@@ -14,6 +14,7 @@ export interface TagOptions {
14
14
  prompt: string;
15
15
  options?: string[];
16
16
  multiple?: boolean;
17
+ mode?: 'api' | 'local';
17
18
  }
18
19
  export interface TagResponse {
19
20
  tags: string[];
@@ -35,7 +36,9 @@ export declare class CoreViz {
35
36
  describe(image: string, options?: DescribeOptions): Promise<string>;
36
37
  edit(image: string, options: EditOptions): Promise<string>;
37
38
  tag(image: string, options: TagOptions): Promise<TagResponse>;
39
+ private tagLocal;
38
40
  embed(input: string, options?: EmbedOptions): Promise<EmbedResponse>;
39
41
  private embedLocal;
40
42
  resize(input: string | File, maxWidth?: number, maxHeight?: number): Promise<string>;
43
+ similarity(vecA: number[], vecB: number[]): number;
41
44
  }
package/dist/coreviz.js CHANGED
@@ -104,6 +104,10 @@ class CoreViz {
104
104
  }
105
105
  }
106
106
  async tag(image, options) {
107
+ const mode = options?.mode || 'api';
108
+ if (mode === 'local') {
109
+ return this.tagLocal(image, options);
110
+ }
107
111
  try {
108
112
  const resizedImage = await (0, resize_1.resize)(image);
109
113
  const headers = this.getHeaders();
@@ -134,6 +138,108 @@ class CoreViz {
134
138
  throw err instanceof Error ? err : new Error("An unexpected error occurred.");
135
139
  }
136
140
  }
141
+ async tagLocal(imageInput, options) {
142
+ try {
143
+ // Dynamic import to avoid loading transformers if not used
144
+ const { AutoProcessor, AutoModelForImageTextToText, RawImage, env } = await Promise.resolve().then(() => __importStar(require('@huggingface/transformers')));
145
+ // Configure transformers.js for browser usage
146
+ env.allowRemoteModels = true;
147
+ const processor = await AutoProcessor.from_pretrained('onnx-community/FastVLM-0.5B-ONNX');
148
+ const model = await AutoModelForImageTextToText.from_pretrained('onnx-community/FastVLM-0.5B-ONNX', {
149
+ dtype: {
150
+ embed_tokens: "fp16",
151
+ vision_encoder: "q4",
152
+ decoder_model_merged: "q4",
153
+ },
154
+ });
155
+ let rawImg;
156
+ if (imageInput.startsWith('http')) {
157
+ rawImg = await RawImage.fromURL(imageInput);
158
+ }
159
+ else if (imageInput.startsWith('data:image')) {
160
+ const base64Data = imageInput.split(',')[1];
161
+ const binary = atob(base64Data);
162
+ const array = new Uint8Array(binary.length);
163
+ for (let i = 0; i < binary.length; i++) {
164
+ array[i] = binary.charCodeAt(i);
165
+ }
166
+ rawImg = await RawImage.fromBlob(new Blob([array]));
167
+ }
168
+ else {
169
+ rawImg = await RawImage.read(imageInput);
170
+ }
171
+ let systemPrompt = `You are a precise image tagging AI.
172
+ Rules:
173
+ 1. Return ONLY a comma-separated list of tags.
174
+ 2. DO NOT provide any conversational text, introductions, or explanations.
175
+ 3. DO NOT use full sentences.
176
+ 4. If options are provided, select strictly from them.
177
+
178
+ Example 1:
179
+ What animals are in the image?
180
+ Example Output:
181
+ cat
182
+
183
+ Example 2:
184
+ What color cars are visible in the image?
185
+ Output:
186
+ red, blue, green
187
+
188
+ Example 3:
189
+ What is the jersey number of the player?
190
+ Output:
191
+ 10
192
+ `;
193
+ let userPrompt = `${options.prompt}`;
194
+ if (options.options && options.options.length > 0) {
195
+ userPrompt += `\nSelect from these options: ${options.options.join(', ')}.`;
196
+ }
197
+ if (!options.multiple) {
198
+ userPrompt += `\nReturn a single tag.`;
199
+ }
200
+ const messages = [
201
+ {
202
+ role: 'system',
203
+ content: systemPrompt,
204
+ },
205
+ { role: 'user', content: `<image>\n${userPrompt}` },
206
+ ];
207
+ let promptText = processor.apply_chat_template(messages, {
208
+ add_generation_prompt: true,
209
+ });
210
+ if (typeof promptText === 'string') {
211
+ promptText += options.multiple ? "Tags: " : "Tag: ";
212
+ }
213
+ const inputs = await processor(rawImg, promptText, {
214
+ add_special_tokens: false,
215
+ });
216
+ const outputs = await model.generate({
217
+ ...inputs,
218
+ max_new_tokens: 120,
219
+ do_sample: false,
220
+ repetition_penalty: 1.2,
221
+ });
222
+ const decoded = processor.batch_decode(outputs.slice(null, [inputs.input_ids.dims.at(-1), null]), { skip_special_tokens: true });
223
+ let resultText = decoded[0].trim();
224
+ // Cleanup potential repetition of priming token
225
+ resultText = resultText.replace(/^(Tags?:\s*)/i, '');
226
+ let tags = [];
227
+ if (options.multiple) {
228
+ tags = resultText.split(',').map(s => s.trim()).filter(s => s.length > 0);
229
+ }
230
+ else {
231
+ tags = [resultText];
232
+ }
233
+ return {
234
+ tags,
235
+ raw: { result: resultText }
236
+ };
237
+ }
238
+ catch (err) {
239
+ console.error(err);
240
+ throw err instanceof Error ? err : new Error("Local tagging failed: " + String(err));
241
+ }
242
+ }
137
243
  async embed(input, options) {
138
244
  const mode = options?.mode || 'api';
139
245
  if (mode === 'local') {
@@ -176,7 +282,6 @@ class CoreViz {
176
282
  // Dynamic import to avoid loading transformers if not used
177
283
  const { AutoTokenizer, AutoProcessor, CLIPTextModelWithProjection, CLIPVisionModelWithProjection, RawImage } = await Promise.resolve().then(() => __importStar(require('@huggingface/transformers')));
178
284
  const MODEL_ID = 'Xenova/clip-vit-large-patch14';
179
- console.log(`Loading local model ${MODEL_ID}...`);
180
285
  const start = Date.now();
181
286
  // Load tokenizer and processor
182
287
  const tokenizer = await AutoTokenizer.from_pretrained(MODEL_ID);
@@ -188,7 +293,6 @@ class CoreViz {
188
293
  const vision_model = await CLIPVisionModelWithProjection.from_pretrained(MODEL_ID, {
189
294
  dtype: 'q4',
190
295
  });
191
- console.log(`Model loaded in ${Date.now() - start}ms`);
192
296
  // Check if input is likely an image
193
297
  const isImage = options?.type === 'image' ||
194
298
  input.startsWith('data:image') ||
@@ -241,5 +345,20 @@ class CoreViz {
241
345
  async resize(input, maxWidth, maxHeight) {
242
346
  return (0, resize_1.resize)(input, maxWidth, maxHeight);
243
347
  }
348
+ similarity(vecA, vecB) {
349
+ if (vecA.length !== vecB.length)
350
+ return 0;
351
+ let dotProduct = 0;
352
+ let normA = 0;
353
+ let normB = 0;
354
+ for (let i = 0; i < vecA.length; i++) {
355
+ dotProduct += vecA[i] * vecB[i];
356
+ normA += vecA[i] * vecA[i];
357
+ normB += vecB[i] * vecB[i];
358
+ }
359
+ if (normA === 0 || normB === 0)
360
+ return 0;
361
+ return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
362
+ }
244
363
  }
245
364
  exports.CoreViz = CoreViz;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@coreviz/sdk",
3
- "version": "1.0.8",
3
+ "version": "1.0.10",
4
4
  "description": "CoreViz SDK",
5
5
  "main": "dist/index.js",
6
6
  "types": "dist/index.d.ts",