@coreviz/sdk 1.0.8 → 1.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/coreviz.d.ts +3 -0
- package/dist/coreviz.js +121 -2
- package/package.json +1 -1
package/dist/coreviz.d.ts
CHANGED
|
@@ -14,6 +14,7 @@ export interface TagOptions {
|
|
|
14
14
|
prompt: string;
|
|
15
15
|
options?: string[];
|
|
16
16
|
multiple?: boolean;
|
|
17
|
+
mode?: 'api' | 'local';
|
|
17
18
|
}
|
|
18
19
|
export interface TagResponse {
|
|
19
20
|
tags: string[];
|
|
@@ -35,7 +36,9 @@ export declare class CoreViz {
|
|
|
35
36
|
describe(image: string, options?: DescribeOptions): Promise<string>;
|
|
36
37
|
edit(image: string, options: EditOptions): Promise<string>;
|
|
37
38
|
tag(image: string, options: TagOptions): Promise<TagResponse>;
|
|
39
|
+
private tagLocal;
|
|
38
40
|
embed(input: string, options?: EmbedOptions): Promise<EmbedResponse>;
|
|
39
41
|
private embedLocal;
|
|
40
42
|
resize(input: string | File, maxWidth?: number, maxHeight?: number): Promise<string>;
|
|
43
|
+
similarity(vecA: number[], vecB: number[]): number;
|
|
41
44
|
}
|
package/dist/coreviz.js
CHANGED
|
@@ -104,6 +104,10 @@ class CoreViz {
|
|
|
104
104
|
}
|
|
105
105
|
}
|
|
106
106
|
async tag(image, options) {
|
|
107
|
+
const mode = options?.mode || 'api';
|
|
108
|
+
if (mode === 'local') {
|
|
109
|
+
return this.tagLocal(image, options);
|
|
110
|
+
}
|
|
107
111
|
try {
|
|
108
112
|
const resizedImage = await (0, resize_1.resize)(image);
|
|
109
113
|
const headers = this.getHeaders();
|
|
@@ -134,6 +138,108 @@ class CoreViz {
|
|
|
134
138
|
throw err instanceof Error ? err : new Error("An unexpected error occurred.");
|
|
135
139
|
}
|
|
136
140
|
}
|
|
141
|
+
async tagLocal(imageInput, options) {
|
|
142
|
+
try {
|
|
143
|
+
// Dynamic import to avoid loading transformers if not used
|
|
144
|
+
const { AutoProcessor, AutoModelForImageTextToText, RawImage, env } = await Promise.resolve().then(() => __importStar(require('@huggingface/transformers')));
|
|
145
|
+
// Configure transformers.js for browser usage
|
|
146
|
+
env.allowRemoteModels = true;
|
|
147
|
+
const processor = await AutoProcessor.from_pretrained('onnx-community/FastVLM-0.5B-ONNX');
|
|
148
|
+
const model = await AutoModelForImageTextToText.from_pretrained('onnx-community/FastVLM-0.5B-ONNX', {
|
|
149
|
+
dtype: {
|
|
150
|
+
embed_tokens: "fp16",
|
|
151
|
+
vision_encoder: "q4",
|
|
152
|
+
decoder_model_merged: "q4",
|
|
153
|
+
},
|
|
154
|
+
});
|
|
155
|
+
let rawImg;
|
|
156
|
+
if (imageInput.startsWith('http')) {
|
|
157
|
+
rawImg = await RawImage.fromURL(imageInput);
|
|
158
|
+
}
|
|
159
|
+
else if (imageInput.startsWith('data:image')) {
|
|
160
|
+
const base64Data = imageInput.split(',')[1];
|
|
161
|
+
const binary = atob(base64Data);
|
|
162
|
+
const array = new Uint8Array(binary.length);
|
|
163
|
+
for (let i = 0; i < binary.length; i++) {
|
|
164
|
+
array[i] = binary.charCodeAt(i);
|
|
165
|
+
}
|
|
166
|
+
rawImg = await RawImage.fromBlob(new Blob([array]));
|
|
167
|
+
}
|
|
168
|
+
else {
|
|
169
|
+
rawImg = await RawImage.read(imageInput);
|
|
170
|
+
}
|
|
171
|
+
let systemPrompt = `You are a precise image tagging AI.
|
|
172
|
+
Rules:
|
|
173
|
+
1. Return ONLY a comma-separated list of tags.
|
|
174
|
+
2. DO NOT provide any conversational text, introductions, or explanations.
|
|
175
|
+
3. DO NOT use full sentences.
|
|
176
|
+
4. If options are provided, select strictly from them.
|
|
177
|
+
|
|
178
|
+
Example 1:
|
|
179
|
+
What animals are in the image?
|
|
180
|
+
Example Output:
|
|
181
|
+
cat
|
|
182
|
+
|
|
183
|
+
Example 2:
|
|
184
|
+
What color cars are visible in the image?
|
|
185
|
+
Output:
|
|
186
|
+
red, blue, green
|
|
187
|
+
|
|
188
|
+
Example 3:
|
|
189
|
+
What is the jersey number of the player?
|
|
190
|
+
Output:
|
|
191
|
+
10
|
|
192
|
+
`;
|
|
193
|
+
let userPrompt = `${options.prompt}`;
|
|
194
|
+
if (options.options && options.options.length > 0) {
|
|
195
|
+
userPrompt += `\nSelect from these options: ${options.options.join(', ')}.`;
|
|
196
|
+
}
|
|
197
|
+
if (!options.multiple) {
|
|
198
|
+
userPrompt += `\nReturn a single tag.`;
|
|
199
|
+
}
|
|
200
|
+
const messages = [
|
|
201
|
+
{
|
|
202
|
+
role: 'system',
|
|
203
|
+
content: systemPrompt,
|
|
204
|
+
},
|
|
205
|
+
{ role: 'user', content: `<image>\n${userPrompt}` },
|
|
206
|
+
];
|
|
207
|
+
let promptText = processor.apply_chat_template(messages, {
|
|
208
|
+
add_generation_prompt: true,
|
|
209
|
+
});
|
|
210
|
+
if (typeof promptText === 'string') {
|
|
211
|
+
promptText += options.multiple ? "Tags: " : "Tag: ";
|
|
212
|
+
}
|
|
213
|
+
const inputs = await processor(rawImg, promptText, {
|
|
214
|
+
add_special_tokens: false,
|
|
215
|
+
});
|
|
216
|
+
const outputs = await model.generate({
|
|
217
|
+
...inputs,
|
|
218
|
+
max_new_tokens: 120,
|
|
219
|
+
do_sample: false,
|
|
220
|
+
repetition_penalty: 1.2,
|
|
221
|
+
});
|
|
222
|
+
const decoded = processor.batch_decode(outputs.slice(null, [inputs.input_ids.dims.at(-1), null]), { skip_special_tokens: true });
|
|
223
|
+
let resultText = decoded[0].trim();
|
|
224
|
+
// Cleanup potential repetition of priming token
|
|
225
|
+
resultText = resultText.replace(/^(Tags?:\s*)/i, '');
|
|
226
|
+
let tags = [];
|
|
227
|
+
if (options.multiple) {
|
|
228
|
+
tags = resultText.split(',').map(s => s.trim()).filter(s => s.length > 0);
|
|
229
|
+
}
|
|
230
|
+
else {
|
|
231
|
+
tags = [resultText];
|
|
232
|
+
}
|
|
233
|
+
return {
|
|
234
|
+
tags,
|
|
235
|
+
raw: { result: resultText }
|
|
236
|
+
};
|
|
237
|
+
}
|
|
238
|
+
catch (err) {
|
|
239
|
+
console.error(err);
|
|
240
|
+
throw err instanceof Error ? err : new Error("Local tagging failed: " + String(err));
|
|
241
|
+
}
|
|
242
|
+
}
|
|
137
243
|
async embed(input, options) {
|
|
138
244
|
const mode = options?.mode || 'api';
|
|
139
245
|
if (mode === 'local') {
|
|
@@ -176,7 +282,6 @@ class CoreViz {
|
|
|
176
282
|
// Dynamic import to avoid loading transformers if not used
|
|
177
283
|
const { AutoTokenizer, AutoProcessor, CLIPTextModelWithProjection, CLIPVisionModelWithProjection, RawImage } = await Promise.resolve().then(() => __importStar(require('@huggingface/transformers')));
|
|
178
284
|
const MODEL_ID = 'Xenova/clip-vit-large-patch14';
|
|
179
|
-
console.log(`Loading local model ${MODEL_ID}...`);
|
|
180
285
|
const start = Date.now();
|
|
181
286
|
// Load tokenizer and processor
|
|
182
287
|
const tokenizer = await AutoTokenizer.from_pretrained(MODEL_ID);
|
|
@@ -188,7 +293,6 @@ class CoreViz {
|
|
|
188
293
|
const vision_model = await CLIPVisionModelWithProjection.from_pretrained(MODEL_ID, {
|
|
189
294
|
dtype: 'q4',
|
|
190
295
|
});
|
|
191
|
-
console.log(`Model loaded in ${Date.now() - start}ms`);
|
|
192
296
|
// Check if input is likely an image
|
|
193
297
|
const isImage = options?.type === 'image' ||
|
|
194
298
|
input.startsWith('data:image') ||
|
|
@@ -241,5 +345,20 @@ class CoreViz {
|
|
|
241
345
|
async resize(input, maxWidth, maxHeight) {
|
|
242
346
|
return (0, resize_1.resize)(input, maxWidth, maxHeight);
|
|
243
347
|
}
|
|
348
|
+
similarity(vecA, vecB) {
|
|
349
|
+
if (vecA.length !== vecB.length)
|
|
350
|
+
return 0;
|
|
351
|
+
let dotProduct = 0;
|
|
352
|
+
let normA = 0;
|
|
353
|
+
let normB = 0;
|
|
354
|
+
for (let i = 0; i < vecA.length; i++) {
|
|
355
|
+
dotProduct += vecA[i] * vecB[i];
|
|
356
|
+
normA += vecA[i] * vecA[i];
|
|
357
|
+
normB += vecB[i] * vecB[i];
|
|
358
|
+
}
|
|
359
|
+
if (normA === 0 || normB === 0)
|
|
360
|
+
return 0;
|
|
361
|
+
return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
|
|
362
|
+
}
|
|
244
363
|
}
|
|
245
364
|
exports.CoreViz = CoreViz;
|