@coreviz/sdk 1.0.9 → 1.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/coreviz.d.ts +3 -0
- package/dist/coreviz.js +121 -0
- package/package.json +1 -1
package/dist/coreviz.d.ts
CHANGED
|
@@ -14,6 +14,7 @@ export interface TagOptions {
|
|
|
14
14
|
prompt: string;
|
|
15
15
|
options?: string[];
|
|
16
16
|
multiple?: boolean;
|
|
17
|
+
mode?: 'api' | 'local';
|
|
17
18
|
}
|
|
18
19
|
export interface TagResponse {
|
|
19
20
|
tags: string[];
|
|
@@ -35,7 +36,9 @@ export declare class CoreViz {
|
|
|
35
36
|
describe(image: string, options?: DescribeOptions): Promise<string>;
|
|
36
37
|
edit(image: string, options: EditOptions): Promise<string>;
|
|
37
38
|
tag(image: string, options: TagOptions): Promise<TagResponse>;
|
|
39
|
+
private tagLocal;
|
|
38
40
|
embed(input: string, options?: EmbedOptions): Promise<EmbedResponse>;
|
|
39
41
|
private embedLocal;
|
|
40
42
|
resize(input: string | File, maxWidth?: number, maxHeight?: number): Promise<string>;
|
|
43
|
+
similarity(vecA: number[], vecB: number[]): number;
|
|
41
44
|
}
|
package/dist/coreviz.js
CHANGED
|
@@ -104,6 +104,10 @@ class CoreViz {
|
|
|
104
104
|
}
|
|
105
105
|
}
|
|
106
106
|
async tag(image, options) {
|
|
107
|
+
const mode = options?.mode || 'api';
|
|
108
|
+
if (mode === 'local') {
|
|
109
|
+
return this.tagLocal(image, options);
|
|
110
|
+
}
|
|
107
111
|
try {
|
|
108
112
|
const resizedImage = await (0, resize_1.resize)(image);
|
|
109
113
|
const headers = this.getHeaders();
|
|
@@ -134,6 +138,108 @@ class CoreViz {
|
|
|
134
138
|
throw err instanceof Error ? err : new Error("An unexpected error occurred.");
|
|
135
139
|
}
|
|
136
140
|
}
|
|
141
|
+
async tagLocal(imageInput, options) {
|
|
142
|
+
try {
|
|
143
|
+
// Dynamic import to avoid loading transformers if not used
|
|
144
|
+
const { AutoProcessor, AutoModelForImageTextToText, RawImage, env } = await Promise.resolve().then(() => __importStar(require('@huggingface/transformers')));
|
|
145
|
+
// Configure transformers.js for browser usage
|
|
146
|
+
env.allowRemoteModels = true;
|
|
147
|
+
const processor = await AutoProcessor.from_pretrained('onnx-community/FastVLM-0.5B-ONNX');
|
|
148
|
+
const model = await AutoModelForImageTextToText.from_pretrained('onnx-community/FastVLM-0.5B-ONNX', {
|
|
149
|
+
dtype: {
|
|
150
|
+
embed_tokens: "fp16",
|
|
151
|
+
vision_encoder: "q4",
|
|
152
|
+
decoder_model_merged: "q4",
|
|
153
|
+
},
|
|
154
|
+
});
|
|
155
|
+
let rawImg;
|
|
156
|
+
if (imageInput.startsWith('http')) {
|
|
157
|
+
rawImg = await RawImage.fromURL(imageInput);
|
|
158
|
+
}
|
|
159
|
+
else if (imageInput.startsWith('data:image')) {
|
|
160
|
+
const base64Data = imageInput.split(',')[1];
|
|
161
|
+
const binary = atob(base64Data);
|
|
162
|
+
const array = new Uint8Array(binary.length);
|
|
163
|
+
for (let i = 0; i < binary.length; i++) {
|
|
164
|
+
array[i] = binary.charCodeAt(i);
|
|
165
|
+
}
|
|
166
|
+
rawImg = await RawImage.fromBlob(new Blob([array]));
|
|
167
|
+
}
|
|
168
|
+
else {
|
|
169
|
+
rawImg = await RawImage.read(imageInput);
|
|
170
|
+
}
|
|
171
|
+
let systemPrompt = `You are a precise image tagging AI.
|
|
172
|
+
Rules:
|
|
173
|
+
1. Return ONLY a comma-separated list of tags.
|
|
174
|
+
2. DO NOT provide any conversational text, introductions, or explanations.
|
|
175
|
+
3. DO NOT use full sentences.
|
|
176
|
+
4. If options are provided, select strictly from them.
|
|
177
|
+
|
|
178
|
+
Example 1:
|
|
179
|
+
What animals are in the image?
|
|
180
|
+
Example Output:
|
|
181
|
+
cat
|
|
182
|
+
|
|
183
|
+
Example 2:
|
|
184
|
+
What color cars are visible in the image?
|
|
185
|
+
Output:
|
|
186
|
+
red, blue, green
|
|
187
|
+
|
|
188
|
+
Example 3:
|
|
189
|
+
What is the jersey number of the player?
|
|
190
|
+
Output:
|
|
191
|
+
10
|
|
192
|
+
`;
|
|
193
|
+
let userPrompt = `${options.prompt}`;
|
|
194
|
+
if (options.options && options.options.length > 0) {
|
|
195
|
+
userPrompt += `\nSelect from these options: ${options.options.join(', ')}.`;
|
|
196
|
+
}
|
|
197
|
+
if (!options.multiple) {
|
|
198
|
+
userPrompt += `\nReturn a single tag.`;
|
|
199
|
+
}
|
|
200
|
+
const messages = [
|
|
201
|
+
{
|
|
202
|
+
role: 'system',
|
|
203
|
+
content: systemPrompt,
|
|
204
|
+
},
|
|
205
|
+
{ role: 'user', content: `<image>\n${userPrompt}` },
|
|
206
|
+
];
|
|
207
|
+
let promptText = processor.apply_chat_template(messages, {
|
|
208
|
+
add_generation_prompt: true,
|
|
209
|
+
});
|
|
210
|
+
if (typeof promptText === 'string') {
|
|
211
|
+
promptText += options.multiple ? "Tags: " : "Tag: ";
|
|
212
|
+
}
|
|
213
|
+
const inputs = await processor(rawImg, promptText, {
|
|
214
|
+
add_special_tokens: false,
|
|
215
|
+
});
|
|
216
|
+
const outputs = await model.generate({
|
|
217
|
+
...inputs,
|
|
218
|
+
max_new_tokens: 120,
|
|
219
|
+
do_sample: false,
|
|
220
|
+
repetition_penalty: 1.2,
|
|
221
|
+
});
|
|
222
|
+
const decoded = processor.batch_decode(outputs.slice(null, [inputs.input_ids.dims.at(-1), null]), { skip_special_tokens: true });
|
|
223
|
+
let resultText = decoded[0].trim();
|
|
224
|
+
// Cleanup potential repetition of priming token
|
|
225
|
+
resultText = resultText.replace(/^(Tags?:\s*)/i, '');
|
|
226
|
+
let tags = [];
|
|
227
|
+
if (options.multiple) {
|
|
228
|
+
tags = resultText.split(',').map(s => s.trim()).filter(s => s.length > 0);
|
|
229
|
+
}
|
|
230
|
+
else {
|
|
231
|
+
tags = [resultText];
|
|
232
|
+
}
|
|
233
|
+
return {
|
|
234
|
+
tags,
|
|
235
|
+
raw: { result: resultText }
|
|
236
|
+
};
|
|
237
|
+
}
|
|
238
|
+
catch (err) {
|
|
239
|
+
console.error(err);
|
|
240
|
+
throw err instanceof Error ? err : new Error("Local tagging failed: " + String(err));
|
|
241
|
+
}
|
|
242
|
+
}
|
|
137
243
|
async embed(input, options) {
|
|
138
244
|
const mode = options?.mode || 'api';
|
|
139
245
|
if (mode === 'local') {
|
|
@@ -239,5 +345,20 @@ class CoreViz {
|
|
|
239
345
|
async resize(input, maxWidth, maxHeight) {
|
|
240
346
|
return (0, resize_1.resize)(input, maxWidth, maxHeight);
|
|
241
347
|
}
|
|
348
|
+
similarity(vecA, vecB) {
|
|
349
|
+
if (vecA.length !== vecB.length)
|
|
350
|
+
return 0;
|
|
351
|
+
let dotProduct = 0;
|
|
352
|
+
let normA = 0;
|
|
353
|
+
let normB = 0;
|
|
354
|
+
for (let i = 0; i < vecA.length; i++) {
|
|
355
|
+
dotProduct += vecA[i] * vecB[i];
|
|
356
|
+
normA += vecA[i] * vecA[i];
|
|
357
|
+
normB += vecB[i] * vecB[i];
|
|
358
|
+
}
|
|
359
|
+
if (normA === 0 || normB === 0)
|
|
360
|
+
return 0;
|
|
361
|
+
return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
|
|
362
|
+
}
|
|
242
363
|
}
|
|
243
364
|
exports.CoreViz = CoreViz;
|