@memvid/sdk 2.0.113
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +190 -0
- package/README.md +244 -0
- package/dist/__tests__/basic.test.d.ts +1 -0
- package/dist/__tests__/basic.test.js +41 -0
- package/dist/adapters/autogen.d.ts +23 -0
- package/dist/adapters/autogen.js +163 -0
- package/dist/adapters/basic.d.ts +1 -0
- package/dist/adapters/basic.js +11 -0
- package/dist/adapters/crewai.d.ts +23 -0
- package/dist/adapters/crewai.js +160 -0
- package/dist/adapters/google_adk.d.ts +25 -0
- package/dist/adapters/google_adk.js +158 -0
- package/dist/adapters/haystack.d.ts +1 -0
- package/dist/adapters/haystack.js +11 -0
- package/dist/adapters/langchain.d.ts +28 -0
- package/dist/adapters/langchain.js +156 -0
- package/dist/adapters/langgraph.d.ts +1 -0
- package/dist/adapters/langgraph.js +11 -0
- package/dist/adapters/llamaindex.d.ts +33 -0
- package/dist/adapters/llamaindex.js +195 -0
- package/dist/adapters/mcp.d.ts +1 -0
- package/dist/adapters/mcp.js +11 -0
- package/dist/adapters/openai.d.ts +26 -0
- package/dist/adapters/openai.js +169 -0
- package/dist/adapters/semantic_kernel.d.ts +1 -0
- package/dist/adapters/semantic_kernel.js +11 -0
- package/dist/adapters/vercel_ai.d.ts +27 -0
- package/dist/adapters/vercel_ai.js +148 -0
- package/dist/clip.d.ts +182 -0
- package/dist/clip.js +371 -0
- package/dist/embeddings.d.ts +156 -0
- package/dist/embeddings.js +289 -0
- package/dist/entities.d.ts +251 -0
- package/dist/entities.js +489 -0
- package/dist/error.d.ts +91 -0
- package/dist/error.js +203 -0
- package/dist/index.d.ts +53 -0
- package/dist/index.js +458 -0
- package/dist/noop.d.ts +2 -0
- package/dist/noop.js +55 -0
- package/dist/registry.d.ts +5 -0
- package/dist/registry.js +53 -0
- package/dist/types.d.ts +275 -0
- package/dist/types.js +2 -0
- package/index.node +0 -0
- package/package.json +81 -0
package/dist/clip.d.ts
ADDED
@@ -0,0 +1,182 @@
/**
 * CLIP visual embedding provider support for Memvid SDK (Node.js).
 *
 * Providers:
 * - LocalClip: MobileCLIP-S2 (ONNX, offline)
 * - OpenAIClip: GPT-4V + text embeddings (cloud)
 * - GeminiClip: Google Gemini multimodal (cloud)
 *
 * @example
 * ```typescript
 * import { create } from 'memvid-sdk';
 * import { getClipProvider, LocalClip, OpenAIClip } from 'memvid-sdk/clip';
 *
 * // Local CLIP (default)
 * const clip = getClipProvider('local');
 *
 * // Or with cloud provider
 * const clip = getClipProvider('openai');
 *
 * // Create memory and store images
 * const mem = await create('gallery.mv2', 'basic');
 * const embedding = await clip.embedImage('photo.jpg');
 * await mem.put({ title: 'Beach', file: 'photo.jpg', clipEmbedding: embedding });
 *
 * // Search by text
 * const queryEmbedding = await clip.embedText('sunset over ocean');
 * const results = await mem.find('sunset', { queryEmbedding, mode: 'clip' });
 * ```
 */
/**
 * Abstract interface for CLIP embedding providers.
 */
export interface ClipProvider {
    /** Provider name (e.g., 'local:mobileclip-s2'). */
    readonly name: string;
    /** Embedding dimension for this model. */
    readonly dimension: number;
    /**
     * Generate embedding for a single image.
     * @param imagePath - Path to the image file
     * @returns Promise resolving to embedding vector
     */
    embedImage(imagePath: string): Promise<number[]>;
    /**
     * Generate embedding for text (for text-to-image search).
     * @param text - Text description to embed
     * @returns Promise resolving to embedding vector
     */
    embedText(text: string): Promise<number[]>;
    /**
     * Generate embeddings for multiple images.
     * @param imagePaths - Paths to image files
     * @returns Promise resolving to list of embedding vectors
     */
    embedImages(imagePaths: string[]): Promise<number[][]>;
}
/**
 * Model dimension mappings.
 */
export declare const CLIP_MODEL_DIMENSIONS: Record<string, number>;
/**
 * LocalClip configuration options.
 */
export interface LocalClipConfig {
    /** Model to use. Default: 'mobileclip-s2' */
    model?: string;
}
/**
 * Local MobileCLIP provider using ONNX runtime.
 *
 * Uses MobileCLIP-S2 for fast, offline visual embeddings.
 * Model auto-downloads on first use.
 */
export declare class LocalClip implements ClipProvider {
    private readonly _model;
    private _nativeModel;
    constructor(config?: LocalClipConfig);
    get name(): string;
    get dimension(): number;
    private _getModel;
    embedImage(imagePath: string): Promise<number[]>;
    embedText(text: string): Promise<number[]>;
    embedImages(imagePaths: string[]): Promise<number[][]>;
}
/**
 * OpenAIClip configuration options.
 */
export interface OpenAIClipConfig {
    /** OpenAI API key. If not provided, uses OPENAI_API_KEY env var. */
    apiKey?: string;
    /** Embedding model. Default: 'text-embedding-3-small' */
    embeddingModel?: string;
    /** Vision model for image description. Default: 'gpt-4o-mini' */
    visionModel?: string;
}
/**
 * OpenAI CLIP-style provider using GPT-4V and text embeddings.
 *
 * Workflow:
 * 1. Image -> GPT-4V generates description
 * 2. Description -> text-embedding-3 -> Embedding vector
 */
export declare class OpenAIClip implements ClipProvider {
    private readonly _apiKey;
    private readonly _embeddingModel;
    private readonly _visionModel;
    constructor(config?: OpenAIClipConfig);
    get name(): string;
    get dimension(): number;
    private _describeImage;
    embedImage(imagePath: string): Promise<number[]>;
    embedText(text: string): Promise<number[]>;
    embedImages(imagePaths: string[]): Promise<number[][]>;
}
/**
 * GeminiClip configuration options.
 */
export interface GeminiClipConfig {
    /** Google AI API key. If not provided, uses GEMINI_API_KEY env var. */
    apiKey?: string;
    /** Model to use. Default: 'gemini-2.0-flash' */
    model?: string;
}
/**
 * Google Gemini multimodal provider.
 */
export declare class GeminiClip implements ClipProvider {
    private readonly _apiKey;
    private readonly _model;
    constructor(config?: GeminiClipConfig);
    get name(): string;
    get dimension(): number;
    private _describeImage;
    embedImage(imagePath: string): Promise<number[]>;
    embedText(text: string): Promise<number[]>;
    embedImages(imagePaths: string[]): Promise<number[][]>;
}
/**
 * Configuration options for getClipProvider factory.
 */
export interface ClipProviderConfig {
    /** Model name (provider-specific). */
    model?: string;
    /** API key for cloud providers. */
    apiKey?: string;
    /** Embedding model (OpenAI). */
    embeddingModel?: string;
    /** Vision model (OpenAI). */
    visionModel?: string;
}
/**
 * Factory function to create a CLIP provider.
 *
 * @param provider - Provider specification. Can be:
 * - Simple: 'local', 'openai', 'gemini'
 * - With model: 'openai:gpt-4o-mini', 'gemini:gemini-2.0-flash'
 * @param config - Provider-specific configuration
 * @returns ClipProvider instance
 *
 * @example
 * ```typescript
 * // Simple provider
 * const clip = getClipProvider('local');
 * const clip = getClipProvider('openai');
 *
 * // Provider with model specification
 * const clip = getClipProvider('openai:gpt-4o-mini');
 * const clip = getClipProvider('gemini:gemini-2.0-flash');
 *
 * // With config override
 * const clip = getClipProvider('openai', { embeddingModel: 'text-embedding-3-large' });
 * ```
 */
export declare function getClipProvider(provider?: string, config?: ClipProviderConfig): ClipProvider;
declare const _default: {
    LocalClip: typeof LocalClip;
    OpenAIClip: typeof OpenAIClip;
    GeminiClip: typeof GeminiClip;
    getClipProvider: typeof getClipProvider;
    CLIP_MODEL_DIMENSIONS: Record<string, number>;
};
export default _default;
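Every provider in the declarations above exposes the same surface (`name`, `dimension`, `embedImage`, `embedText`, `embedImages`), so text-to-image search reduces to comparing the vectors they return. A minimal sketch of that comparison outside of `mem.find` follows; the `cosine` and `rankImagesByText` helpers are illustrative and not part of the package, and the `memvid-sdk/clip` import subpath is assumed from the package's own doc examples:

```typescript
import { getClipProvider } from 'memvid-sdk/clip';

// Illustrative helper: cosine similarity between two equal-length vectors.
function cosine(a: number[], b: number[]): number {
  let dot = 0, na = 0, nb = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    na += a[i] * a[i];
    nb += b[i] * b[i];
  }
  return dot / (Math.sqrt(na) * Math.sqrt(nb) || 1);
}

// Illustrative helper: rank images against a text query using any ClipProvider.
async function rankImagesByText(query: string, imagePaths: string[]) {
  const clip = getClipProvider('local'); // or 'openai' / 'gemini'
  const queryVec = await clip.embedText(query);
  const imageVecs = await clip.embedImages(imagePaths);
  return imagePaths
    .map((p, i) => ({ path: p, score: cosine(queryVec, imageVecs[i]) }))
    .sort((a, b) => b.score - a.score);
}
```

Scores are only comparable when both vectors come from the same provider, since `dimension` differs across models (512 for MobileCLIP-S2, 1536 for text-embedding-3-small, 768 for Gemini).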
package/dist/clip.js
ADDED
@@ -0,0 +1,371 @@
"use strict";
/**
 * CLIP visual embedding provider support for Memvid SDK (Node.js).
 *
 * Providers:
 * - LocalClip: MobileCLIP-S2 (ONNX, offline)
 * - OpenAIClip: GPT-4V + text embeddings (cloud)
 * - GeminiClip: Google Gemini multimodal (cloud)
 *
 * @example
 * ```typescript
 * import { create } from 'memvid-sdk';
 * import { getClipProvider, LocalClip, OpenAIClip } from 'memvid-sdk/clip';
 *
 * // Local CLIP (default)
 * const clip = getClipProvider('local');
 *
 * // Or with cloud provider
 * const clip = getClipProvider('openai');
 *
 * // Create memory and store images
 * const mem = await create('gallery.mv2', 'basic');
 * const embedding = await clip.embedImage('photo.jpg');
 * await mem.put({ title: 'Beach', file: 'photo.jpg', clipEmbedding: embedding });
 *
 * // Search by text
 * const queryEmbedding = await clip.embedText('sunset over ocean');
 * const results = await mem.find('sunset', { queryEmbedding, mode: 'clip' });
 * ```
 */
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
      desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.GeminiClip = exports.OpenAIClip = exports.LocalClip = exports.CLIP_MODEL_DIMENSIONS = void 0;
exports.getClipProvider = getClipProvider;
const fs = __importStar(require("fs"));
const path = __importStar(require("path"));
/**
 * Model dimension mappings.
 */
exports.CLIP_MODEL_DIMENSIONS = {
    'mobileclip-s2': 512,
    'mobileclip-s2-fp16': 512,
    'siglip-base': 768,
    'text-embedding-3-small': 1536,
    'text-embedding-3-large': 3072,
};
/**
 * Local MobileCLIP provider using ONNX runtime.
 *
 * Uses MobileCLIP-S2 for fast, offline visual embeddings.
 * Model auto-downloads on first use.
 */
class LocalClip {
    constructor(config = {}) {
        this._nativeModel = null;
        this._model = config.model || 'mobileclip-s2';
    }
    get name() {
        return `local:${this._model}`;
    }
    get dimension() {
        return exports.CLIP_MODEL_DIMENSIONS[this._model] || 512;
    }
    async _getModel() {
        if (this._nativeModel === null) {
            try {
                // Import native bindings - use relative path for development
                const native = require('../index.node');
                if (!native.ClipModel) {
                    throw new Error('ClipModel not exported from native module');
                }
                this._nativeModel = new native.ClipModel();
            }
            catch (e) {
                throw new Error(`Local CLIP support requires memvid-sdk with CLIP feature. ` +
                    `The model will auto-download on first use (~200 MB). Error: ${e}`);
            }
        }
        return this._nativeModel;
    }
    async embedImage(imagePath) {
        const model = await this._getModel();
        return Array.from(await model.embedImage(imagePath));
    }
    async embedText(text) {
        const model = await this._getModel();
        return Array.from(await model.embedText(text));
    }
    async embedImages(imagePaths) {
        const model = await this._getModel();
        const embeddings = await model.embedImages(imagePaths);
        return embeddings.map((e) => Array.from(e));
    }
}
exports.LocalClip = LocalClip;
/**
 * OpenAI CLIP-style provider using GPT-4V and text embeddings.
 *
 * Workflow:
 * 1. Image -> GPT-4V generates description
 * 2. Description -> text-embedding-3 -> Embedding vector
 */
class OpenAIClip {
    constructor(config = {}) {
        this._apiKey = config.apiKey || process.env.OPENAI_API_KEY || '';
        if (!this._apiKey) {
            throw new Error('OpenAI API key required. Pass apiKey or set OPENAI_API_KEY environment variable.');
        }
        this._embeddingModel = config.embeddingModel || 'text-embedding-3-small';
        this._visionModel = config.visionModel || 'gpt-4o-mini';
    }
    get name() {
        return `openai:${this._embeddingModel}`;
    }
    get dimension() {
        return exports.CLIP_MODEL_DIMENSIONS[this._embeddingModel] || 1536;
    }
    async _describeImage(imagePath) {
        const imageData = fs.readFileSync(imagePath);
        const base64 = imageData.toString('base64');
        const ext = path.extname(imagePath).toLowerCase();
        const mimeTypes = {
            '.jpg': 'image/jpeg',
            '.jpeg': 'image/jpeg',
            '.png': 'image/png',
            '.gif': 'image/gif',
            '.webp': 'image/webp',
        };
        const mimeType = mimeTypes[ext] || 'image/jpeg';
        const response = await fetch('https://api.openai.com/v1/chat/completions', {
            method: 'POST',
            headers: {
                'Authorization': `Bearer ${this._apiKey}`,
                'Content-Type': 'application/json',
            },
            body: JSON.stringify({
                model: this._visionModel,
                messages: [{
                        role: 'user',
                        content: [
                            {
                                type: 'text',
                                text: 'Describe this image in 2-3 detailed sentences for visual search indexing. Focus on objects, colors, composition, and scene.',
                            },
                            {
                                type: 'image_url',
                                image_url: { url: `data:${mimeType};base64,${base64}` },
                            },
                        ],
                    }],
                max_tokens: 150,
            }),
        });
        if (!response.ok) {
            const error = await response.text();
            throw new Error(`OpenAI API error: ${response.status} ${error}`);
        }
        const data = await response.json();
        return data.choices[0]?.message?.content || '';
    }
    async embedImage(imagePath) {
        const description = await this._describeImage(imagePath);
        return this.embedText(description);
    }
    async embedText(text) {
        const response = await fetch('https://api.openai.com/v1/embeddings', {
            method: 'POST',
            headers: {
                'Authorization': `Bearer ${this._apiKey}`,
                'Content-Type': 'application/json',
            },
            body: JSON.stringify({
                model: this._embeddingModel,
                input: text,
            }),
        });
        if (!response.ok) {
            const error = await response.text();
            throw new Error(`OpenAI API error: ${response.status} ${error}`);
        }
        const data = await response.json();
        return data.data[0].embedding;
    }
    async embedImages(imagePaths) {
        // Process sequentially to avoid rate limits
        const embeddings = [];
        for (const imagePath of imagePaths) {
            embeddings.push(await this.embedImage(imagePath));
        }
        return embeddings;
    }
}
exports.OpenAIClip = OpenAIClip;
/**
 * Google Gemini multimodal provider.
 */
class GeminiClip {
    constructor(config = {}) {
        this._apiKey = config.apiKey || process.env.GEMINI_API_KEY || '';
        if (!this._apiKey) {
            throw new Error('Gemini API key required. Pass apiKey or set GEMINI_API_KEY environment variable.');
        }
        this._model = config.model || 'gemini-2.0-flash';
    }
    get name() {
        return `gemini:${this._model}`;
    }
    get dimension() {
        return 768; // Gemini embedding dimension
    }
    async _describeImage(imagePath) {
        const imageData = fs.readFileSync(imagePath);
        const base64 = imageData.toString('base64');
        const ext = path.extname(imagePath).toLowerCase();
        const mimeTypes = {
            '.jpg': 'image/jpeg',
            '.jpeg': 'image/jpeg',
            '.png': 'image/png',
            '.gif': 'image/gif',
            '.webp': 'image/webp',
        };
        const mimeType = mimeTypes[ext] || 'image/jpeg';
        const response = await fetch(`https://generativelanguage.googleapis.com/v1beta/models/${this._model}:generateContent?key=${this._apiKey}`, {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
                contents: [{
                        parts: [
                            { text: 'Describe this image in 2-3 detailed sentences for visual search indexing. Focus on objects, colors, composition, and scene.' },
                            { inline_data: { mime_type: mimeType, data: base64 } },
                        ],
                    }],
            }),
        });
        if (!response.ok) {
            const error = await response.text();
            throw new Error(`Gemini API error: ${response.status} ${error}`);
        }
        const data = await response.json();
        return data.candidates?.[0]?.content?.parts?.[0]?.text || '';
    }
    async embedImage(imagePath) {
        const description = await this._describeImage(imagePath);
        return this.embedText(description);
    }
    async embedText(text) {
        const response = await fetch(`https://generativelanguage.googleapis.com/v1beta/models/text-embedding-004:embedContent?key=${this._apiKey}`, {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
                model: 'models/text-embedding-004',
                content: { parts: [{ text }] },
                taskType: 'RETRIEVAL_DOCUMENT',
            }),
        });
        if (!response.ok) {
            const error = await response.text();
            throw new Error(`Gemini API error: ${response.status} ${error}`);
        }
        const data = await response.json();
        return data.embedding?.values || [];
    }
    async embedImages(imagePaths) {
        const embeddings = [];
        for (const imagePath of imagePaths) {
            embeddings.push(await this.embedImage(imagePath));
        }
        return embeddings;
    }
}
exports.GeminiClip = GeminiClip;
/**
 * Parse provider:model specification.
 * Examples:
 * "openai" -> ["openai", undefined]
 * "openai:gpt-4o-mini" -> ["openai", "gpt-4o-mini"]
 * "gemini:gemini-2.0-flash" -> ["gemini", "gemini-2.0-flash"]
 * "local:mobileclip-s2" -> ["local", "mobileclip-s2"]
 */
function parseProviderModel(spec) {
    if (spec.includes(':')) {
        const idx = spec.indexOf(':');
        return [spec.substring(0, idx).toLowerCase(), spec.substring(idx + 1)];
    }
    return [spec.toLowerCase(), undefined];
}
/**
 * Factory function to create a CLIP provider.
 *
 * @param provider - Provider specification. Can be:
 * - Simple: 'local', 'openai', 'gemini'
 * - With model: 'openai:gpt-4o-mini', 'gemini:gemini-2.0-flash'
 * @param config - Provider-specific configuration
 * @returns ClipProvider instance
 *
 * @example
 * ```typescript
 * // Simple provider
 * const clip = getClipProvider('local');
 * const clip = getClipProvider('openai');
 *
 * // Provider with model specification
 * const clip = getClipProvider('openai:gpt-4o-mini');
 * const clip = getClipProvider('gemini:gemini-2.0-flash');
 *
 * // With config override
 * const clip = getClipProvider('openai', { embeddingModel: 'text-embedding-3-large' });
 * ```
 */
function getClipProvider(provider = 'local', config = {}) {
    // Parse provider:model format
    const [parsedProvider, parsedModel] = parseProviderModel(provider);
    // Use parsed model if config.model is not explicitly set
    const effectiveModel = config.model ?? parsedModel;
    switch (parsedProvider) {
        case 'local':
            return new LocalClip({ model: effectiveModel });
        case 'openai':
            return new OpenAIClip({
                apiKey: config.apiKey,
                embeddingModel: config.embeddingModel || effectiveModel,
                visionModel: config.visionModel || effectiveModel,
            });
        case 'gemini':
            return new GeminiClip({
                apiKey: config.apiKey,
                model: effectiveModel,
            });
        default:
            throw new Error(`Unknown provider: ${parsedProvider}. Supported: local, openai, gemini`);
    }
}
exports.default = {
    LocalClip,
    OpenAIClip,
    GeminiClip,
    getClipProvider,
    CLIP_MODEL_DIMENSIONS: exports.CLIP_MODEL_DIMENSIONS,
};