kailash 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kailash/api/__init__.py +7 -0
- kailash/api/workflow_api.py +383 -0
- kailash/nodes/__init__.py +2 -1
- kailash/nodes/ai/__init__.py +26 -0
- kailash/nodes/ai/ai_providers.py +1272 -0
- kailash/nodes/ai/embedding_generator.py +853 -0
- kailash/nodes/ai/llm_agent.py +1166 -0
- kailash/nodes/api/auth.py +3 -3
- kailash/nodes/api/graphql.py +2 -2
- kailash/nodes/api/http.py +391 -48
- kailash/nodes/api/rate_limiting.py +2 -2
- kailash/nodes/api/rest.py +465 -57
- kailash/nodes/base.py +71 -12
- kailash/nodes/code/python.py +2 -1
- kailash/nodes/data/__init__.py +7 -0
- kailash/nodes/data/readers.py +28 -26
- kailash/nodes/data/retrieval.py +178 -0
- kailash/nodes/data/sharepoint_graph.py +7 -7
- kailash/nodes/data/sources.py +65 -0
- kailash/nodes/data/sql.py +7 -5
- kailash/nodes/data/vector_db.py +2 -2
- kailash/nodes/data/writers.py +6 -3
- kailash/nodes/logic/__init__.py +2 -1
- kailash/nodes/logic/operations.py +2 -1
- kailash/nodes/logic/workflow.py +439 -0
- kailash/nodes/mcp/__init__.py +11 -0
- kailash/nodes/mcp/client.py +558 -0
- kailash/nodes/mcp/resource.py +682 -0
- kailash/nodes/mcp/server.py +577 -0
- kailash/nodes/transform/__init__.py +16 -1
- kailash/nodes/transform/chunkers.py +78 -0
- kailash/nodes/transform/formatters.py +96 -0
- kailash/nodes/transform/processors.py +5 -3
- kailash/runtime/docker.py +8 -6
- kailash/sdk_exceptions.py +24 -10
- kailash/tracking/metrics_collector.py +2 -1
- kailash/tracking/models.py +0 -20
- kailash/tracking/storage/database.py +4 -4
- kailash/tracking/storage/filesystem.py +0 -1
- kailash/utils/templates.py +6 -6
- kailash/visualization/performance.py +7 -7
- kailash/visualization/reports.py +1 -1
- kailash/workflow/graph.py +4 -4
- kailash/workflow/mock_registry.py +1 -1
- {kailash-0.1.1.dist-info → kailash-0.1.3.dist-info}/METADATA +441 -47
- kailash-0.1.3.dist-info/RECORD +83 -0
- kailash-0.1.1.dist-info/RECORD +0 -69
- {kailash-0.1.1.dist-info → kailash-0.1.3.dist-info}/WHEEL +0 -0
- {kailash-0.1.1.dist-info → kailash-0.1.3.dist-info}/entry_points.txt +0 -0
- {kailash-0.1.1.dist-info → kailash-0.1.3.dist-info}/licenses/LICENSE +0 -0
- {kailash-0.1.1.dist-info → kailash-0.1.3.dist-info}/top_level.txt +0 -0
kailash/nodes/ai/embedding_generator.py (new file)
@@ -0,0 +1,853 @@
"""Embedding Generator node for vector embeddings with support for multiple providers."""

import time
from typing import Any, Dict, List, Optional

from kailash.nodes.base import Node, NodeParameter, register_node


@register_node()
class EmbeddingGenerator(Node):
    """
    Vector embedding generator for RAG systems and semantic similarity operations.

    Design Purpose and Philosophy:
    The EmbeddingGenerator node provides enterprise-grade vector embedding capabilities
    with support for multiple providers, batch processing, and efficient caching.
    It's essential for building RAG systems, semantic search, and similarity-based workflows.

    Upstream Dependencies:
    - Text content or documents to embed
    - Provider credentials (OpenAI, HuggingFace, Azure, etc.)
    - Embedding model configurations and parameters
    - Batch processing settings for efficiency
    - Cache configuration for performance optimization

    Downstream Consumers:
    - Vector databases storing embeddings for retrieval
    - Similarity calculation nodes for semantic matching
    - RAG systems requiring document embeddings
    - Clustering and classification algorithms
    - Search and recommendation engines

    Usage Patterns:
    1. Single text embedding for ad-hoc similarity queries
    2. Batch document embedding for knowledge base creation
    3. Real-time embedding for streaming data processing
    4. Incremental embedding with caching for large datasets
    5. Multi-modal embedding for text, images, and code

    Implementation Details:
    - Supports OpenAI, HuggingFace, Azure, Cohere, and local models
    - Implements efficient batch processing with optimal chunk sizes
    - Provides intelligent caching with TTL and invalidation
    - Handles rate limiting and retry logic for API providers
    - Supports multiple embedding dimensions and models
    - Includes similarity calculation utilities

    Error Handling:
    - APIError: When embedding provider API calls fail
    - RateLimitError: When API rate limits are exceeded
    - TokenLimitError: When input text exceeds model limits
    - ValidationError: When input format is invalid
    - CacheError: When embedding cache operations fail
    - ModelNotFoundError: When specified model is unavailable

    Side Effects:
    - Makes API calls to external embedding providers
    - Caches embedding vectors in local or distributed cache
    - May chunk large texts for processing within model limits
    - Logs embedding operations and performance metrics
    - Updates usage statistics and cost tracking

    Examples:

    Single text embedding::

        embedder = EmbeddingGenerator()
        result = embedder.run(
            provider="openai",
            model="text-embedding-3-large",
            input_text="This is a sample document to embed",
            operation="embed_text"
        )

    Batch document embedding::

        batch_embedder = EmbeddingGenerator()
        result = batch_embedder.run(
            provider="huggingface",
            model="sentence-transformers/all-MiniLM-L6-v2",
            input_texts=[
                "First document content...",
                "Second document content...",
                "Third document content..."
            ],
            operation="embed_batch",
            batch_size=32,
            cache_enabled=True
        )

    Similarity calculation::

        similarity = EmbeddingGenerator()
        result = similarity.run(
            operation="calculate_similarity",
            embedding_1=[0.1, 0.2, 0.3, ...],
            embedding_2=[0.15, 0.25, 0.35, ...],
            similarity_metric="cosine"
        )

    Cached embedding with MCP integration::

        mcp_embedder = EmbeddingGenerator()
        result = mcp_embedder.run(
            provider="azure",
            model="text-embedding-3-small",
            mcp_resource_uri="data://documents/knowledge_base.json",
            operation="embed_mcp_resource",
            cache_ttl=3600,
            chunk_size=512
        )
    """

    def get_parameters(self) -> Dict[str, NodeParameter]:
        return {
            "operation": NodeParameter(
                name="operation",
                type=str,
                required=False,
                default="embed_text",
                description="Operation: embed_text, embed_batch, calculate_similarity, embed_mcp_resource",
            ),
            "provider": NodeParameter(
                name="provider",
                type=str,
                required=False,
                description="Embedding provider: openai, ollama, cohere, huggingface, mock",
            ),
            "model": NodeParameter(
                name="model",
                type=str,
                required=False,
                description="Embedding model name (e.g., text-embedding-3-large, all-MiniLM-L6-v2)",
            ),
            "input_text": NodeParameter(
                name="input_text",
                type=str,
                required=False,
                description="Single text to embed (for embed_text operation)",
            ),
            "input_texts": NodeParameter(
                name="input_texts",
                type=list,
                required=False,
                description="List of texts to embed (for embed_batch operation)",
            ),
            "mcp_resource_uri": NodeParameter(
                name="mcp_resource_uri",
                type=str,
                required=False,
                description="MCP resource URI to embed (for embed_mcp_resource operation)",
            ),
            "embedding_1": NodeParameter(
                name="embedding_1",
                type=list,
                required=False,
                description="First embedding vector (for calculate_similarity operation)",
            ),
            "embedding_2": NodeParameter(
                name="embedding_2",
                type=list,
                required=False,
                description="Second embedding vector (for calculate_similarity operation)",
            ),
            "similarity_metric": NodeParameter(
                name="similarity_metric",
                type=str,
                required=False,
                default="cosine",
                description="Similarity metric: cosine, euclidean, dot_product",
            ),
            "batch_size": NodeParameter(
                name="batch_size",
                type=int,
                required=False,
                default=32,
                description="Batch size for processing multiple texts",
            ),
            "chunk_size": NodeParameter(
                name="chunk_size",
                type=int,
                required=False,
                default=512,
                description="Maximum tokens per text chunk",
            ),
            "cache_enabled": NodeParameter(
                name="cache_enabled",
                type=bool,
                required=False,
                default=True,
                description="Enable embedding caching for performance",
            ),
            "cache_ttl": NodeParameter(
                name="cache_ttl",
                type=int,
                required=False,
                default=3600,
                description="Cache time-to-live in seconds",
            ),
            "dimensions": NodeParameter(
                name="dimensions",
                type=int,
                required=False,
                description="Number of embedding dimensions (provider-specific)",
            ),
            "normalize": NodeParameter(
                name="normalize",
                type=bool,
                required=False,
                default=True,
                description="Normalize embedding vectors to unit length",
            ),
            "timeout": NodeParameter(
                name="timeout",
                type=int,
                required=False,
                default=60,
                description="Request timeout in seconds",
            ),
            "max_retries": NodeParameter(
                name="max_retries",
                type=int,
                required=False,
                default=3,
                description="Maximum retry attempts for failed requests",
            ),
        }

    def run(self, **kwargs) -> Dict[str, Any]:
        operation = kwargs["operation"]
        provider = kwargs.get("provider", "mock")
        model = kwargs.get("model", "default")
        input_text = kwargs.get("input_text")
        input_texts = kwargs.get("input_texts", [])
        mcp_resource_uri = kwargs.get("mcp_resource_uri")
        embedding_1 = kwargs.get("embedding_1")
        embedding_2 = kwargs.get("embedding_2")
        similarity_metric = kwargs.get("similarity_metric", "cosine")
        batch_size = kwargs.get("batch_size", 32)
        chunk_size = kwargs.get("chunk_size", 512)
        cache_enabled = kwargs.get("cache_enabled", True)
        cache_ttl = kwargs.get("cache_ttl", 3600)
        dimensions = kwargs.get("dimensions")
        normalize = kwargs.get("normalize", True)
        timeout = kwargs.get("timeout", 60)
        max_retries = kwargs.get("max_retries", 3)

        try:
            if operation == "embed_text":
                return self._embed_single_text(
                    input_text,
                    provider,
                    model,
                    cache_enabled,
                    cache_ttl,
                    dimensions,
                    normalize,
                    timeout,
                    max_retries,
                )
            elif operation == "embed_batch":
                return self._embed_batch_texts(
                    input_texts,
                    provider,
                    model,
                    batch_size,
                    chunk_size,
                    cache_enabled,
                    cache_ttl,
                    dimensions,
                    normalize,
                    timeout,
                    max_retries,
                )
            elif operation == "calculate_similarity":
                # Handle both direct embeddings and text inputs
                if embedding_1 and embedding_2:
                    return self._calculate_similarity(
                        embedding_1, embedding_2, similarity_metric
                    )
                elif input_texts and len(input_texts) >= 2:
                    # Generate embeddings for texts first
                    embeddings = []
                    for text in input_texts[:2]:  # Only use first 2 texts
                        if provider == "mock":
                            emb = self._generate_mock_embedding(
                                text, dimensions or 1536
                            )
                        else:
                            emb = self._generate_provider_embedding(
                                text, provider, model, dimensions, timeout, max_retries
                            )
                        if normalize:
                            emb = self._normalize_vector(emb)
                        embeddings.append(emb)

                    # Calculate similarity
                    result = self._calculate_similarity(
                        embeddings[0], embeddings[1], similarity_metric
                    )

                    # Add text information
                    if result["success"]:
                        result["texts"] = input_texts[:2]
                        result["embeddings"] = embeddings

                    return result
                else:
                    return {
                        "success": False,
                        "error": "Either provide embedding_1 and embedding_2, or input_texts with at least 2 texts",
                    }
            elif operation == "embed_mcp_resource":
                return self._embed_mcp_resource(
                    mcp_resource_uri,
                    provider,
                    model,
                    chunk_size,
                    cache_enabled,
                    cache_ttl,
                    dimensions,
                    normalize,
                    timeout,
                    max_retries,
                )
            else:
                return {
                    "success": False,
                    "error": f"Unsupported operation: {operation}",
                    "supported_operations": [
                        "embed_text",
                        "embed_batch",
                        "calculate_similarity",
                        "embed_mcp_resource",
                    ],
                }

        except Exception as e:
            return {
                "success": False,
                "error": str(e),
                "error_type": type(e).__name__,
                "operation": operation,
                "provider": provider,
                "model": model,
            }

    def _embed_single_text(
        self,
        text: Optional[str],
        provider: str,
        model: str,
        cache_enabled: bool,
        cache_ttl: int,
        dimensions: Optional[int],
        normalize: bool,
        timeout: int,
        max_retries: int,
    ) -> Dict[str, Any]:
        """Generate embedding for a single text."""
        if not text:
            return {
                "success": False,
                "error": "input_text is required for embed_text operation",
            }

        # Check cache first if enabled
        if cache_enabled:
            cache_key = self._generate_cache_key(text, provider, model)
            cached_embedding = self._get_cached_embedding(cache_key)
            if cached_embedding:
                return {
                    "success": True,
                    "operation": "embed_text",
                    "embedding": cached_embedding["vector"],
                    "dimensions": len(cached_embedding["vector"]),
                    "text_length": len(text),
                    "provider": provider,
                    "model": model,
                    "cached": True,
                    "cache_key": cache_key,
                    "processing_time_ms": 5,  # Very fast cache lookup
                }

        # Generate embedding using provider
        start_time = time.time()

        if provider == "mock":
            embedding_vector = self._generate_mock_embedding(text, dimensions or 1536)
        else:
            embedding_vector = self._generate_provider_embedding(
                text, provider, model, dimensions, timeout, max_retries
            )

        processing_time = (time.time() - start_time) * 1000

        # Normalize if requested
        if normalize:
            embedding_vector = self._normalize_vector(embedding_vector)

        # Cache the result
        if cache_enabled:
            self._cache_embedding(cache_key, embedding_vector, cache_ttl)

        return {
            "success": True,
            "operation": "embed_text",
            "embedding": embedding_vector,
            "dimensions": len(embedding_vector),
            "text_length": len(text),
            "provider": provider,
            "model": model,
            "cached": False,
            "processing_time_ms": round(processing_time, 2),
            "usage": {
                "tokens": len(text.split()),
                "estimated_cost_usd": self._estimate_embedding_cost(
                    len(text.split()), provider, model
                ),
            },
        }

    def _embed_batch_texts(
        self,
        texts: List[str],
        provider: str,
        model: str,
        batch_size: int,
        chunk_size: int,
        cache_enabled: bool,
        cache_ttl: int,
        dimensions: Optional[int],
        normalize: bool,
        timeout: int,
        max_retries: int,
    ) -> Dict[str, Any]:
        """Generate embeddings for a batch of texts."""
        if not texts:
            return {
                "success": False,
                "error": "input_texts is required and cannot be empty for embed_batch operation",
            }

        start_time = time.time()
        embeddings = []
        cache_hits = 0
        total_tokens = 0

        # Process texts in batches
        for i in range(0, len(texts), batch_size):
            batch = texts[i : i + batch_size]

            for text in batch:
                # Check cache first
                cache_key = None
                if cache_enabled:
                    cache_key = self._generate_cache_key(text, provider, model)
                    cached_embedding = self._get_cached_embedding(cache_key)
                    if cached_embedding:
                        embeddings.append(
                            {
                                "text": text[:100] + "..." if len(text) > 100 else text,
                                "embedding": cached_embedding["vector"],
                                "cached": True,
                                "dimensions": len(cached_embedding["vector"]),
                            }
                        )
                        cache_hits += 1
                        continue

                # Chunk text if too long
                chunks = self._chunk_text(text, chunk_size)
                if len(chunks) > 1:
                    # For multiple chunks, embed each and average
                    chunk_embeddings = []
                    for chunk in chunks:
                        if provider == "mock":
                            chunk_emb = self._generate_mock_embedding(
                                chunk, dimensions or 1536
                            )
                        else:
                            chunk_emb = self._generate_provider_embedding(
                                chunk, provider, model, dimensions, timeout, max_retries
                            )
                        chunk_embeddings.append(chunk_emb)

                    # Average embeddings
                    embedding_vector = self._average_embeddings(chunk_embeddings)
                else:
                    # Single chunk
                    if provider == "mock":
                        embedding_vector = self._generate_mock_embedding(
                            text, dimensions or 1536
                        )
                    else:
                        embedding_vector = self._generate_provider_embedding(
                            text, provider, model, dimensions, timeout, max_retries
                        )

                # Normalize if requested
                if normalize:
                    embedding_vector = self._normalize_vector(embedding_vector)

                # Cache the result
                if cache_enabled and cache_key:
                    self._cache_embedding(cache_key, embedding_vector, cache_ttl)

                embeddings.append(
                    {
                        "text": text[:100] + "..." if len(text) > 100 else text,
                        "embedding": embedding_vector,
                        "cached": False,
                        "dimensions": len(embedding_vector),
                        "chunks": len(chunks),
                    }
                )

                total_tokens += len(text.split())

        processing_time = (time.time() - start_time) * 1000

        return {
            "success": True,
            "operation": "embed_batch",
            "embeddings": embeddings,
            "total_texts": len(texts),
            "total_embeddings": len(embeddings),
            "cache_hits": cache_hits,
            "cache_hit_rate": cache_hits / len(texts) if texts else 0,
            "provider": provider,
            "model": model,
            "batch_size": batch_size,
            "processing_time_ms": round(processing_time, 2),
            "usage": {
                "total_tokens": total_tokens,
                "estimated_cost_usd": self._estimate_embedding_cost(
                    total_tokens, provider, model
                ),
                "average_tokens_per_text": total_tokens / len(texts) if texts else 0,
            },
        }

    def _calculate_similarity(
        self,
        embedding_1: Optional[List[float]],
        embedding_2: Optional[List[float]],
        metric: str,
    ) -> Dict[str, Any]:
        """Calculate similarity between two embedding vectors."""
        if not embedding_1 or not embedding_2:
            return {
                "success": False,
                "error": "Both embedding_1 and embedding_2 are required for similarity calculation",
            }

        if len(embedding_1) != len(embedding_2):
            return {
                "success": False,
                "error": f"Embedding dimensions must match: {len(embedding_1)} vs {len(embedding_2)}",
            }

        try:
            if metric == "cosine":
                similarity = self._cosine_similarity(embedding_1, embedding_2)
            elif metric == "euclidean":
                similarity = self._euclidean_distance(embedding_1, embedding_2)
            elif metric == "dot_product":
                similarity = self._dot_product(embedding_1, embedding_2)
            else:
                return {
                    "success": False,
                    "error": f"Unsupported similarity metric: {metric}",
                    "supported_metrics": ["cosine", "euclidean", "dot_product"],
                }

            return {
                "success": True,
                "operation": "calculate_similarity",
                "similarity": similarity,
                "metric": metric,
                "dimensions": len(embedding_1),
                "interpretation": self._interpret_similarity(similarity, metric),
            }

        except Exception as e:
            return {
                "success": False,
                "error": f"Similarity calculation failed: {str(e)}",
                "metric": metric,
            }

    def _embed_mcp_resource(
        self,
        resource_uri: Optional[str],
        provider: str,
        model: str,
        chunk_size: int,
        cache_enabled: bool,
        cache_ttl: int,
        dimensions: Optional[int],
        normalize: bool,
        timeout: int,
        max_retries: int,
    ) -> Dict[str, Any]:
        """Embed content from an MCP resource."""
        if not resource_uri:
            return {
                "success": False,
                "error": "mcp_resource_uri is required for embed_mcp_resource operation",
            }

        # Mock MCP resource retrieval
        mock_content = f"Mock content from MCP resource: {resource_uri}\n\nThis would contain the actual document or data content retrieved from the MCP server."

        # Use the existing embed_text functionality
        result = self._embed_single_text(
            mock_content,
            provider,
            model,
            cache_enabled,
            cache_ttl,
            dimensions,
            normalize,
            timeout,
            max_retries,
        )

        if result["success"]:
            result.update(
                {
                    "operation": "embed_mcp_resource",
                    "mcp_resource_uri": resource_uri,
                    "content_preview": mock_content[:200] + "...",
                }
            )

        return result

    def _generate_mock_embedding(self, text: str, dimensions: int) -> List[float]:
        """Generate a mock embedding vector based on text content."""
        import hashlib
        import random

        # Use text hash as seed for reproducible mock embeddings
        seed = int(hashlib.md5(text.encode()).hexdigest()[:8], 16)
        random.seed(seed)

        # Generate normalized random vector
        vector = [random.gauss(0, 1) for _ in range(dimensions)]

        # Normalize to unit length
        magnitude = sum(x * x for x in vector) ** 0.5
        return [x / magnitude for x in vector]

    def _generate_provider_embedding(
        self,
        text: str,
        provider: str,
        model: str,
        dimensions: Optional[int],
        timeout: int,
        max_retries: int,
    ) -> List[float]:
        """Generate embedding using external provider."""
        try:
            from .ai_providers import get_provider

            # Get the provider instance
            provider_instance = get_provider(provider, "embeddings")

            # Check if provider is available
            if not provider_instance.is_available():
                raise RuntimeError(
                    f"Provider {provider} is not available. Check dependencies and configuration."
                )

            # Prepare kwargs for the provider
            kwargs = {"model": model, "timeout": timeout}

            # Add dimensions if specified and provider supports it
            if dimensions and provider in ["openai"]:
                kwargs["dimensions"] = dimensions

            # Provider-specific parameters
            if provider == "cohere":
                kwargs["input_type"] = "search_document"
            elif provider == "huggingface":
                kwargs["use_api"] = True  # Default to API for consistency

            # Generate embedding
            embeddings = provider_instance.embed([text], **kwargs)

            # Return the first (and only) embedding
            return embeddings[0] if embeddings else []

        except ImportError:
            # Fallback to the original implementation if ai_providers not available
            return self._fallback_provider_embedding(
                text, provider, model, dimensions, timeout, max_retries
            )
        except Exception as e:
            raise RuntimeError(f"Provider {provider} error: {str(e)}") from e

    def _fallback_provider_embedding(
        self,
        text: str,
        provider: str,
        model: str,
        dimensions: Optional[int],
        timeout: int,
        max_retries: int,
    ) -> List[float]:
        """Fallback implementation for backward compatibility."""
        # Handle Ollama provider
        if provider == "ollama":
            try:
                import ollama

                response = ollama.embeddings(model=model, prompt=text)
                return response.get("embedding", [])
            except ImportError:
                raise RuntimeError(
                    "Ollama library not installed. Install with: pip install ollama"
                )
            except Exception as e:
                raise RuntimeError(f"Ollama embedding error: {str(e)}")

        # Default dimensions for other providers
        default_dimensions = {
            "openai": {"text-embedding-3-large": 3072, "text-embedding-3-small": 1536},
            "huggingface": {"all-MiniLM-L6-v2": 384, "all-mpnet-base-v2": 768},
            "azure": {"text-embedding-3-large": 3072},
            "cohere": {"embed-english-v3.0": 1024},
        }

        actual_dimensions = dimensions or default_dimensions.get(provider, {}).get(
            model, 1536
        )

        # For now, other providers use mock embeddings
        # In real implementation, this would call the actual provider APIs
        return self._generate_mock_embedding(
            f"{provider}:{model}:{text}", actual_dimensions
        )

    def _chunk_text(self, text: str, chunk_size: int) -> List[str]:
        """Split text into chunks based on token limit."""
        # Simple word-based chunking (real implementation would use proper tokenization)
        words = text.split()
        chunks = []

        for i in range(0, len(words), chunk_size):
            chunk_words = words[i : i + chunk_size]
            chunks.append(" ".join(chunk_words))

        return chunks or [""]

    def _average_embeddings(self, embeddings: List[List[float]]) -> List[float]:
        """Average multiple embedding vectors."""
        if not embeddings:
            return []

        dimensions = len(embeddings[0])
        averaged = []

        for i in range(dimensions):
            avg_value = sum(emb[i] for emb in embeddings) / len(embeddings)
            averaged.append(avg_value)

        return averaged

    def _normalize_vector(self, vector: List[float]) -> List[float]:
        """Normalize vector to unit length."""
        magnitude = sum(x * x for x in vector) ** 0.5
        if magnitude == 0:
            return vector
        return [x / magnitude for x in vector]

    def _cosine_similarity(self, vec1: List[float], vec2: List[float]) -> float:
        """Calculate cosine similarity between two vectors."""
        dot_product = sum(a * b for a, b in zip(vec1, vec2))
        magnitude1 = sum(a * a for a in vec1) ** 0.5
        magnitude2 = sum(b * b for b in vec2) ** 0.5

        if magnitude1 == 0 or magnitude2 == 0:
            return 0.0

        return dot_product / (magnitude1 * magnitude2)

    def _euclidean_distance(self, vec1: List[float], vec2: List[float]) -> float:
        """Calculate Euclidean distance between two vectors."""
        return sum((a - b) ** 2 for a, b in zip(vec1, vec2)) ** 0.5

    def _dot_product(self, vec1: List[float], vec2: List[float]) -> float:
        """Calculate dot product of two vectors."""
        return sum(a * b for a, b in zip(vec1, vec2))

    def _interpret_similarity(self, score: float, metric: str) -> str:
        """Provide human-readable interpretation of similarity score."""
        if metric == "cosine":
            if score > 0.9:
                return "Very similar"
            elif score > 0.7:
                return "Similar"
            elif score > 0.5:
                return "Somewhat similar"
            elif score > 0.3:
                return "Slightly similar"
            else:
                return "Not similar"
        elif metric == "euclidean":
            if score < 0.5:
                return "Very similar"
            elif score < 1.0:
                return "Similar"
            elif score < 2.0:
                return "Somewhat similar"
            else:
                return "Not similar"
        else:  # dot_product
            return f"Dot product: {score:.3f}"

    def _generate_cache_key(self, text: str, provider: str, model: str) -> str:
        """Generate cache key for embedding."""
        import hashlib

        content = f"{provider}:{model}:{text}"
        return f"emb_{hashlib.md5(content.encode()).hexdigest()[:16]}"

    def _get_cached_embedding(self, cache_key: str) -> Optional[Dict[str, Any]]:
        """Retrieve embedding from cache (mock implementation)."""
        # Mock cache lookup - in real implementation, use Redis or similar
        return None

    def _cache_embedding(self, cache_key: str, vector: List[float], ttl: int) -> None:
        """Store embedding in cache (mock implementation)."""
        # Mock cache storage - in real implementation, use Redis or similar
        pass

    def _estimate_embedding_cost(self, tokens: int, provider: str, model: str) -> float:
        """Estimate embedding cost based on tokens and provider pricing."""
        # Mock cost estimation (real implementation would use current pricing)
        cost_per_1k_tokens = {
            "openai": {
                "text-embedding-3-large": 0.00013,
                "text-embedding-3-small": 0.00002,
            },
            "azure": {"text-embedding-3-large": 0.00013},
            "cohere": {"embed-english-v3.0": 0.0001},
        }

        rate = cost_per_1k_tokens.get(provider, {}).get(model, 0.0001)
        return (tokens / 1000) * rate