kailash-0.1.0-py3-none-any.whl → kailash-0.1.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. kailash/__init__.py +1 -1
  2. kailash/nodes/__init__.py +2 -1
  3. kailash/nodes/ai/__init__.py +26 -0
  4. kailash/nodes/ai/ai_providers.py +1272 -0
  5. kailash/nodes/ai/embedding_generator.py +853 -0
  6. kailash/nodes/ai/llm_agent.py +1166 -0
  7. kailash/nodes/api/auth.py +3 -3
  8. kailash/nodes/api/graphql.py +2 -2
  9. kailash/nodes/api/http.py +391 -44
  10. kailash/nodes/api/rate_limiting.py +2 -2
  11. kailash/nodes/api/rest.py +464 -56
  12. kailash/nodes/base.py +71 -12
  13. kailash/nodes/code/python.py +2 -1
  14. kailash/nodes/data/__init__.py +7 -0
  15. kailash/nodes/data/readers.py +28 -26
  16. kailash/nodes/data/retrieval.py +178 -0
  17. kailash/nodes/data/sharepoint_graph.py +7 -7
  18. kailash/nodes/data/sources.py +65 -0
  19. kailash/nodes/data/sql.py +4 -2
  20. kailash/nodes/data/writers.py +6 -3
  21. kailash/nodes/logic/operations.py +2 -1
  22. kailash/nodes/mcp/__init__.py +11 -0
  23. kailash/nodes/mcp/client.py +558 -0
  24. kailash/nodes/mcp/resource.py +682 -0
  25. kailash/nodes/mcp/server.py +571 -0
  26. kailash/nodes/transform/__init__.py +16 -1
  27. kailash/nodes/transform/chunkers.py +78 -0
  28. kailash/nodes/transform/formatters.py +96 -0
  29. kailash/runtime/docker.py +6 -6
  30. kailash/sdk_exceptions.py +24 -10
  31. kailash/tracking/metrics_collector.py +2 -1
  32. kailash/utils/templates.py +6 -6
  33. {kailash-0.1.0.dist-info → kailash-0.1.2.dist-info}/METADATA +349 -49
  34. {kailash-0.1.0.dist-info → kailash-0.1.2.dist-info}/RECORD +38 -27
  35. {kailash-0.1.0.dist-info → kailash-0.1.2.dist-info}/WHEEL +0 -0
  36. {kailash-0.1.0.dist-info → kailash-0.1.2.dist-info}/entry_points.txt +0 -0
  37. {kailash-0.1.0.dist-info → kailash-0.1.2.dist-info}/licenses/LICENSE +0 -0
  38. {kailash-0.1.0.dist-info → kailash-0.1.2.dist-info}/top_level.txt +0 -0
kailash/nodes/ai/embedding_generator.py (new file)
@@ -0,0 +1,853 @@
1
+ """Embedding Generator node for vector embeddings with support for multiple providers."""
2
+
3
+ import time
4
+ from typing import Any, Dict, List, Optional
5
+
6
+ from kailash.nodes.base import Node, NodeParameter, register_node
7
+
8
+
9
+ @register_node()
10
+ class EmbeddingGenerator(Node):
11
+ """
12
+ Vector embedding generator for RAG systems and semantic similarity operations.
13
+
14
+ Design Purpose and Philosophy:
15
+ The EmbeddingGenerator node provides enterprise-grade vector embedding capabilities
16
+ with support for multiple providers, batch processing, and efficient caching.
17
+ It's essential for building RAG systems, semantic search, and similarity-based workflows.
18
+
19
+ Upstream Dependencies:
20
+ - Text content or documents to embed
21
+ - Provider credentials (OpenAI, HuggingFace, Azure, etc.)
22
+ - Embedding model configurations and parameters
23
+ - Batch processing settings for efficiency
24
+ - Cache configuration for performance optimization
25
+
26
+ Downstream Consumers:
27
+ - Vector databases storing embeddings for retrieval
28
+ - Similarity calculation nodes for semantic matching
29
+ - RAG systems requiring document embeddings
30
+ - Clustering and classification algorithms
31
+ - Search and recommendation engines
32
+
33
+ Usage Patterns:
34
+ 1. Single text embedding for ad-hoc similarity queries
35
+ 2. Batch document embedding for knowledge base creation
36
+ 3. Real-time embedding for streaming data processing
37
+ 4. Incremental embedding with caching for large datasets
38
+ 5. Multi-modal embedding for text, images, and code
39
+
40
+ Implementation Details:
41
+ - Supports OpenAI, HuggingFace, Azure, Cohere, and local models
42
+ - Implements efficient batch processing with optimal chunk sizes
43
+ - Provides intelligent caching with TTL and invalidation
44
+ - Handles rate limiting and retry logic for API providers
45
+ - Supports multiple embedding dimensions and models
46
+ - Includes similarity calculation utilities
47
+
48
+ Error Handling:
49
+ - APIError: When embedding provider API calls fail
50
+ - RateLimitError: When API rate limits are exceeded
51
+ - TokenLimitError: When input text exceeds model limits
52
+ - ValidationError: When input format is invalid
53
+ - CacheError: When embedding cache operations fail
54
+ - ModelNotFoundError: When specified model is unavailable
55
+
56
+ Side Effects:
57
+ - Makes API calls to external embedding providers
58
+ - Caches embedding vectors in local or distributed cache
59
+ - May chunk large texts for processing within model limits
60
+ - Logs embedding operations and performance metrics
61
+ - Updates usage statistics and cost tracking
62
+
63
+ Examples:
64
+
65
+ Single text embedding::
66
+
67
+ embedder = EmbeddingGenerator()
68
+ result = embedder.run(
69
+ provider="openai",
70
+ model="text-embedding-3-large",
71
+ input_text="This is a sample document to embed",
72
+ operation="embed_text"
73
+ )
74
+
75
+ Batch document embedding::
76
+
77
+ batch_embedder = EmbeddingGenerator()
78
+ result = batch_embedder.run(
79
+ provider="huggingface",
80
+ model="sentence-transformers/all-MiniLM-L6-v2",
81
+ input_texts=[
82
+ "First document content...",
83
+ "Second document content...",
84
+ "Third document content..."
85
+ ],
86
+ operation="embed_batch",
87
+ batch_size=32,
88
+ cache_enabled=True
89
+ )
90
+
91
+ Similarity calculation::
92
+
93
+ similarity = EmbeddingGenerator()
94
+ result = similarity.run(
95
+ operation="calculate_similarity",
96
+ embedding_1=[0.1, 0.2, 0.3, ...],
97
+ embedding_2=[0.15, 0.25, 0.35, ...],
98
+ similarity_metric="cosine"
99
+ )
100
+
101
+ Cached embedding with MCP integration::
102
+
103
+ mcp_embedder = EmbeddingGenerator()
104
+ result = mcp_embedder.run(
105
+ provider="azure",
106
+ model="text-embedding-3-small",
107
+ mcp_resource_uri="data://documents/knowledge_base.json",
108
+ operation="embed_mcp_resource",
109
+ cache_ttl=3600,
110
+ chunk_size=512
111
+ )
112
+ """
113
+
114
+ def get_parameters(self) -> Dict[str, NodeParameter]:
115
+ return {
116
+ "operation": NodeParameter(
117
+ name="operation",
118
+ type=str,
119
+ required=False,
120
+ default="embed_text",
121
+ description="Operation: embed_text, embed_batch, calculate_similarity, embed_mcp_resource",
122
+ ),
123
+ "provider": NodeParameter(
124
+ name="provider",
125
+ type=str,
126
+ required=False,
127
+ description="Embedding provider: openai, ollama, cohere, huggingface, mock",
128
+ ),
129
+ "model": NodeParameter(
130
+ name="model",
131
+ type=str,
132
+ required=False,
133
+ description="Embedding model name (e.g., text-embedding-3-large, all-MiniLM-L6-v2)",
134
+ ),
135
+ "input_text": NodeParameter(
136
+ name="input_text",
137
+ type=str,
138
+ required=False,
139
+ description="Single text to embed (for embed_text operation)",
140
+ ),
141
+ "input_texts": NodeParameter(
142
+ name="input_texts",
143
+ type=list,
144
+ required=False,
145
+ description="List of texts to embed (for embed_batch operation)",
146
+ ),
147
+ "mcp_resource_uri": NodeParameter(
148
+ name="mcp_resource_uri",
149
+ type=str,
150
+ required=False,
151
+ description="MCP resource URI to embed (for embed_mcp_resource operation)",
152
+ ),
153
+ "embedding_1": NodeParameter(
154
+ name="embedding_1",
155
+ type=list,
156
+ required=False,
157
+ description="First embedding vector (for calculate_similarity operation)",
158
+ ),
159
+ "embedding_2": NodeParameter(
160
+ name="embedding_2",
161
+ type=list,
162
+ required=False,
163
+ description="Second embedding vector (for calculate_similarity operation)",
164
+ ),
165
+ "similarity_metric": NodeParameter(
166
+ name="similarity_metric",
167
+ type=str,
168
+ required=False,
169
+ default="cosine",
170
+ description="Similarity metric: cosine, euclidean, dot_product",
171
+ ),
172
+ "batch_size": NodeParameter(
173
+ name="batch_size",
174
+ type=int,
175
+ required=False,
176
+ default=32,
177
+ description="Batch size for processing multiple texts",
178
+ ),
179
+ "chunk_size": NodeParameter(
180
+ name="chunk_size",
181
+ type=int,
182
+ required=False,
183
+ default=512,
184
+ description="Maximum tokens per text chunk",
185
+ ),
186
+ "cache_enabled": NodeParameter(
187
+ name="cache_enabled",
188
+ type=bool,
189
+ required=False,
190
+ default=True,
191
+ description="Enable embedding caching for performance",
192
+ ),
193
+ "cache_ttl": NodeParameter(
194
+ name="cache_ttl",
195
+ type=int,
196
+ required=False,
197
+ default=3600,
198
+ description="Cache time-to-live in seconds",
199
+ ),
200
+ "dimensions": NodeParameter(
201
+ name="dimensions",
202
+ type=int,
203
+ required=False,
204
+ description="Number of embedding dimensions (provider-specific)",
205
+ ),
206
+ "normalize": NodeParameter(
207
+ name="normalize",
208
+ type=bool,
209
+ required=False,
210
+ default=True,
211
+ description="Normalize embedding vectors to unit length",
212
+ ),
213
+ "timeout": NodeParameter(
214
+ name="timeout",
215
+ type=int,
216
+ required=False,
217
+ default=60,
218
+ description="Request timeout in seconds",
219
+ ),
220
+ "max_retries": NodeParameter(
221
+ name="max_retries",
222
+ type=int,
223
+ required=False,
224
+ default=3,
225
+ description="Maximum retry attempts for failed requests",
226
+ ),
227
+ }
228
+
229
+ def run(self, **kwargs) -> Dict[str, Any]:
230
+ operation = kwargs.get("operation", "embed_text")
231
+ provider = kwargs.get("provider", "mock")
232
+ model = kwargs.get("model", "default")
233
+ input_text = kwargs.get("input_text")
234
+ input_texts = kwargs.get("input_texts", [])
235
+ mcp_resource_uri = kwargs.get("mcp_resource_uri")
236
+ embedding_1 = kwargs.get("embedding_1")
237
+ embedding_2 = kwargs.get("embedding_2")
238
+ similarity_metric = kwargs.get("similarity_metric", "cosine")
239
+ batch_size = kwargs.get("batch_size", 32)
240
+ chunk_size = kwargs.get("chunk_size", 512)
241
+ cache_enabled = kwargs.get("cache_enabled", True)
242
+ cache_ttl = kwargs.get("cache_ttl", 3600)
243
+ dimensions = kwargs.get("dimensions")
244
+ normalize = kwargs.get("normalize", True)
245
+ timeout = kwargs.get("timeout", 60)
246
+ max_retries = kwargs.get("max_retries", 3)
247
+
248
+ try:
249
+ if operation == "embed_text":
250
+ return self._embed_single_text(
251
+ input_text,
252
+ provider,
253
+ model,
254
+ cache_enabled,
255
+ cache_ttl,
256
+ dimensions,
257
+ normalize,
258
+ timeout,
259
+ max_retries,
260
+ )
261
+ elif operation == "embed_batch":
262
+ return self._embed_batch_texts(
263
+ input_texts,
264
+ provider,
265
+ model,
266
+ batch_size,
267
+ chunk_size,
268
+ cache_enabled,
269
+ cache_ttl,
270
+ dimensions,
271
+ normalize,
272
+ timeout,
273
+ max_retries,
274
+ )
275
+ elif operation == "calculate_similarity":
276
+ # Handle both direct embeddings and text inputs
277
+ if embedding_1 and embedding_2:
278
+ return self._calculate_similarity(
279
+ embedding_1, embedding_2, similarity_metric
280
+ )
281
+ elif input_texts and len(input_texts) >= 2:
282
+ # Generate embeddings for texts first
283
+ embeddings = []
284
+ for text in input_texts[:2]: # Only use first 2 texts
285
+ if provider == "mock":
286
+ emb = self._generate_mock_embedding(
287
+ text, dimensions or 1536
288
+ )
289
+ else:
290
+ emb = self._generate_provider_embedding(
291
+ text, provider, model, dimensions, timeout, max_retries
292
+ )
293
+ if normalize:
294
+ emb = self._normalize_vector(emb)
295
+ embeddings.append(emb)
296
+
297
+ # Calculate similarity
298
+ result = self._calculate_similarity(
299
+ embeddings[0], embeddings[1], similarity_metric
300
+ )
301
+
302
+ # Add text information
303
+ if result["success"]:
304
+ result["texts"] = input_texts[:2]
305
+ result["embeddings"] = embeddings
306
+
307
+ return result
308
+ else:
309
+ return {
310
+ "success": False,
311
+ "error": "Either provide embedding_1 and embedding_2, or input_texts with at least 2 texts",
312
+ }
313
+ elif operation == "embed_mcp_resource":
314
+ return self._embed_mcp_resource(
315
+ mcp_resource_uri,
316
+ provider,
317
+ model,
318
+ chunk_size,
319
+ cache_enabled,
320
+ cache_ttl,
321
+ dimensions,
322
+ normalize,
323
+ timeout,
324
+ max_retries,
325
+ )
326
+ else:
327
+ return {
328
+ "success": False,
329
+ "error": f"Unsupported operation: {operation}",
330
+ "supported_operations": [
331
+ "embed_text",
332
+ "embed_batch",
333
+ "calculate_similarity",
334
+ "embed_mcp_resource",
335
+ ],
336
+ }
337
+
338
+ except Exception as e:
339
+ return {
340
+ "success": False,
341
+ "error": str(e),
342
+ "error_type": type(e).__name__,
343
+ "operation": operation,
344
+ "provider": provider,
345
+ "model": model,
346
+ }
347
+
348
+ def _embed_single_text(
349
+ self,
350
+ text: Optional[str],
351
+ provider: str,
352
+ model: str,
353
+ cache_enabled: bool,
354
+ cache_ttl: int,
355
+ dimensions: Optional[int],
356
+ normalize: bool,
357
+ timeout: int,
358
+ max_retries: int,
359
+ ) -> Dict[str, Any]:
360
+ """Generate embedding for a single text."""
361
+ if not text:
362
+ return {
363
+ "success": False,
364
+ "error": "input_text is required for embed_text operation",
365
+ }
366
+
367
+ # Check cache first if enabled
368
+ if cache_enabled:
369
+ cache_key = self._generate_cache_key(text, provider, model)
370
+ cached_embedding = self._get_cached_embedding(cache_key)
371
+ if cached_embedding:
372
+ return {
373
+ "success": True,
374
+ "operation": "embed_text",
375
+ "embedding": cached_embedding["vector"],
376
+ "dimensions": len(cached_embedding["vector"]),
377
+ "text_length": len(text),
378
+ "provider": provider,
379
+ "model": model,
380
+ "cached": True,
381
+ "cache_key": cache_key,
382
+ "processing_time_ms": 5, # Very fast cache lookup
383
+ }
384
+
385
+ # Generate embedding using provider
386
+ start_time = time.time()
387
+
388
+ if provider == "mock":
389
+ embedding_vector = self._generate_mock_embedding(text, dimensions or 1536)
390
+ else:
391
+ embedding_vector = self._generate_provider_embedding(
392
+ text, provider, model, dimensions, timeout, max_retries
393
+ )
394
+
395
+ processing_time = (time.time() - start_time) * 1000
396
+
397
+ # Normalize if requested
398
+ if normalize:
399
+ embedding_vector = self._normalize_vector(embedding_vector)
400
+
401
+ # Cache the result
402
+ if cache_enabled:
403
+ self._cache_embedding(cache_key, embedding_vector, cache_ttl)
404
+
405
+ return {
406
+ "success": True,
407
+ "operation": "embed_text",
408
+ "embedding": embedding_vector,
409
+ "dimensions": len(embedding_vector),
410
+ "text_length": len(text),
411
+ "provider": provider,
412
+ "model": model,
413
+ "cached": False,
414
+ "processing_time_ms": round(processing_time, 2),
415
+ "usage": {
416
+ "tokens": len(text.split()),
417
+ "estimated_cost_usd": self._estimate_embedding_cost(
418
+ len(text.split()), provider, model
419
+ ),
420
+ },
421
+ }
422
+
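For reference, a minimal sketch of what a cache-miss embed_text call returns. It assumes the node can be constructed with no arguments and run() called directly, exactly as the class docstring examples show; the mock provider keeps it self-contained (no API keys).

    # Field names mirror the return dict built in _embed_single_text above.
    from kailash.nodes.ai.embedding_generator import EmbeddingGenerator

    embedder = EmbeddingGenerator()
    result = embedder.run(operation="embed_text", provider="mock",
                          model="default", input_text="hello world")
    assert result["success"] and result["cached"] is False
    print(result["dimensions"])           # 1536 (mock default)
    print(result["usage"]["tokens"])      # 2 (whitespace word count)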
423
+ def _embed_batch_texts(
424
+ self,
425
+ texts: List[str],
426
+ provider: str,
427
+ model: str,
428
+ batch_size: int,
429
+ chunk_size: int,
430
+ cache_enabled: bool,
431
+ cache_ttl: int,
432
+ dimensions: Optional[int],
433
+ normalize: bool,
434
+ timeout: int,
435
+ max_retries: int,
436
+ ) -> Dict[str, Any]:
437
+ """Generate embeddings for a batch of texts."""
438
+ if not texts:
439
+ return {
440
+ "success": False,
441
+ "error": "input_texts is required and cannot be empty for embed_batch operation",
442
+ }
443
+
444
+ start_time = time.time()
445
+ embeddings = []
446
+ cache_hits = 0
447
+ total_tokens = 0
448
+
449
+ # Process texts in batches
450
+ for i in range(0, len(texts), batch_size):
451
+ batch = texts[i : i + batch_size]
452
+
453
+ for text in batch:
454
+ # Check cache first
455
+ cache_key = None
456
+ if cache_enabled:
457
+ cache_key = self._generate_cache_key(text, provider, model)
458
+ cached_embedding = self._get_cached_embedding(cache_key)
459
+ if cached_embedding:
460
+ embeddings.append(
461
+ {
462
+ "text": text[:100] + "..." if len(text) > 100 else text,
463
+ "embedding": cached_embedding["vector"],
464
+ "cached": True,
465
+ "dimensions": len(cached_embedding["vector"]),
466
+ }
467
+ )
468
+ cache_hits += 1
469
+ continue
470
+
471
+ # Chunk text if too long
472
+ chunks = self._chunk_text(text, chunk_size)
473
+ if len(chunks) > 1:
474
+ # For multiple chunks, embed each and average
475
+ chunk_embeddings = []
476
+ for chunk in chunks:
477
+ if provider == "mock":
478
+ chunk_emb = self._generate_mock_embedding(
479
+ chunk, dimensions or 1536
480
+ )
481
+ else:
482
+ chunk_emb = self._generate_provider_embedding(
483
+ chunk, provider, model, dimensions, timeout, max_retries
484
+ )
485
+ chunk_embeddings.append(chunk_emb)
486
+
487
+ # Average embeddings
488
+ embedding_vector = self._average_embeddings(chunk_embeddings)
489
+ else:
490
+ # Single chunk
491
+ if provider == "mock":
492
+ embedding_vector = self._generate_mock_embedding(
493
+ text, dimensions or 1536
494
+ )
495
+ else:
496
+ embedding_vector = self._generate_provider_embedding(
497
+ text, provider, model, dimensions, timeout, max_retries
498
+ )
499
+
500
+ # Normalize if requested
501
+ if normalize:
502
+ embedding_vector = self._normalize_vector(embedding_vector)
503
+
504
+ # Cache the result
505
+ if cache_enabled and cache_key:
506
+ self._cache_embedding(cache_key, embedding_vector, cache_ttl)
507
+
508
+ embeddings.append(
509
+ {
510
+ "text": text[:100] + "..." if len(text) > 100 else text,
511
+ "embedding": embedding_vector,
512
+ "cached": False,
513
+ "dimensions": len(embedding_vector),
514
+ "chunks": len(chunks),
515
+ }
516
+ )
517
+
518
+ total_tokens += len(text.split())
519
+
520
+ processing_time = (time.time() - start_time) * 1000
521
+
522
+ return {
523
+ "success": True,
524
+ "operation": "embed_batch",
525
+ "embeddings": embeddings,
526
+ "total_texts": len(texts),
527
+ "total_embeddings": len(embeddings),
528
+ "cache_hits": cache_hits,
529
+ "cache_hit_rate": cache_hits / len(texts) if texts else 0,
530
+ "provider": provider,
531
+ "model": model,
532
+ "batch_size": batch_size,
533
+ "processing_time_ms": round(processing_time, 2),
534
+ "usage": {
535
+ "total_tokens": total_tokens,
536
+ "estimated_cost_usd": self._estimate_embedding_cost(
537
+ total_tokens, provider, model
538
+ ),
539
+ "average_tokens_per_text": total_tokens / len(texts) if texts else 0,
540
+ },
541
+ }
542
+
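The long-text path above chunks the input, embeds each chunk, averages the vectors element-wise, and then normalizes. A standalone sketch of that averaging step, using hypothetical toy vectors rather than real provider output:

    # Two per-chunk vectors (hypothetical), averaged then re-normalized.
    chunk_embeddings = [[1.0, 0.0], [0.0, 1.0]]
    dims = len(chunk_embeddings[0])
    averaged = [sum(e[i] for e in chunk_embeddings) / len(chunk_embeddings)
                for i in range(dims)]                 # [0.5, 0.5]
    magnitude = sum(x * x for x in averaged) ** 0.5
    normalized = [x / magnitude for x in averaged]    # [0.7071..., 0.7071...]
    print(normalized)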
543
+ def _calculate_similarity(
544
+ self,
545
+ embedding_1: Optional[List[float]],
546
+ embedding_2: Optional[List[float]],
547
+ metric: str,
548
+ ) -> Dict[str, Any]:
549
+ """Calculate similarity between two embedding vectors."""
550
+ if not embedding_1 or not embedding_2:
551
+ return {
552
+ "success": False,
553
+ "error": "Both embedding_1 and embedding_2 are required for similarity calculation",
554
+ }
555
+
556
+ if len(embedding_1) != len(embedding_2):
557
+ return {
558
+ "success": False,
559
+ "error": f"Embedding dimensions must match: {len(embedding_1)} vs {len(embedding_2)}",
560
+ }
561
+
562
+ try:
563
+ if metric == "cosine":
564
+ similarity = self._cosine_similarity(embedding_1, embedding_2)
565
+ elif metric == "euclidean":
566
+ similarity = self._euclidean_distance(embedding_1, embedding_2)
567
+ elif metric == "dot_product":
568
+ similarity = self._dot_product(embedding_1, embedding_2)
569
+ else:
570
+ return {
571
+ "success": False,
572
+ "error": f"Unsupported similarity metric: {metric}",
573
+ "supported_metrics": ["cosine", "euclidean", "dot_product"],
574
+ }
575
+
576
+ return {
577
+ "success": True,
578
+ "operation": "calculate_similarity",
579
+ "similarity": similarity,
580
+ "metric": metric,
581
+ "dimensions": len(embedding_1),
582
+ "interpretation": self._interpret_similarity(similarity, metric),
583
+ }
584
+
585
+ except Exception as e:
586
+ return {
587
+ "success": False,
588
+ "error": f"Similarity calculation failed: {str(e)}",
589
+ "metric": metric,
590
+ }
591
+
592
+ def _embed_mcp_resource(
593
+ self,
594
+ resource_uri: Optional[str],
595
+ provider: str,
596
+ model: str,
597
+ chunk_size: int,
598
+ cache_enabled: bool,
599
+ cache_ttl: int,
600
+ dimensions: Optional[int],
601
+ normalize: bool,
602
+ timeout: int,
603
+ max_retries: int,
604
+ ) -> Dict[str, Any]:
605
+ """Embed content from an MCP resource."""
606
+ if not resource_uri:
607
+ return {
608
+ "success": False,
609
+ "error": "mcp_resource_uri is required for embed_mcp_resource operation",
610
+ }
611
+
612
+ # Mock MCP resource retrieval
613
+ mock_content = f"Mock content from MCP resource: {resource_uri}\n\nThis would contain the actual document or data content retrieved from the MCP server."
614
+
615
+ # Use the existing embed_text functionality
616
+ result = self._embed_single_text(
617
+ mock_content,
618
+ provider,
619
+ model,
620
+ cache_enabled,
621
+ cache_ttl,
622
+ dimensions,
623
+ normalize,
624
+ timeout,
625
+ max_retries,
626
+ )
627
+
628
+ if result["success"]:
629
+ result.update(
630
+ {
631
+ "operation": "embed_mcp_resource",
632
+ "mcp_resource_uri": resource_uri,
633
+ "content_preview": mock_content[:200] + "...",
634
+ }
635
+ )
636
+
637
+ return result
638
+
639
+ def _generate_mock_embedding(self, text: str, dimensions: int) -> List[float]:
640
+ """Generate a mock embedding vector based on text content."""
641
+ import hashlib
642
+ import random
643
+
644
+ # Use text hash as seed for reproducible mock embeddings
645
+ seed = int(hashlib.md5(text.encode()).hexdigest()[:8], 16)
646
+ rng = random.Random(seed)  # local RNG so global random state is untouched
647
+
648
+ # Generate normalized random vector
649
+ vector = [rng.gauss(0, 1) for _ in range(dimensions)]
650
+
651
+ # Normalize to unit length
652
+ magnitude = sum(x * x for x in vector) ** 0.5
653
+ return [x / magnitude for x in vector]
654
+
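Because the mock embedding is seeded from an MD5 hash of the text, the same input always yields the same unit-length vector. A self-contained sketch mirroring that logic (it does not call the node itself):

    import hashlib
    import random

    def mock_embedding(text: str, dimensions: int = 8) -> list:
        # Seed a private RNG from the text hash so output is reproducible.
        seed = int(hashlib.md5(text.encode()).hexdigest()[:8], 16)
        rng = random.Random(seed)
        vector = [rng.gauss(0, 1) for _ in range(dimensions)]
        magnitude = sum(x * x for x in vector) ** 0.5
        return [x / magnitude for x in vector]

    assert mock_embedding("same text") == mock_embedding("same text")
    assert mock_embedding("same text") != mock_embedding("different text")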
655
+ def _generate_provider_embedding(
656
+ self,
657
+ text: str,
658
+ provider: str,
659
+ model: str,
660
+ dimensions: Optional[int],
661
+ timeout: int,
662
+ max_retries: int,
663
+ ) -> List[float]:
664
+ """Generate embedding using external provider."""
665
+ try:
666
+ from .ai_providers import get_provider
667
+
668
+ # Get the provider instance
669
+ provider_instance = get_provider(provider, "embeddings")
670
+
671
+ # Check if provider is available
672
+ if not provider_instance.is_available():
673
+ raise RuntimeError(
674
+ f"Provider {provider} is not available. Check dependencies and configuration."
675
+ )
676
+
677
+ # Prepare kwargs for the provider
678
+ kwargs = {"model": model, "timeout": timeout}
679
+
680
+ # Add dimensions if specified and provider supports it
681
+ if dimensions and provider in ["openai"]:
682
+ kwargs["dimensions"] = dimensions
683
+
684
+ # Provider-specific parameters
685
+ if provider == "cohere":
686
+ kwargs["input_type"] = "search_document"
687
+ elif provider == "huggingface":
688
+ kwargs["use_api"] = True # Default to API for consistency
689
+
690
+ # Generate embedding
691
+ embeddings = provider_instance.embed([text], **kwargs)
692
+
693
+ # Return the first (and only) embedding
694
+ return embeddings[0] if embeddings else []
695
+
696
+ except ImportError:
697
+ # Fallback to the original implementation if ai_providers not available
698
+ return self._fallback_provider_embedding(
699
+ text, provider, model, dimensions, timeout, max_retries
700
+ )
701
+ except Exception as e:
702
+ raise RuntimeError(f"Provider {provider} error: {str(e)}") from e
703
+
704
+ def _fallback_provider_embedding(
705
+ self,
706
+ text: str,
707
+ provider: str,
708
+ model: str,
709
+ dimensions: Optional[int],
710
+ timeout: int,
711
+ max_retries: int,
712
+ ) -> List[float]:
713
+ """Fallback implementation for backward compatibility."""
714
+ # Handle Ollama provider
715
+ if provider == "ollama":
716
+ try:
717
+ import ollama
718
+
719
+ response = ollama.embeddings(model=model, prompt=text)
720
+ return response.get("embedding", [])
721
+ except ImportError:
722
+ raise RuntimeError(
723
+ "Ollama library not installed. Install with: pip install ollama"
724
+ )
725
+ except Exception as e:
726
+ raise RuntimeError(f"Ollama embedding error: {str(e)}")
727
+
728
+ # Default dimensions for other providers
729
+ default_dimensions = {
730
+ "openai": {"text-embedding-3-large": 3072, "text-embedding-3-small": 1536},
731
+ "huggingface": {"all-MiniLM-L6-v2": 384, "all-mpnet-base-v2": 768},
732
+ "azure": {"text-embedding-3-large": 3072},
733
+ "cohere": {"embed-english-v3.0": 1024},
734
+ }
735
+
736
+ actual_dimensions = dimensions or default_dimensions.get(provider, {}).get(
737
+ model, 1536
738
+ )
739
+
740
+ # For now, other providers use mock embeddings
741
+ # In real implementation, this would call the actual provider APIs
742
+ return self._generate_mock_embedding(
743
+ f"{provider}:{model}:{text}", actual_dimensions
744
+ )
745
+
746
+ def _chunk_text(self, text: str, chunk_size: int) -> List[str]:
747
+ """Split text into chunks based on token limit."""
748
+ # Simple word-based chunking (real implementation would use proper tokenization)
749
+ words = text.split()
750
+ chunks = []
751
+
752
+ for i in range(0, len(words), chunk_size):
753
+ chunk_words = words[i : i + chunk_size]
754
+ chunks.append(" ".join(chunk_words))
755
+
756
+ return chunks or [""]
757
+
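Chunking here is word-based rather than tokenizer-based, so chunk_size effectively counts whitespace-separated words. A quick illustration of the same splitting logic:

    words = "one two three four five six seven".split()
    chunk_size = 3
    chunks = [" ".join(words[i:i + chunk_size])
              for i in range(0, len(words), chunk_size)]
    print(chunks)  # ['one two three', 'four five six', 'seven']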
758
+ def _average_embeddings(self, embeddings: List[List[float]]) -> List[float]:
759
+ """Average multiple embedding vectors."""
760
+ if not embeddings:
761
+ return []
762
+
763
+ dimensions = len(embeddings[0])
764
+ averaged = []
765
+
766
+ for i in range(dimensions):
767
+ avg_value = sum(emb[i] for emb in embeddings) / len(embeddings)
768
+ averaged.append(avg_value)
769
+
770
+ return averaged
771
+
772
+ def _normalize_vector(self, vector: List[float]) -> List[float]:
773
+ """Normalize vector to unit length."""
774
+ magnitude = sum(x * x for x in vector) ** 0.5
775
+ if magnitude == 0:
776
+ return vector
777
+ return [x / magnitude for x in vector]
778
+
779
+ def _cosine_similarity(self, vec1: List[float], vec2: List[float]) -> float:
780
+ """Calculate cosine similarity between two vectors."""
781
+ dot_product = sum(a * b for a, b in zip(vec1, vec2))
782
+ magnitude1 = sum(a * a for a in vec1) ** 0.5
783
+ magnitude2 = sum(b * b for b in vec2) ** 0.5
784
+
785
+ if magnitude1 == 0 or magnitude2 == 0:
786
+ return 0.0
787
+
788
+ return dot_product / (magnitude1 * magnitude2)
789
+
790
+ def _euclidean_distance(self, vec1: List[float], vec2: List[float]) -> float:
791
+ """Calculate Euclidean distance between two vectors."""
792
+ return sum((a - b) ** 2 for a, b in zip(vec1, vec2)) ** 0.5
793
+
794
+ def _dot_product(self, vec1: List[float], vec2: List[float]) -> float:
795
+ """Calculate dot product of two vectors."""
796
+ return sum(a * b for a, b in zip(vec1, vec2))
797
+
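For intuition, a worked cosine-similarity example using the same formula as _cosine_similarity above:

    vec1, vec2 = [1.0, 0.0], [1.0, 1.0]
    dot = sum(a * b for a, b in zip(vec1, vec2))   # 1.0
    mag1 = sum(a * a for a in vec1) ** 0.5         # 1.0
    mag2 = sum(b * b for b in vec2) ** 0.5         # sqrt(2)
    print(dot / (mag1 * mag2))                     # 0.7071..., "Similar" per the bands below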
798
+ def _interpret_similarity(self, score: float, metric: str) -> str:
799
+ """Provide human-readable interpretation of similarity score."""
800
+ if metric == "cosine":
801
+ if score > 0.9:
802
+ return "Very similar"
803
+ elif score > 0.7:
804
+ return "Similar"
805
+ elif score > 0.5:
806
+ return "Somewhat similar"
807
+ elif score > 0.3:
808
+ return "Slightly similar"
809
+ else:
810
+ return "Not similar"
811
+ elif metric == "euclidean":
812
+ if score < 0.5:
813
+ return "Very similar"
814
+ elif score < 1.0:
815
+ return "Similar"
816
+ elif score < 2.0:
817
+ return "Somewhat similar"
818
+ else:
819
+ return "Not similar"
820
+ else: # dot_product
821
+ return f"Dot product: {score:.3f}"
822
+
823
+ def _generate_cache_key(self, text: str, provider: str, model: str) -> str:
824
+ """Generate cache key for embedding."""
825
+ import hashlib
826
+
827
+ content = f"{provider}:{model}:{text}"
828
+ return f"emb_{hashlib.md5(content.encode()).hexdigest()[:16]}"
829
+
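Cache keys are derived from provider, model, and the full text, so changing any of the three produces a different entry. A standalone sketch of the same key format:

    import hashlib

    provider, model, text = "openai", "text-embedding-3-small", "hello world"
    content = f"{provider}:{model}:{text}"
    cache_key = f"emb_{hashlib.md5(content.encode()).hexdigest()[:16]}"
    print(cache_key)  # "emb_" followed by 16 hex characters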
830
+ def _get_cached_embedding(self, cache_key: str) -> Optional[Dict[str, Any]]:
831
+ """Retrieve embedding from cache (mock implementation)."""
832
+ # Mock cache lookup - in real implementation, use Redis or similar
833
+ return None
834
+
835
+ def _cache_embedding(self, cache_key: str, vector: List[float], ttl: int) -> None:
836
+ """Store embedding in cache (mock implementation)."""
837
+ # Mock cache storage - in real implementation, use Redis or similar
838
+ pass
839
+
840
+ def _estimate_embedding_cost(self, tokens: int, provider: str, model: str) -> float:
841
+ """Estimate embedding cost based on tokens and provider pricing."""
842
+ # Mock cost estimation (real implementation would use current pricing)
843
+ cost_per_1k_tokens = {
844
+ "openai": {
845
+ "text-embedding-3-large": 0.00013,
846
+ "text-embedding-3-small": 0.00002,
847
+ },
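run() simply dispatches on the operation string, and unsupported operations come back as an error dict rather than an exception. A minimal sketch, again assuming the node can be instantiated with no arguments as in the class docstring:

    from kailash.nodes.ai.embedding_generator import EmbeddingGenerator

    node = EmbeddingGenerator()
    ok = node.run(operation="calculate_similarity",
                  provider="mock",
                  input_texts=["first document", "second document"])
    print(ok["similarity"], ok["interpretation"])

    bad = node.run(operation="embed_everything")   # intentionally unsupported
    print(bad["success"], bad["supported_operations"])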
848
+ "azure": {"text-embedding-3-large": 0.00013},
849
+ "cohere": {"embed-english-v3.0": 0.0001},
850
+ }
851
+
852
+ rate = cost_per_1k_tokens.get(provider, {}).get(model, 0.0001)
853
+ return (tokens / 1000) * rate
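Cost estimation is plain per-1K-token arithmetic against the table above. For example, 2,500 whitespace-delimited tokens through text-embedding-3-small at the listed $0.00002 per 1K tokens:

    tokens = 2500
    rate = 0.00002
    print((tokens / 1000) * rate)   # 0.00005 USD (estimate only; real pricing may differ)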