mcli_framework-7.0.0-py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mcli-framework might be problematic.

Files changed (186)
  1. mcli/app/chat_cmd.py +42 -0
  2. mcli/app/commands_cmd.py +226 -0
  3. mcli/app/completion_cmd.py +216 -0
  4. mcli/app/completion_helpers.py +288 -0
  5. mcli/app/cron_test_cmd.py +697 -0
  6. mcli/app/logs_cmd.py +419 -0
  7. mcli/app/main.py +492 -0
  8. mcli/app/model/model.py +1060 -0
  9. mcli/app/model_cmd.py +227 -0
  10. mcli/app/redis_cmd.py +269 -0
  11. mcli/app/video/video.py +1114 -0
  12. mcli/app/visual_cmd.py +303 -0
  13. mcli/chat/chat.py +2409 -0
  14. mcli/chat/command_rag.py +514 -0
  15. mcli/chat/enhanced_chat.py +652 -0
  16. mcli/chat/system_controller.py +1010 -0
  17. mcli/chat/system_integration.py +1016 -0
  18. mcli/cli.py +25 -0
  19. mcli/config.toml +20 -0
  20. mcli/lib/api/api.py +586 -0
  21. mcli/lib/api/daemon_client.py +203 -0
  22. mcli/lib/api/daemon_client_local.py +44 -0
  23. mcli/lib/api/daemon_decorator.py +217 -0
  24. mcli/lib/api/mcli_decorators.py +1032 -0
  25. mcli/lib/auth/auth.py +85 -0
  26. mcli/lib/auth/aws_manager.py +85 -0
  27. mcli/lib/auth/azure_manager.py +91 -0
  28. mcli/lib/auth/credential_manager.py +192 -0
  29. mcli/lib/auth/gcp_manager.py +93 -0
  30. mcli/lib/auth/key_manager.py +117 -0
  31. mcli/lib/auth/mcli_manager.py +93 -0
  32. mcli/lib/auth/token_manager.py +75 -0
  33. mcli/lib/auth/token_util.py +1011 -0
  34. mcli/lib/config/config.py +47 -0
  35. mcli/lib/discovery/__init__.py +1 -0
  36. mcli/lib/discovery/command_discovery.py +274 -0
  37. mcli/lib/erd/erd.py +1345 -0
  38. mcli/lib/erd/generate_graph.py +453 -0
  39. mcli/lib/files/files.py +76 -0
  40. mcli/lib/fs/fs.py +109 -0
  41. mcli/lib/lib.py +29 -0
  42. mcli/lib/logger/logger.py +611 -0
  43. mcli/lib/performance/optimizer.py +409 -0
  44. mcli/lib/performance/rust_bridge.py +502 -0
  45. mcli/lib/performance/uvloop_config.py +154 -0
  46. mcli/lib/pickles/pickles.py +50 -0
  47. mcli/lib/search/cached_vectorizer.py +479 -0
  48. mcli/lib/services/data_pipeline.py +460 -0
  49. mcli/lib/services/lsh_client.py +441 -0
  50. mcli/lib/services/redis_service.py +387 -0
  51. mcli/lib/shell/shell.py +137 -0
  52. mcli/lib/toml/toml.py +33 -0
  53. mcli/lib/ui/styling.py +47 -0
  54. mcli/lib/ui/visual_effects.py +634 -0
  55. mcli/lib/watcher/watcher.py +185 -0
  56. mcli/ml/api/app.py +215 -0
  57. mcli/ml/api/middleware.py +224 -0
  58. mcli/ml/api/routers/admin_router.py +12 -0
  59. mcli/ml/api/routers/auth_router.py +244 -0
  60. mcli/ml/api/routers/backtest_router.py +12 -0
  61. mcli/ml/api/routers/data_router.py +12 -0
  62. mcli/ml/api/routers/model_router.py +302 -0
  63. mcli/ml/api/routers/monitoring_router.py +12 -0
  64. mcli/ml/api/routers/portfolio_router.py +12 -0
  65. mcli/ml/api/routers/prediction_router.py +267 -0
  66. mcli/ml/api/routers/trade_router.py +12 -0
  67. mcli/ml/api/routers/websocket_router.py +76 -0
  68. mcli/ml/api/schemas.py +64 -0
  69. mcli/ml/auth/auth_manager.py +425 -0
  70. mcli/ml/auth/models.py +154 -0
  71. mcli/ml/auth/permissions.py +302 -0
  72. mcli/ml/backtesting/backtest_engine.py +502 -0
  73. mcli/ml/backtesting/performance_metrics.py +393 -0
  74. mcli/ml/cache.py +400 -0
  75. mcli/ml/cli/main.py +398 -0
  76. mcli/ml/config/settings.py +394 -0
  77. mcli/ml/configs/dvc_config.py +230 -0
  78. mcli/ml/configs/mlflow_config.py +131 -0
  79. mcli/ml/configs/mlops_manager.py +293 -0
  80. mcli/ml/dashboard/app.py +532 -0
  81. mcli/ml/dashboard/app_integrated.py +738 -0
  82. mcli/ml/dashboard/app_supabase.py +560 -0
  83. mcli/ml/dashboard/app_training.py +615 -0
  84. mcli/ml/dashboard/cli.py +51 -0
  85. mcli/ml/data_ingestion/api_connectors.py +501 -0
  86. mcli/ml/data_ingestion/data_pipeline.py +567 -0
  87. mcli/ml/data_ingestion/stream_processor.py +512 -0
  88. mcli/ml/database/migrations/env.py +94 -0
  89. mcli/ml/database/models.py +667 -0
  90. mcli/ml/database/session.py +200 -0
  91. mcli/ml/experimentation/ab_testing.py +845 -0
  92. mcli/ml/features/ensemble_features.py +607 -0
  93. mcli/ml/features/political_features.py +676 -0
  94. mcli/ml/features/recommendation_engine.py +809 -0
  95. mcli/ml/features/stock_features.py +573 -0
  96. mcli/ml/features/test_feature_engineering.py +346 -0
  97. mcli/ml/logging.py +85 -0
  98. mcli/ml/mlops/data_versioning.py +518 -0
  99. mcli/ml/mlops/experiment_tracker.py +377 -0
  100. mcli/ml/mlops/model_serving.py +481 -0
  101. mcli/ml/mlops/pipeline_orchestrator.py +614 -0
  102. mcli/ml/models/base_models.py +324 -0
  103. mcli/ml/models/ensemble_models.py +675 -0
  104. mcli/ml/models/recommendation_models.py +474 -0
  105. mcli/ml/models/test_models.py +487 -0
  106. mcli/ml/monitoring/drift_detection.py +676 -0
  107. mcli/ml/monitoring/metrics.py +45 -0
  108. mcli/ml/optimization/portfolio_optimizer.py +834 -0
  109. mcli/ml/preprocessing/data_cleaners.py +451 -0
  110. mcli/ml/preprocessing/feature_extractors.py +491 -0
  111. mcli/ml/preprocessing/ml_pipeline.py +382 -0
  112. mcli/ml/preprocessing/politician_trading_preprocessor.py +569 -0
  113. mcli/ml/preprocessing/test_preprocessing.py +294 -0
  114. mcli/ml/scripts/populate_sample_data.py +200 -0
  115. mcli/ml/tasks.py +400 -0
  116. mcli/ml/tests/test_integration.py +429 -0
  117. mcli/ml/tests/test_training_dashboard.py +387 -0
  118. mcli/public/oi/oi.py +15 -0
  119. mcli/public/public.py +4 -0
  120. mcli/self/self_cmd.py +1246 -0
  121. mcli/workflow/daemon/api_daemon.py +800 -0
  122. mcli/workflow/daemon/async_command_database.py +681 -0
  123. mcli/workflow/daemon/async_process_manager.py +591 -0
  124. mcli/workflow/daemon/client.py +530 -0
  125. mcli/workflow/daemon/commands.py +1196 -0
  126. mcli/workflow/daemon/daemon.py +905 -0
  127. mcli/workflow/daemon/daemon_api.py +59 -0
  128. mcli/workflow/daemon/enhanced_daemon.py +571 -0
  129. mcli/workflow/daemon/process_cli.py +244 -0
  130. mcli/workflow/daemon/process_manager.py +439 -0
  131. mcli/workflow/daemon/test_daemon.py +275 -0
  132. mcli/workflow/dashboard/dashboard_cmd.py +113 -0
  133. mcli/workflow/docker/docker.py +0 -0
  134. mcli/workflow/file/file.py +100 -0
  135. mcli/workflow/gcloud/config.toml +21 -0
  136. mcli/workflow/gcloud/gcloud.py +58 -0
  137. mcli/workflow/git_commit/ai_service.py +328 -0
  138. mcli/workflow/git_commit/commands.py +430 -0
  139. mcli/workflow/lsh_integration.py +355 -0
  140. mcli/workflow/model_service/client.py +594 -0
  141. mcli/workflow/model_service/download_and_run_efficient_models.py +288 -0
  142. mcli/workflow/model_service/lightweight_embedder.py +397 -0
  143. mcli/workflow/model_service/lightweight_model_server.py +714 -0
  144. mcli/workflow/model_service/lightweight_test.py +241 -0
  145. mcli/workflow/model_service/model_service.py +1955 -0
  146. mcli/workflow/model_service/ollama_efficient_runner.py +425 -0
  147. mcli/workflow/model_service/pdf_processor.py +386 -0
  148. mcli/workflow/model_service/test_efficient_runner.py +234 -0
  149. mcli/workflow/model_service/test_example.py +315 -0
  150. mcli/workflow/model_service/test_integration.py +131 -0
  151. mcli/workflow/model_service/test_new_features.py +149 -0
  152. mcli/workflow/openai/openai.py +99 -0
  153. mcli/workflow/politician_trading/commands.py +1790 -0
  154. mcli/workflow/politician_trading/config.py +134 -0
  155. mcli/workflow/politician_trading/connectivity.py +490 -0
  156. mcli/workflow/politician_trading/data_sources.py +395 -0
  157. mcli/workflow/politician_trading/database.py +410 -0
  158. mcli/workflow/politician_trading/demo.py +248 -0
  159. mcli/workflow/politician_trading/models.py +165 -0
  160. mcli/workflow/politician_trading/monitoring.py +413 -0
  161. mcli/workflow/politician_trading/scrapers.py +966 -0
  162. mcli/workflow/politician_trading/scrapers_california.py +412 -0
  163. mcli/workflow/politician_trading/scrapers_eu.py +377 -0
  164. mcli/workflow/politician_trading/scrapers_uk.py +350 -0
  165. mcli/workflow/politician_trading/scrapers_us_states.py +438 -0
  166. mcli/workflow/politician_trading/supabase_functions.py +354 -0
  167. mcli/workflow/politician_trading/workflow.py +852 -0
  168. mcli/workflow/registry/registry.py +180 -0
  169. mcli/workflow/repo/repo.py +223 -0
  170. mcli/workflow/scheduler/commands.py +493 -0
  171. mcli/workflow/scheduler/cron_parser.py +238 -0
  172. mcli/workflow/scheduler/job.py +182 -0
  173. mcli/workflow/scheduler/monitor.py +139 -0
  174. mcli/workflow/scheduler/persistence.py +324 -0
  175. mcli/workflow/scheduler/scheduler.py +679 -0
  176. mcli/workflow/sync/sync_cmd.py +437 -0
  177. mcli/workflow/sync/test_cmd.py +314 -0
  178. mcli/workflow/videos/videos.py +242 -0
  179. mcli/workflow/wakatime/wakatime.py +11 -0
  180. mcli/workflow/workflow.py +37 -0
  181. mcli_framework-7.0.0.dist-info/METADATA +479 -0
  182. mcli_framework-7.0.0.dist-info/RECORD +186 -0
  183. mcli_framework-7.0.0.dist-info/WHEEL +5 -0
  184. mcli_framework-7.0.0.dist-info/entry_points.txt +7 -0
  185. mcli_framework-7.0.0.dist-info/licenses/LICENSE +21 -0
  186. mcli_framework-7.0.0.dist-info/top_level.txt +1 -0
mcli/lib/search/cached_vectorizer.py (new file)
@@ -0,0 +1,479 @@
+ """
+ Cached TF-IDF Vectorizer with Redis support for high-performance text similarity
+ """
+
+ import asyncio
+ import hashlib
+ import json
+ import pickle
+ from datetime import datetime, timedelta
+ from typing import Any, Dict, List, Optional, Tuple
+
+ import numpy as np
+
+ # Optional redis import - gracefully handle if not installed
+ try:
+     import redis.asyncio as redis
+     REDIS_AVAILABLE = True
+ except ImportError:
+     REDIS_AVAILABLE = False
+     redis = None  # type: ignore
+
+ from mcli.lib.logger.logger import get_logger
+
+ logger = get_logger(__name__)
+
+
+ class CachedTfIdfVectorizer:
+     """
+     TF-IDF Vectorizer with Redis caching for improved performance.
+     Uses the Rust implementation when available, otherwise falls back to sklearn.
+     """
+
+     def __init__(
+         self,
+         redis_url: str = "redis://localhost:6379",
+         cache_ttl: int = 3600,
+         cache_prefix: str = "tfidf",
+         use_rust: bool = True,
+     ):
+         self.redis_url = redis_url
+         self.cache_ttl = cache_ttl
+         self.cache_prefix = cache_prefix
+         self.use_rust = use_rust
+
+         self.redis_client: Optional[Any] = None  # redis.Redis when available
+         self.vectorizer = None
+         self.is_fitted = False
+
+         # Cache stats
+         self.cache_hits = 0
+         self.cache_misses = 0
+
+     async def initialize(self):
+         """Initialize Redis connection and vectorizer"""
+         await self._init_redis()
+         await self._init_vectorizer()
+
+     async def _init_redis(self):
+         """Initialize Redis connection"""
+         if not REDIS_AVAILABLE:
+             logger.warning("Redis is not installed. Caching disabled.")
+             self.redis_client = None
+             return
+
+         try:
+             # Try to ensure Redis is running through the service manager
+             try:
+                 from mcli.lib.services.redis_service import ensure_redis_running
+
+                 await ensure_redis_running()
+             except ImportError:
+                 logger.debug("Redis service manager not available")
+
+             self.redis_client = redis.from_url(self.redis_url)  # type: ignore
+             await self.redis_client.ping()
+             logger.info("Connected to Redis for TF-IDF caching")
+         except Exception as e:
+             logger.warning(f"Failed to connect to Redis: {e}. Caching disabled.")
+             self.redis_client = None
+
+     async def _init_vectorizer(self):
+         """Initialize the appropriate vectorizer implementation"""
+         if self.use_rust:
+             try:
+                 # Try to use Rust implementation
+                 import mcli_rust
+
+                 self.vectorizer = mcli_rust.TfIdfVectorizer(
+                     max_features=1000, min_df=1, max_df=0.95, ngram_range=(1, 2)
+                 )
+                 logger.info("Using Rust TF-IDF vectorizer for enhanced performance")
+                 return
+             except ImportError:
+                 logger.warning("Rust vectorizer not available, falling back to sklearn")
+
+         # Fallback to sklearn
+         try:
+             from sklearn.feature_extraction.text import TfidfVectorizer
+
+             self.vectorizer = TfidfVectorizer(
+                 max_features=1000, stop_words="english", ngram_range=(1, 2), min_df=1, max_df=0.95
+             )
+             logger.info("Using sklearn TF-IDF vectorizer")
+         except ImportError:
+             raise RuntimeError("Neither Rust nor sklearn TF-IDF implementation available")
+
+     async def fit_transform(self, documents: List[str]) -> np.ndarray:
+         """Fit the vectorizer and transform documents with caching"""
+         # Generate cache key for the document set
+         cache_key = self._generate_cache_key(documents, "fit_transform")
+
+         # Try to get from cache
+         cached_result = await self._get_from_cache(cache_key)
+         if cached_result is not None:
+             self.cache_hits += 1
+             vectors, feature_names = cached_result
+             self.is_fitted = True
+             return vectors
+
+         self.cache_misses += 1
+
+         # Compute TF-IDF vectors
+         if hasattr(self.vectorizer, "fit_transform") and hasattr(
+             self.vectorizer, "get_feature_names_out"
+         ):
+             # sklearn implementation
+             vectors = self.vectorizer.fit_transform(documents).toarray()
+             feature_names = self.vectorizer.get_feature_names_out().tolist()
+         else:
+             # Rust implementation
+             result = self.vectorizer.fit_transform(documents)
+             if isinstance(result, list):
+                 vectors = np.array(result)
+             else:
+                 vectors = result.toarray() if hasattr(result, "toarray") else np.array(result)
+
+             if hasattr(self.vectorizer, "get_feature_names"):
+                 feature_names = self.vectorizer.get_feature_names()
+             else:
+                 feature_names = [
+                     f"feature_{i}"
+                     for i in range(vectors.shape[1] if len(vectors.shape) > 1 else len(vectors))
+                 ]
+
+         self.is_fitted = True
+
+         # Cache the result
+         await self._cache_result(cache_key, (vectors, feature_names))
+
+         return vectors
+
+     async def transform(self, documents: List[str]) -> np.ndarray:
+         """Transform documents using fitted vectorizer with caching"""
+         if not self.is_fitted:
+             raise ValueError("Vectorizer must be fitted before transform")
+
+         # Generate cache key for transformation
+         cache_key = self._generate_cache_key(documents, "transform")
+
+         # Try to get from cache
+         cached_result = await self._get_from_cache(cache_key)
+         if cached_result is not None:
+             self.cache_hits += 1
+             return cached_result
+
+         self.cache_misses += 1
+
+         # Compute vectors
+         if hasattr(self.vectorizer, "transform") and hasattr(
+             self.vectorizer, "get_feature_names_out"
+         ):
+             # sklearn implementation
+             vectors = self.vectorizer.transform(documents).toarray()
+         else:
+             # Rust implementation
+             result = self.vectorizer.transform(documents)
+             if isinstance(result, list):
+                 vectors = np.array(result)
+             else:
+                 vectors = result.toarray() if hasattr(result, "toarray") else np.array(result)
+
+         # Cache the result
+         await self._cache_result(cache_key, vectors)
+
+         return vectors
+
+     async def similarity_search(
+         self, query: str, documents: List[str], top_k: int = 10
+     ) -> List[Tuple[int, float]]:
+         """Perform similarity search with caching"""
+         # Generate cache key for similarity search
+         search_data = {"query": query, "documents": documents, "top_k": top_k}
+         cache_key = self._generate_cache_key_from_dict(search_data, "similarity")
+
+         # Try to get from cache
+         cached_result = await self._get_from_cache(cache_key)
+         if cached_result is not None:
+             self.cache_hits += 1
+             return cached_result
+
+         self.cache_misses += 1
+
+         # Compute similarity
+         if hasattr(self.vectorizer, "similarity"):
+             # Rust implementation with built-in similarity
+             similarities = self.vectorizer.similarity(query, documents)
+         else:
+             # sklearn implementation - need to compute manually
+             if not self.is_fitted:
+                 await self.fit_transform(documents)
+
+             query_vector = await self.transform([query])
+             doc_vectors = await self.transform(documents)
+
+             # Compute cosine similarity
+             similarities = []
+             for doc_vector in doc_vectors:
+                 similarity = self._cosine_similarity(query_vector[0], doc_vector)
+                 similarities.append(similarity)
+
+         # Get top-k results
+         indexed_similarities = [(i, sim) for i, sim in enumerate(similarities)]
+         indexed_similarities.sort(key=lambda x: x[1], reverse=True)
+         results = indexed_similarities[:top_k]
+
+         # Cache the result
+         await self._cache_result(cache_key, results)
+
+         return results
+
+     async def batch_similarity_search(
+         self, queries: List[str], documents: List[str], top_k: int = 10
+     ) -> List[List[Tuple[int, float]]]:
+         """Perform batch similarity search for multiple queries"""
+         # Try to use cached individual results first
+         results = []
+         uncached_queries = []
+         uncached_indices = []
+
+         for i, query in enumerate(queries):
+             cache_key = self._generate_cache_key_from_dict(
+                 {"query": query, "documents": documents, "top_k": top_k}, "similarity"
+             )
+
+             cached_result = await self._get_from_cache(cache_key)
+             if cached_result is not None:
+                 self.cache_hits += 1
+                 results.append(cached_result)
+             else:
+                 self.cache_misses += 1
+                 results.append(None)
+                 uncached_queries.append(query)
+                 uncached_indices.append(i)
+
+         # Process uncached queries in batch
+         if uncached_queries:
+             if hasattr(self.vectorizer, "similarity"):
+                 # Rust implementation might support batch processing
+                 for j, query in enumerate(uncached_queries):
+                     similarities = self.vectorizer.similarity(query, documents)
+                     indexed_similarities = [(i, sim) for i, sim in enumerate(similarities)]
+                     indexed_similarities.sort(key=lambda x: x[1], reverse=True)
+                     query_results = indexed_similarities[:top_k]
+
+                     # Update results and cache
+                     results[uncached_indices[j]] = query_results
+                     cache_key = self._generate_cache_key_from_dict(
+                         {"query": query, "documents": documents, "top_k": top_k}, "similarity"
+                     )
+                     await self._cache_result(cache_key, query_results)
+             else:
+                 # sklearn implementation
+                 if not self.is_fitted:
+                     await self.fit_transform(documents)
+
+                 query_vectors = await self.transform(uncached_queries)
+                 doc_vectors = await self.transform(documents)
+
+                 for j, query_vector in enumerate(query_vectors):
+                     similarities = []
+                     for doc_vector in doc_vectors:
+                         similarity = self._cosine_similarity(query_vector, doc_vector)
+                         similarities.append(similarity)
+
+                     indexed_similarities = [(i, sim) for i, sim in enumerate(similarities)]
+                     indexed_similarities.sort(key=lambda x: x[1], reverse=True)
+                     query_results = indexed_similarities[:top_k]
+
+                     # Update results and cache
+                     results[uncached_indices[j]] = query_results
+                     cache_key = self._generate_cache_key_from_dict(
+                         {"query": uncached_queries[j], "documents": documents, "top_k": top_k},
+                         "similarity",
+                     )
+                     await self._cache_result(cache_key, query_results)
+
+         return results
+
+     def _generate_cache_key(self, documents: List[str], operation: str) -> str:
+         """Generate a cache key for a list of documents and operation"""
+         content = f"{operation}:{':'.join(documents)}"
+         hash_obj = hashlib.sha256(content.encode("utf-8"))
+         return f"{self.cache_prefix}:{hash_obj.hexdigest()[:16]}"
+
+     def _generate_cache_key_from_dict(self, data: Dict[str, Any], operation: str) -> str:
+         """Generate a cache key from a dictionary"""
+         content = f"{operation}:{json.dumps(data, sort_keys=True)}"
+         hash_obj = hashlib.sha256(content.encode("utf-8"))
+         return f"{self.cache_prefix}:{hash_obj.hexdigest()[:16]}"
+
+     async def _get_from_cache(self, cache_key: str) -> Optional[Any]:
+         """Get result from Redis cache"""
+         if not self.redis_client:
+             return None
+
+         try:
+             cached_data = await self.redis_client.get(cache_key)
+             if cached_data:
+                 return pickle.loads(cached_data)
+         except Exception as e:
+             logger.warning(f"Failed to get from cache: {e}")
+
+         return None
+
+     async def _cache_result(self, cache_key: str, result: Any):
+         """Cache result in Redis"""
+         if not self.redis_client:
+             return
+
+         try:
+             serialized_result = pickle.dumps(result)
+             await self.redis_client.setex(cache_key, self.cache_ttl, serialized_result)
+         except Exception as e:
+             logger.warning(f"Failed to cache result: {e}")
+
+     def _cosine_similarity(self, vec1: np.ndarray, vec2: np.ndarray) -> float:
+         """Compute cosine similarity between two vectors"""
+         dot_product = np.dot(vec1, vec2)
+         norm1 = np.linalg.norm(vec1)
+         norm2 = np.linalg.norm(vec2)
+
+         if norm1 == 0 or norm2 == 0:
+             return 0.0
+
+         return dot_product / (norm1 * norm2)
+
+     async def clear_cache(self, pattern: Optional[str] = None):
+         """Clear cache entries"""
+         if not self.redis_client:
+             return
+
+         try:
+             if pattern:
+                 keys = await self.redis_client.keys(f"{self.cache_prefix}:{pattern}")
+             else:
+                 keys = await self.redis_client.keys(f"{self.cache_prefix}:*")
+
+             if keys:
+                 await self.redis_client.delete(*keys)
+                 logger.info(f"Cleared {len(keys)} cache entries")
+         except Exception as e:
+             logger.warning(f"Failed to clear cache: {e}")
+
+     async def get_cache_stats(self) -> Dict[str, Any]:
+         """Get cache statistics"""
+         stats = {
+             "cache_hits": self.cache_hits,
+             "cache_misses": self.cache_misses,
+             "hit_rate": (
+                 self.cache_hits / (self.cache_hits + self.cache_misses)
+                 if (self.cache_hits + self.cache_misses) > 0
+                 else 0.0
+             ),
+             "redis_connected": self.redis_client is not None,
+             "vectorizer_type": (
+                 "rust" if self.use_rust and "mcli_rust" in str(type(self.vectorizer)) else "sklearn"
+             ),
+         }
+
+         if self.redis_client:
+             try:
+                 # Get Redis memory usage
+                 info = await self.redis_client.info("memory")
+                 stats["redis_memory_used"] = info.get("used_memory_human", "unknown")
+
+                 # Count cache entries
+                 keys = await self.redis_client.keys(f"{self.cache_prefix}:*")
+                 stats["cached_entries"] = len(keys)
+             except Exception as e:
+                 logger.warning(f"Failed to get Redis stats: {e}")
+
+         return stats
+
+     async def warm_cache(self, documents: List[str], common_queries: List[str]):
+         """Pre-populate cache with common queries"""
+         logger.info(
+             f"Warming cache with {len(common_queries)} queries and {len(documents)} documents"
+         )
+
+         # Fit the vectorizer if not already fitted
+         if not self.is_fitted:
+             await self.fit_transform(documents)
+
+         # Pre-compute similarities for common queries
+         for i, query in enumerate(common_queries):
+             await self.similarity_search(query, documents)
+             if i % 10 == 0:
+                 logger.info(f"Warmed {i + 1}/{len(common_queries)} queries")
+
+         logger.info("Cache warming completed")
+
+     async def close(self):
+         """Clean up resources"""
+         if self.redis_client:
+             await self.redis_client.close()
+
+         # Log final stats
+         stats = await self.get_cache_stats()
+         logger.info(f"TF-IDF Cache Stats: {stats}")
+
+
+ class SmartVectorizerManager:
+     """
+     Manager for multiple cached vectorizers with automatic model selection
+     """
+
+     def __init__(self, redis_url: str = "redis://localhost:6379"):
+         self.redis_url = redis_url
+         self.vectorizers: Dict[str, CachedTfIdfVectorizer] = {}
+         self.default_vectorizer = None
+
+     async def get_vectorizer(
+         self,
+         domain: str = "default",
+         max_features: int = 1000,
+         ngram_range: Tuple[int, int] = (1, 2),
+     ) -> CachedTfIdfVectorizer:
+         """Get or create a vectorizer for a specific domain"""
+         vectorizer_key = f"{domain}_{max_features}_{ngram_range[0]}_{ngram_range[1]}"
+
+         if vectorizer_key not in self.vectorizers:
+             vectorizer = CachedTfIdfVectorizer(
+                 redis_url=self.redis_url, cache_prefix=f"tfidf_{domain}", use_rust=True
+             )
+             await vectorizer.initialize()
+             self.vectorizers[vectorizer_key] = vectorizer
+
+             if self.default_vectorizer is None:
+                 self.default_vectorizer = vectorizer
+
+         return self.vectorizers[vectorizer_key]
+
+     async def search_commands(
+         self, query: str, commands: List[Dict[str, Any]], top_k: int = 10
+     ) -> List[Tuple[Dict[str, Any], float]]:
+         """Search commands using optimized vectorization"""
+         vectorizer = await self.get_vectorizer("commands")
+
+         # Extract searchable text from commands
+         documents = []
+         for cmd in commands:
+             text_parts = [
+                 cmd.get("name", ""),
+                 cmd.get("description", ""),
+                 " ".join(cmd.get("tags", [])),
+             ]
+             documents.append(" ".join(filter(None, text_parts)))
+
+         # Perform similarity search
+         results = await vectorizer.similarity_search(query, documents, top_k)
+
+         # Return commands with their similarity scores
+         return [(commands[idx], score) for idx, score in results]
+
+     async def close_all(self):
+         """Close all vectorizers"""
+         for vectorizer in self.vectorizers.values():
+             await vectorizer.close()
+         self.vectorizers.clear()
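
For context, here is a minimal usage sketch based solely on the class signatures in the diff above. It assumes the package is installed with numpy and sklearn available, and that Redis may or may not be reachable at the default redis://localhost:6379 (when it is not, the vectorizer logs a warning and runs uncached). The sample documents, query, and command dict are illustrative placeholders, not values from the package.

import asyncio

from mcli.lib.search.cached_vectorizer import CachedTfIdfVectorizer, SmartVectorizerManager


async def main():
    # Illustrative corpus; any list of short strings works.
    docs = [
        "list running daemon processes",
        "generate a git commit message with AI",
        "scrape politician trading disclosures",
    ]

    vectorizer = CachedTfIdfVectorizer(cache_ttl=3600, cache_prefix="demo")
    await vectorizer.initialize()  # connects to Redis if available, then picks Rust or sklearn

    await vectorizer.fit_transform(docs)  # TF-IDF fit; result cached under a hash of the corpus
    hits = await vectorizer.similarity_search("git commit", docs, top_k=2)
    for idx, score in hits:  # each hit is a (document index, cosine similarity) pair
        print(f"{score:.3f}  {docs[idx]}")

    print(await vectorizer.get_cache_stats())  # hits, misses, hit_rate, backend type
    await vectorizer.close()

    # The manager keys vectorizers by domain and reuses them across searches.
    manager = SmartVectorizerManager()
    commands = [{"name": "repo", "description": "clone and manage git repos", "tags": ["git"]}]
    print(await manager.search_commands("git", commands, top_k=1))
    await manager.close_all()


asyncio.run(main())

Note that similarity results are cached under a key derived from the query, the full document list, and top_k, so repeating a search against an unchanged corpus is served from Redis rather than recomputed.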