mcli_framework-7.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mcli-framework might be problematic.
- mcli/app/chat_cmd.py +42 -0
- mcli/app/commands_cmd.py +226 -0
- mcli/app/completion_cmd.py +216 -0
- mcli/app/completion_helpers.py +288 -0
- mcli/app/cron_test_cmd.py +697 -0
- mcli/app/logs_cmd.py +419 -0
- mcli/app/main.py +492 -0
- mcli/app/model/model.py +1060 -0
- mcli/app/model_cmd.py +227 -0
- mcli/app/redis_cmd.py +269 -0
- mcli/app/video/video.py +1114 -0
- mcli/app/visual_cmd.py +303 -0
- mcli/chat/chat.py +2409 -0
- mcli/chat/command_rag.py +514 -0
- mcli/chat/enhanced_chat.py +652 -0
- mcli/chat/system_controller.py +1010 -0
- mcli/chat/system_integration.py +1016 -0
- mcli/cli.py +25 -0
- mcli/config.toml +20 -0
- mcli/lib/api/api.py +586 -0
- mcli/lib/api/daemon_client.py +203 -0
- mcli/lib/api/daemon_client_local.py +44 -0
- mcli/lib/api/daemon_decorator.py +217 -0
- mcli/lib/api/mcli_decorators.py +1032 -0
- mcli/lib/auth/auth.py +85 -0
- mcli/lib/auth/aws_manager.py +85 -0
- mcli/lib/auth/azure_manager.py +91 -0
- mcli/lib/auth/credential_manager.py +192 -0
- mcli/lib/auth/gcp_manager.py +93 -0
- mcli/lib/auth/key_manager.py +117 -0
- mcli/lib/auth/mcli_manager.py +93 -0
- mcli/lib/auth/token_manager.py +75 -0
- mcli/lib/auth/token_util.py +1011 -0
- mcli/lib/config/config.py +47 -0
- mcli/lib/discovery/__init__.py +1 -0
- mcli/lib/discovery/command_discovery.py +274 -0
- mcli/lib/erd/erd.py +1345 -0
- mcli/lib/erd/generate_graph.py +453 -0
- mcli/lib/files/files.py +76 -0
- mcli/lib/fs/fs.py +109 -0
- mcli/lib/lib.py +29 -0
- mcli/lib/logger/logger.py +611 -0
- mcli/lib/performance/optimizer.py +409 -0
- mcli/lib/performance/rust_bridge.py +502 -0
- mcli/lib/performance/uvloop_config.py +154 -0
- mcli/lib/pickles/pickles.py +50 -0
- mcli/lib/search/cached_vectorizer.py +479 -0
- mcli/lib/services/data_pipeline.py +460 -0
- mcli/lib/services/lsh_client.py +441 -0
- mcli/lib/services/redis_service.py +387 -0
- mcli/lib/shell/shell.py +137 -0
- mcli/lib/toml/toml.py +33 -0
- mcli/lib/ui/styling.py +47 -0
- mcli/lib/ui/visual_effects.py +634 -0
- mcli/lib/watcher/watcher.py +185 -0
- mcli/ml/api/app.py +215 -0
- mcli/ml/api/middleware.py +224 -0
- mcli/ml/api/routers/admin_router.py +12 -0
- mcli/ml/api/routers/auth_router.py +244 -0
- mcli/ml/api/routers/backtest_router.py +12 -0
- mcli/ml/api/routers/data_router.py +12 -0
- mcli/ml/api/routers/model_router.py +302 -0
- mcli/ml/api/routers/monitoring_router.py +12 -0
- mcli/ml/api/routers/portfolio_router.py +12 -0
- mcli/ml/api/routers/prediction_router.py +267 -0
- mcli/ml/api/routers/trade_router.py +12 -0
- mcli/ml/api/routers/websocket_router.py +76 -0
- mcli/ml/api/schemas.py +64 -0
- mcli/ml/auth/auth_manager.py +425 -0
- mcli/ml/auth/models.py +154 -0
- mcli/ml/auth/permissions.py +302 -0
- mcli/ml/backtesting/backtest_engine.py +502 -0
- mcli/ml/backtesting/performance_metrics.py +393 -0
- mcli/ml/cache.py +400 -0
- mcli/ml/cli/main.py +398 -0
- mcli/ml/config/settings.py +394 -0
- mcli/ml/configs/dvc_config.py +230 -0
- mcli/ml/configs/mlflow_config.py +131 -0
- mcli/ml/configs/mlops_manager.py +293 -0
- mcli/ml/dashboard/app.py +532 -0
- mcli/ml/dashboard/app_integrated.py +738 -0
- mcli/ml/dashboard/app_supabase.py +560 -0
- mcli/ml/dashboard/app_training.py +615 -0
- mcli/ml/dashboard/cli.py +51 -0
- mcli/ml/data_ingestion/api_connectors.py +501 -0
- mcli/ml/data_ingestion/data_pipeline.py +567 -0
- mcli/ml/data_ingestion/stream_processor.py +512 -0
- mcli/ml/database/migrations/env.py +94 -0
- mcli/ml/database/models.py +667 -0
- mcli/ml/database/session.py +200 -0
- mcli/ml/experimentation/ab_testing.py +845 -0
- mcli/ml/features/ensemble_features.py +607 -0
- mcli/ml/features/political_features.py +676 -0
- mcli/ml/features/recommendation_engine.py +809 -0
- mcli/ml/features/stock_features.py +573 -0
- mcli/ml/features/test_feature_engineering.py +346 -0
- mcli/ml/logging.py +85 -0
- mcli/ml/mlops/data_versioning.py +518 -0
- mcli/ml/mlops/experiment_tracker.py +377 -0
- mcli/ml/mlops/model_serving.py +481 -0
- mcli/ml/mlops/pipeline_orchestrator.py +614 -0
- mcli/ml/models/base_models.py +324 -0
- mcli/ml/models/ensemble_models.py +675 -0
- mcli/ml/models/recommendation_models.py +474 -0
- mcli/ml/models/test_models.py +487 -0
- mcli/ml/monitoring/drift_detection.py +676 -0
- mcli/ml/monitoring/metrics.py +45 -0
- mcli/ml/optimization/portfolio_optimizer.py +834 -0
- mcli/ml/preprocessing/data_cleaners.py +451 -0
- mcli/ml/preprocessing/feature_extractors.py +491 -0
- mcli/ml/preprocessing/ml_pipeline.py +382 -0
- mcli/ml/preprocessing/politician_trading_preprocessor.py +569 -0
- mcli/ml/preprocessing/test_preprocessing.py +294 -0
- mcli/ml/scripts/populate_sample_data.py +200 -0
- mcli/ml/tasks.py +400 -0
- mcli/ml/tests/test_integration.py +429 -0
- mcli/ml/tests/test_training_dashboard.py +387 -0
- mcli/public/oi/oi.py +15 -0
- mcli/public/public.py +4 -0
- mcli/self/self_cmd.py +1246 -0
- mcli/workflow/daemon/api_daemon.py +800 -0
- mcli/workflow/daemon/async_command_database.py +681 -0
- mcli/workflow/daemon/async_process_manager.py +591 -0
- mcli/workflow/daemon/client.py +530 -0
- mcli/workflow/daemon/commands.py +1196 -0
- mcli/workflow/daemon/daemon.py +905 -0
- mcli/workflow/daemon/daemon_api.py +59 -0
- mcli/workflow/daemon/enhanced_daemon.py +571 -0
- mcli/workflow/daemon/process_cli.py +244 -0
- mcli/workflow/daemon/process_manager.py +439 -0
- mcli/workflow/daemon/test_daemon.py +275 -0
- mcli/workflow/dashboard/dashboard_cmd.py +113 -0
- mcli/workflow/docker/docker.py +0 -0
- mcli/workflow/file/file.py +100 -0
- mcli/workflow/gcloud/config.toml +21 -0
- mcli/workflow/gcloud/gcloud.py +58 -0
- mcli/workflow/git_commit/ai_service.py +328 -0
- mcli/workflow/git_commit/commands.py +430 -0
- mcli/workflow/lsh_integration.py +355 -0
- mcli/workflow/model_service/client.py +594 -0
- mcli/workflow/model_service/download_and_run_efficient_models.py +288 -0
- mcli/workflow/model_service/lightweight_embedder.py +397 -0
- mcli/workflow/model_service/lightweight_model_server.py +714 -0
- mcli/workflow/model_service/lightweight_test.py +241 -0
- mcli/workflow/model_service/model_service.py +1955 -0
- mcli/workflow/model_service/ollama_efficient_runner.py +425 -0
- mcli/workflow/model_service/pdf_processor.py +386 -0
- mcli/workflow/model_service/test_efficient_runner.py +234 -0
- mcli/workflow/model_service/test_example.py +315 -0
- mcli/workflow/model_service/test_integration.py +131 -0
- mcli/workflow/model_service/test_new_features.py +149 -0
- mcli/workflow/openai/openai.py +99 -0
- mcli/workflow/politician_trading/commands.py +1790 -0
- mcli/workflow/politician_trading/config.py +134 -0
- mcli/workflow/politician_trading/connectivity.py +490 -0
- mcli/workflow/politician_trading/data_sources.py +395 -0
- mcli/workflow/politician_trading/database.py +410 -0
- mcli/workflow/politician_trading/demo.py +248 -0
- mcli/workflow/politician_trading/models.py +165 -0
- mcli/workflow/politician_trading/monitoring.py +413 -0
- mcli/workflow/politician_trading/scrapers.py +966 -0
- mcli/workflow/politician_trading/scrapers_california.py +412 -0
- mcli/workflow/politician_trading/scrapers_eu.py +377 -0
- mcli/workflow/politician_trading/scrapers_uk.py +350 -0
- mcli/workflow/politician_trading/scrapers_us_states.py +438 -0
- mcli/workflow/politician_trading/supabase_functions.py +354 -0
- mcli/workflow/politician_trading/workflow.py +852 -0
- mcli/workflow/registry/registry.py +180 -0
- mcli/workflow/repo/repo.py +223 -0
- mcli/workflow/scheduler/commands.py +493 -0
- mcli/workflow/scheduler/cron_parser.py +238 -0
- mcli/workflow/scheduler/job.py +182 -0
- mcli/workflow/scheduler/monitor.py +139 -0
- mcli/workflow/scheduler/persistence.py +324 -0
- mcli/workflow/scheduler/scheduler.py +679 -0
- mcli/workflow/sync/sync_cmd.py +437 -0
- mcli/workflow/sync/test_cmd.py +314 -0
- mcli/workflow/videos/videos.py +242 -0
- mcli/workflow/wakatime/wakatime.py +11 -0
- mcli/workflow/workflow.py +37 -0
- mcli_framework-7.0.0.dist-info/METADATA +479 -0
- mcli_framework-7.0.0.dist-info/RECORD +186 -0
- mcli_framework-7.0.0.dist-info/WHEEL +5 -0
- mcli_framework-7.0.0.dist-info/entry_points.txt +7 -0
- mcli_framework-7.0.0.dist-info/licenses/LICENSE +21 -0
- mcli_framework-7.0.0.dist-info/top_level.txt +1 -0
mcli/lib/search/cached_vectorizer.py
@@ -0,0 +1,479 @@
+"""
+Cached TF-IDF Vectorizer with Redis support for high-performance text similarity
+"""
+
+import asyncio
+import hashlib
+import json
+import pickle
+from datetime import datetime, timedelta
+from typing import Any, Dict, List, Optional, Tuple
+
+import numpy as np
+
+# Optional redis import - gracefully handle if not installed
+try:
+    import redis.asyncio as redis
+    REDIS_AVAILABLE = True
+except ImportError:
+    REDIS_AVAILABLE = False
+    redis = None  # type: ignore
+
+from mcli.lib.logger.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+class CachedTfIdfVectorizer:
+    """
+    TF-IDF Vectorizer with Redis caching for improved performance.
+    Falls back to Rust implementation when available, otherwise uses sklearn.
+    """
+
+    def __init__(
+        self,
+        redis_url: str = "redis://localhost:6379",
+        cache_ttl: int = 3600,
+        cache_prefix: str = "tfidf",
+        use_rust: bool = True,
+    ):
+        self.redis_url = redis_url
+        self.cache_ttl = cache_ttl
+        self.cache_prefix = cache_prefix
+        self.use_rust = use_rust
+
+        self.redis_client: Optional[Any] = None  # redis.Redis when available
+        self.vectorizer = None
+        self.is_fitted = False
+
+        # Cache stats
+        self.cache_hits = 0
+        self.cache_misses = 0
+
+    async def initialize(self):
+        """Initialize Redis connection and vectorizer"""
+        await self._init_redis()
+        await self._init_vectorizer()
+
+    async def _init_redis(self):
+        """Initialize Redis connection"""
+        if not REDIS_AVAILABLE:
+            logger.warning("Redis is not installed. Caching disabled.")
+            self.redis_client = None
+            return
+
+        try:
+            # Try to ensure Redis is running through the service manager
+            try:
+                from mcli.lib.services.redis_service import ensure_redis_running
+
+                await ensure_redis_running()
+            except ImportError:
+                logger.debug("Redis service manager not available")
+
+            self.redis_client = redis.from_url(self.redis_url)  # type: ignore
+            await self.redis_client.ping()
+            logger.info("Connected to Redis for TF-IDF caching")
+        except Exception as e:
+            logger.warning(f"Failed to connect to Redis: {e}. Caching disabled.")
+            self.redis_client = None
+
+    async def _init_vectorizer(self):
+        """Initialize the appropriate vectorizer implementation"""
+        if self.use_rust:
+            try:
+                # Try to use Rust implementation
+                import mcli_rust
+
+                self.vectorizer = mcli_rust.TfIdfVectorizer(
+                    max_features=1000, min_df=1, max_df=0.95, ngram_range=(1, 2)
+                )
+                logger.info("Using Rust TF-IDF vectorizer for enhanced performance")
+                return
+            except ImportError:
+                logger.warning("Rust vectorizer not available, falling back to sklearn")
+
+        # Fallback to sklearn
+        try:
+            from sklearn.feature_extraction.text import TfidfVectorizer
+
+            self.vectorizer = TfidfVectorizer(
+                max_features=1000, stop_words="english", ngram_range=(1, 2), min_df=1, max_df=0.95
+            )
+            logger.info("Using sklearn TF-IDF vectorizer")
+        except ImportError:
+            raise RuntimeError("Neither Rust nor sklearn TF-IDF implementation available")
+
+    async def fit_transform(self, documents: List[str]) -> np.ndarray:
+        """Fit the vectorizer and transform documents with caching"""
+        # Generate cache key for the document set
+        cache_key = self._generate_cache_key(documents, "fit_transform")
+
+        # Try to get from cache
+        cached_result = await self._get_from_cache(cache_key)
+        if cached_result is not None:
+            self.cache_hits += 1
+            vectors, feature_names = cached_result
+            self.is_fitted = True
+            return vectors
+
+        self.cache_misses += 1
+
+        # Compute TF-IDF vectors
+        if hasattr(self.vectorizer, "fit_transform") and hasattr(
+            self.vectorizer, "get_feature_names_out"
+        ):
+            # sklearn implementation
+            vectors = self.vectorizer.fit_transform(documents).toarray()
+            feature_names = self.vectorizer.get_feature_names_out().tolist()
+        else:
+            # Rust implementation
+            result = self.vectorizer.fit_transform(documents)
+            if isinstance(result, list):
+                vectors = np.array(result)
+            else:
+                vectors = result.toarray() if hasattr(result, "toarray") else np.array(result)
+
+            if hasattr(self.vectorizer, "get_feature_names"):
+                feature_names = self.vectorizer.get_feature_names()
+            else:
+                feature_names = [
+                    f"feature_{i}"
+                    for i in range(vectors.shape[1] if len(vectors.shape) > 1 else len(vectors))
+                ]
+
+        self.is_fitted = True
+
+        # Cache the result
+        await self._cache_result(cache_key, (vectors, feature_names))
+
+        return vectors
+
+    async def transform(self, documents: List[str]) -> np.ndarray:
+        """Transform documents using fitted vectorizer with caching"""
+        if not self.is_fitted:
+            raise ValueError("Vectorizer must be fitted before transform")
+
+        # Generate cache key for transformation
+        cache_key = self._generate_cache_key(documents, "transform")
+
+        # Try to get from cache
+        cached_result = await self._get_from_cache(cache_key)
+        if cached_result is not None:
+            self.cache_hits += 1
+            return cached_result
+
+        self.cache_misses += 1
+
+        # Compute vectors
+        if hasattr(self.vectorizer, "transform") and hasattr(
+            self.vectorizer, "get_feature_names_out"
+        ):
+            # sklearn implementation
+            vectors = self.vectorizer.transform(documents).toarray()
+        else:
+            # Rust implementation
+            result = self.vectorizer.transform(documents)
+            if isinstance(result, list):
+                vectors = np.array(result)
+            else:
+                vectors = result.toarray() if hasattr(result, "toarray") else np.array(result)
+
+        # Cache the result
+        await self._cache_result(cache_key, vectors)
+
+        return vectors
+
+    async def similarity_search(
+        self, query: str, documents: List[str], top_k: int = 10
+    ) -> List[Tuple[int, float]]:
+        """Perform similarity search with caching"""
+        # Generate cache key for similarity search
+        search_data = {"query": query, "documents": documents, "top_k": top_k}
+        cache_key = self._generate_cache_key_from_dict(search_data, "similarity")
+
+        # Try to get from cache
+        cached_result = await self._get_from_cache(cache_key)
+        if cached_result is not None:
+            self.cache_hits += 1
+            return cached_result
+
+        self.cache_misses += 1
+
+        # Compute similarity
+        if hasattr(self.vectorizer, "similarity"):
+            # Rust implementation with built-in similarity
+            similarities = self.vectorizer.similarity(query, documents)
+        else:
+            # sklearn implementation - need to compute manually
+            if not self.is_fitted:
+                await self.fit_transform(documents)
+
+            query_vector = await self.transform([query])
+            doc_vectors = await self.transform(documents)
+
+            # Compute cosine similarity
+            similarities = []
+            for doc_vector in doc_vectors:
+                similarity = self._cosine_similarity(query_vector[0], doc_vector)
+                similarities.append(similarity)
+
+        # Get top-k results
+        indexed_similarities = [(i, sim) for i, sim in enumerate(similarities)]
+        indexed_similarities.sort(key=lambda x: x[1], reverse=True)
+        results = indexed_similarities[:top_k]
+
+        # Cache the result
+        await self._cache_result(cache_key, results)
+
+        return results
+
+    async def batch_similarity_search(
+        self, queries: List[str], documents: List[str], top_k: int = 10
+    ) -> List[List[Tuple[int, float]]]:
+        """Perform batch similarity search for multiple queries"""
+        # Try to use cached individual results first
+        results = []
+        uncached_queries = []
+        uncached_indices = []
+
+        for i, query in enumerate(queries):
+            cache_key = self._generate_cache_key_from_dict(
+                {"query": query, "documents": documents, "top_k": top_k}, "similarity"
+            )
+
+            cached_result = await self._get_from_cache(cache_key)
+            if cached_result is not None:
+                self.cache_hits += 1
+                results.append(cached_result)
+            else:
+                self.cache_misses += 1
+                results.append(None)
+                uncached_queries.append(query)
+                uncached_indices.append(i)
+
+        # Process uncached queries in batch
+        if uncached_queries:
+            if hasattr(self.vectorizer, "similarity"):
+                # Rust implementation might support batch processing
+                for j, query in enumerate(uncached_queries):
+                    similarities = self.vectorizer.similarity(query, documents)
+                    indexed_similarities = [(i, sim) for i, sim in enumerate(similarities)]
+                    indexed_similarities.sort(key=lambda x: x[1], reverse=True)
+                    query_results = indexed_similarities[:top_k]
+
+                    # Update results and cache
+                    results[uncached_indices[j]] = query_results
+                    cache_key = self._generate_cache_key_from_dict(
+                        {"query": query, "documents": documents, "top_k": top_k}, "similarity"
+                    )
+                    await self._cache_result(cache_key, query_results)
+            else:
+                # sklearn implementation
+                if not self.is_fitted:
+                    await self.fit_transform(documents)
+
+                query_vectors = await self.transform(uncached_queries)
+                doc_vectors = await self.transform(documents)
+
+                for j, query_vector in enumerate(query_vectors):
+                    similarities = []
+                    for doc_vector in doc_vectors:
+                        similarity = self._cosine_similarity(query_vector, doc_vector)
+                        similarities.append(similarity)
+
+                    indexed_similarities = [(i, sim) for i, sim in enumerate(similarities)]
+                    indexed_similarities.sort(key=lambda x: x[1], reverse=True)
+                    query_results = indexed_similarities[:top_k]
+
+                    # Update results and cache
+                    results[uncached_indices[j]] = query_results
+                    cache_key = self._generate_cache_key_from_dict(
+                        {"query": uncached_queries[j], "documents": documents, "top_k": top_k},
+                        "similarity",
+                    )
+                    await self._cache_result(cache_key, query_results)
+
+        return results
+
+    def _generate_cache_key(self, documents: List[str], operation: str) -> str:
+        """Generate a cache key for a list of documents and operation"""
+        content = f"{operation}:{':'.join(documents)}"
+        hash_obj = hashlib.sha256(content.encode("utf-8"))
+        return f"{self.cache_prefix}:{hash_obj.hexdigest()[:16]}"
+
+    def _generate_cache_key_from_dict(self, data: Dict[str, Any], operation: str) -> str:
+        """Generate a cache key from a dictionary"""
+        content = f"{operation}:{json.dumps(data, sort_keys=True)}"
+        hash_obj = hashlib.sha256(content.encode("utf-8"))
+        return f"{self.cache_prefix}:{hash_obj.hexdigest()[:16]}"
+
+    async def _get_from_cache(self, cache_key: str) -> Optional[Any]:
+        """Get result from Redis cache"""
+        if not self.redis_client:
+            return None
+
+        try:
+            cached_data = await self.redis_client.get(cache_key)
+            if cached_data:
+                return pickle.loads(cached_data)
+        except Exception as e:
+            logger.warning(f"Failed to get from cache: {e}")
+
+        return None
+
+    async def _cache_result(self, cache_key: str, result: Any):
+        """Cache result in Redis"""
+        if not self.redis_client:
+            return
+
+        try:
+            serialized_result = pickle.dumps(result)
+            await self.redis_client.setex(cache_key, self.cache_ttl, serialized_result)
+        except Exception as e:
+            logger.warning(f"Failed to cache result: {e}")
+
+    def _cosine_similarity(self, vec1: np.ndarray, vec2: np.ndarray) -> float:
+        """Compute cosine similarity between two vectors"""
+        dot_product = np.dot(vec1, vec2)
+        norm1 = np.linalg.norm(vec1)
+        norm2 = np.linalg.norm(vec2)
+
+        if norm1 == 0 or norm2 == 0:
+            return 0.0
+
+        return dot_product / (norm1 * norm2)
+
+    async def clear_cache(self, pattern: Optional[str] = None):
+        """Clear cache entries"""
+        if not self.redis_client:
+            return
+
+        try:
+            if pattern:
+                keys = await self.redis_client.keys(f"{self.cache_prefix}:{pattern}")
+            else:
+                keys = await self.redis_client.keys(f"{self.cache_prefix}:*")
+
+            if keys:
+                await self.redis_client.delete(*keys)
+                logger.info(f"Cleared {len(keys)} cache entries")
+        except Exception as e:
+            logger.warning(f"Failed to clear cache: {e}")
+
+    async def get_cache_stats(self) -> Dict[str, Any]:
+        """Get cache statistics"""
+        stats = {
+            "cache_hits": self.cache_hits,
+            "cache_misses": self.cache_misses,
+            "hit_rate": (
+                self.cache_hits / (self.cache_hits + self.cache_misses)
+                if (self.cache_hits + self.cache_misses) > 0
+                else 0.0
+            ),
+            "redis_connected": self.redis_client is not None,
+            "vectorizer_type": (
+                "rust" if self.use_rust and "mcli_rust" in str(type(self.vectorizer)) else "sklearn"
+            ),
+        }
+
+        if self.redis_client:
+            try:
+                # Get Redis memory usage
+                info = await self.redis_client.info("memory")
+                stats["redis_memory_used"] = info.get("used_memory_human", "unknown")
+
+                # Count cache entries
+                keys = await self.redis_client.keys(f"{self.cache_prefix}:*")
+                stats["cached_entries"] = len(keys)
+            except Exception as e:
+                logger.warning(f"Failed to get Redis stats: {e}")
+
+        return stats
+
+    async def warm_cache(self, documents: List[str], common_queries: List[str]):
+        """Pre-populate cache with common queries"""
+        logger.info(
+            f"Warming cache with {len(common_queries)} queries and {len(documents)} documents"
+        )
+
+        # Fit the vectorizer if not already fitted
+        if not self.is_fitted:
+            await self.fit_transform(documents)
+
+        # Pre-compute similarities for common queries
+        for i, query in enumerate(common_queries):
+            await self.similarity_search(query, documents)
+            if i % 10 == 0:
+                logger.info(f"Warmed {i + 1}/{len(common_queries)} queries")
+
+        logger.info("Cache warming completed")
+
+    async def close(self):
+        """Clean up resources"""
+        if self.redis_client:
+            await self.redis_client.close()
+
+        # Print final stats
+        stats = await self.get_cache_stats()
+        logger.info(f"TF-IDF Cache Stats: {stats}")
+
+
+class SmartVectorizerManager:
+    """
+    Manager for multiple cached vectorizers with automatic model selection
+    """
+
+    def __init__(self, redis_url: str = "redis://localhost:6379"):
+        self.redis_url = redis_url
+        self.vectorizers: Dict[str, CachedTfIdfVectorizer] = {}
+        self.default_vectorizer = None
+
+    async def get_vectorizer(
+        self,
+        domain: str = "default",
+        max_features: int = 1000,
+        ngram_range: Tuple[int, int] = (1, 2),
+    ) -> CachedTfIdfVectorizer:
+        """Get or create a vectorizer for a specific domain"""
+        vectorizer_key = f"{domain}_{max_features}_{ngram_range[0]}_{ngram_range[1]}"
+
+        if vectorizer_key not in self.vectorizers:
+            vectorizer = CachedTfIdfVectorizer(
+                redis_url=self.redis_url, cache_prefix=f"tfidf_{domain}", use_rust=True
+            )
+            await vectorizer.initialize()
+            self.vectorizers[vectorizer_key] = vectorizer
+
+            if self.default_vectorizer is None:
+                self.default_vectorizer = vectorizer
+
+        return self.vectorizers[vectorizer_key]
+
+    async def search_commands(
+        self, query: str, commands: List[Dict[str, Any]], top_k: int = 10
+    ) -> List[Tuple[Dict[str, Any], float]]:
+        """Search commands using optimized vectorization"""
+        vectorizer = await self.get_vectorizer("commands")
+
+        # Extract searchable text from commands
+        documents = []
+        for cmd in commands:
+            text_parts = [
+                cmd.get("name", ""),
+                cmd.get("description", ""),
+                " ".join(cmd.get("tags", [])),
+            ]
+            documents.append(" ".join(filter(None, text_parts)))
+
+        # Perform similarity search
+        results = await vectorizer.similarity_search(query, documents, top_k)
+
+        # Return commands with their similarity scores
+        return [(commands[idx], score) for idx, score in results]
+
+    async def close_all(self):
+        """Close all vectorizers"""
+        for vectorizer in self.vectorizers.values():
+            await vectorizer.close()
+        self.vectorizers.clear()