mcli-framework 7.0.0 (mcli_framework-7.0.0-py3-none-any.whl)

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mcli-framework might be problematic.

Files changed (186)
  1. mcli/app/chat_cmd.py +42 -0
  2. mcli/app/commands_cmd.py +226 -0
  3. mcli/app/completion_cmd.py +216 -0
  4. mcli/app/completion_helpers.py +288 -0
  5. mcli/app/cron_test_cmd.py +697 -0
  6. mcli/app/logs_cmd.py +419 -0
  7. mcli/app/main.py +492 -0
  8. mcli/app/model/model.py +1060 -0
  9. mcli/app/model_cmd.py +227 -0
  10. mcli/app/redis_cmd.py +269 -0
  11. mcli/app/video/video.py +1114 -0
  12. mcli/app/visual_cmd.py +303 -0
  13. mcli/chat/chat.py +2409 -0
  14. mcli/chat/command_rag.py +514 -0
  15. mcli/chat/enhanced_chat.py +652 -0
  16. mcli/chat/system_controller.py +1010 -0
  17. mcli/chat/system_integration.py +1016 -0
  18. mcli/cli.py +25 -0
  19. mcli/config.toml +20 -0
  20. mcli/lib/api/api.py +586 -0
  21. mcli/lib/api/daemon_client.py +203 -0
  22. mcli/lib/api/daemon_client_local.py +44 -0
  23. mcli/lib/api/daemon_decorator.py +217 -0
  24. mcli/lib/api/mcli_decorators.py +1032 -0
  25. mcli/lib/auth/auth.py +85 -0
  26. mcli/lib/auth/aws_manager.py +85 -0
  27. mcli/lib/auth/azure_manager.py +91 -0
  28. mcli/lib/auth/credential_manager.py +192 -0
  29. mcli/lib/auth/gcp_manager.py +93 -0
  30. mcli/lib/auth/key_manager.py +117 -0
  31. mcli/lib/auth/mcli_manager.py +93 -0
  32. mcli/lib/auth/token_manager.py +75 -0
  33. mcli/lib/auth/token_util.py +1011 -0
  34. mcli/lib/config/config.py +47 -0
  35. mcli/lib/discovery/__init__.py +1 -0
  36. mcli/lib/discovery/command_discovery.py +274 -0
  37. mcli/lib/erd/erd.py +1345 -0
  38. mcli/lib/erd/generate_graph.py +453 -0
  39. mcli/lib/files/files.py +76 -0
  40. mcli/lib/fs/fs.py +109 -0
  41. mcli/lib/lib.py +29 -0
  42. mcli/lib/logger/logger.py +611 -0
  43. mcli/lib/performance/optimizer.py +409 -0
  44. mcli/lib/performance/rust_bridge.py +502 -0
  45. mcli/lib/performance/uvloop_config.py +154 -0
  46. mcli/lib/pickles/pickles.py +50 -0
  47. mcli/lib/search/cached_vectorizer.py +479 -0
  48. mcli/lib/services/data_pipeline.py +460 -0
  49. mcli/lib/services/lsh_client.py +441 -0
  50. mcli/lib/services/redis_service.py +387 -0
  51. mcli/lib/shell/shell.py +137 -0
  52. mcli/lib/toml/toml.py +33 -0
  53. mcli/lib/ui/styling.py +47 -0
  54. mcli/lib/ui/visual_effects.py +634 -0
  55. mcli/lib/watcher/watcher.py +185 -0
  56. mcli/ml/api/app.py +215 -0
  57. mcli/ml/api/middleware.py +224 -0
  58. mcli/ml/api/routers/admin_router.py +12 -0
  59. mcli/ml/api/routers/auth_router.py +244 -0
  60. mcli/ml/api/routers/backtest_router.py +12 -0
  61. mcli/ml/api/routers/data_router.py +12 -0
  62. mcli/ml/api/routers/model_router.py +302 -0
  63. mcli/ml/api/routers/monitoring_router.py +12 -0
  64. mcli/ml/api/routers/portfolio_router.py +12 -0
  65. mcli/ml/api/routers/prediction_router.py +267 -0
  66. mcli/ml/api/routers/trade_router.py +12 -0
  67. mcli/ml/api/routers/websocket_router.py +76 -0
  68. mcli/ml/api/schemas.py +64 -0
  69. mcli/ml/auth/auth_manager.py +425 -0
  70. mcli/ml/auth/models.py +154 -0
  71. mcli/ml/auth/permissions.py +302 -0
  72. mcli/ml/backtesting/backtest_engine.py +502 -0
  73. mcli/ml/backtesting/performance_metrics.py +393 -0
  74. mcli/ml/cache.py +400 -0
  75. mcli/ml/cli/main.py +398 -0
  76. mcli/ml/config/settings.py +394 -0
  77. mcli/ml/configs/dvc_config.py +230 -0
  78. mcli/ml/configs/mlflow_config.py +131 -0
  79. mcli/ml/configs/mlops_manager.py +293 -0
  80. mcli/ml/dashboard/app.py +532 -0
  81. mcli/ml/dashboard/app_integrated.py +738 -0
  82. mcli/ml/dashboard/app_supabase.py +560 -0
  83. mcli/ml/dashboard/app_training.py +615 -0
  84. mcli/ml/dashboard/cli.py +51 -0
  85. mcli/ml/data_ingestion/api_connectors.py +501 -0
  86. mcli/ml/data_ingestion/data_pipeline.py +567 -0
  87. mcli/ml/data_ingestion/stream_processor.py +512 -0
  88. mcli/ml/database/migrations/env.py +94 -0
  89. mcli/ml/database/models.py +667 -0
  90. mcli/ml/database/session.py +200 -0
  91. mcli/ml/experimentation/ab_testing.py +845 -0
  92. mcli/ml/features/ensemble_features.py +607 -0
  93. mcli/ml/features/political_features.py +676 -0
  94. mcli/ml/features/recommendation_engine.py +809 -0
  95. mcli/ml/features/stock_features.py +573 -0
  96. mcli/ml/features/test_feature_engineering.py +346 -0
  97. mcli/ml/logging.py +85 -0
  98. mcli/ml/mlops/data_versioning.py +518 -0
  99. mcli/ml/mlops/experiment_tracker.py +377 -0
  100. mcli/ml/mlops/model_serving.py +481 -0
  101. mcli/ml/mlops/pipeline_orchestrator.py +614 -0
  102. mcli/ml/models/base_models.py +324 -0
  103. mcli/ml/models/ensemble_models.py +675 -0
  104. mcli/ml/models/recommendation_models.py +474 -0
  105. mcli/ml/models/test_models.py +487 -0
  106. mcli/ml/monitoring/drift_detection.py +676 -0
  107. mcli/ml/monitoring/metrics.py +45 -0
  108. mcli/ml/optimization/portfolio_optimizer.py +834 -0
  109. mcli/ml/preprocessing/data_cleaners.py +451 -0
  110. mcli/ml/preprocessing/feature_extractors.py +491 -0
  111. mcli/ml/preprocessing/ml_pipeline.py +382 -0
  112. mcli/ml/preprocessing/politician_trading_preprocessor.py +569 -0
  113. mcli/ml/preprocessing/test_preprocessing.py +294 -0
  114. mcli/ml/scripts/populate_sample_data.py +200 -0
  115. mcli/ml/tasks.py +400 -0
  116. mcli/ml/tests/test_integration.py +429 -0
  117. mcli/ml/tests/test_training_dashboard.py +387 -0
  118. mcli/public/oi/oi.py +15 -0
  119. mcli/public/public.py +4 -0
  120. mcli/self/self_cmd.py +1246 -0
  121. mcli/workflow/daemon/api_daemon.py +800 -0
  122. mcli/workflow/daemon/async_command_database.py +681 -0
  123. mcli/workflow/daemon/async_process_manager.py +591 -0
  124. mcli/workflow/daemon/client.py +530 -0
  125. mcli/workflow/daemon/commands.py +1196 -0
  126. mcli/workflow/daemon/daemon.py +905 -0
  127. mcli/workflow/daemon/daemon_api.py +59 -0
  128. mcli/workflow/daemon/enhanced_daemon.py +571 -0
  129. mcli/workflow/daemon/process_cli.py +244 -0
  130. mcli/workflow/daemon/process_manager.py +439 -0
  131. mcli/workflow/daemon/test_daemon.py +275 -0
  132. mcli/workflow/dashboard/dashboard_cmd.py +113 -0
  133. mcli/workflow/docker/docker.py +0 -0
  134. mcli/workflow/file/file.py +100 -0
  135. mcli/workflow/gcloud/config.toml +21 -0
  136. mcli/workflow/gcloud/gcloud.py +58 -0
  137. mcli/workflow/git_commit/ai_service.py +328 -0
  138. mcli/workflow/git_commit/commands.py +430 -0
  139. mcli/workflow/lsh_integration.py +355 -0
  140. mcli/workflow/model_service/client.py +594 -0
  141. mcli/workflow/model_service/download_and_run_efficient_models.py +288 -0
  142. mcli/workflow/model_service/lightweight_embedder.py +397 -0
  143. mcli/workflow/model_service/lightweight_model_server.py +714 -0
  144. mcli/workflow/model_service/lightweight_test.py +241 -0
  145. mcli/workflow/model_service/model_service.py +1955 -0
  146. mcli/workflow/model_service/ollama_efficient_runner.py +425 -0
  147. mcli/workflow/model_service/pdf_processor.py +386 -0
  148. mcli/workflow/model_service/test_efficient_runner.py +234 -0
  149. mcli/workflow/model_service/test_example.py +315 -0
  150. mcli/workflow/model_service/test_integration.py +131 -0
  151. mcli/workflow/model_service/test_new_features.py +149 -0
  152. mcli/workflow/openai/openai.py +99 -0
  153. mcli/workflow/politician_trading/commands.py +1790 -0
  154. mcli/workflow/politician_trading/config.py +134 -0
  155. mcli/workflow/politician_trading/connectivity.py +490 -0
  156. mcli/workflow/politician_trading/data_sources.py +395 -0
  157. mcli/workflow/politician_trading/database.py +410 -0
  158. mcli/workflow/politician_trading/demo.py +248 -0
  159. mcli/workflow/politician_trading/models.py +165 -0
  160. mcli/workflow/politician_trading/monitoring.py +413 -0
  161. mcli/workflow/politician_trading/scrapers.py +966 -0
  162. mcli/workflow/politician_trading/scrapers_california.py +412 -0
  163. mcli/workflow/politician_trading/scrapers_eu.py +377 -0
  164. mcli/workflow/politician_trading/scrapers_uk.py +350 -0
  165. mcli/workflow/politician_trading/scrapers_us_states.py +438 -0
  166. mcli/workflow/politician_trading/supabase_functions.py +354 -0
  167. mcli/workflow/politician_trading/workflow.py +852 -0
  168. mcli/workflow/registry/registry.py +180 -0
  169. mcli/workflow/repo/repo.py +223 -0
  170. mcli/workflow/scheduler/commands.py +493 -0
  171. mcli/workflow/scheduler/cron_parser.py +238 -0
  172. mcli/workflow/scheduler/job.py +182 -0
  173. mcli/workflow/scheduler/monitor.py +139 -0
  174. mcli/workflow/scheduler/persistence.py +324 -0
  175. mcli/workflow/scheduler/scheduler.py +679 -0
  176. mcli/workflow/sync/sync_cmd.py +437 -0
  177. mcli/workflow/sync/test_cmd.py +314 -0
  178. mcli/workflow/videos/videos.py +242 -0
  179. mcli/workflow/wakatime/wakatime.py +11 -0
  180. mcli/workflow/workflow.py +37 -0
  181. mcli_framework-7.0.0.dist-info/METADATA +479 -0
  182. mcli_framework-7.0.0.dist-info/RECORD +186 -0
  183. mcli_framework-7.0.0.dist-info/WHEEL +5 -0
  184. mcli_framework-7.0.0.dist-info/entry_points.txt +7 -0
  185. mcli_framework-7.0.0.dist-info/licenses/LICENSE +21 -0
  186. mcli_framework-7.0.0.dist-info/top_level.txt +1 -0
mcli/ml/monitoring/drift_detection.py
@@ -0,0 +1,676 @@
+"""Model monitoring and drift detection for ML systems"""
+
+import asyncio
+import json
+import logging
+import numpy as np
+import pandas as pd
+from datetime import datetime, timedelta
+from typing import Dict, Any, List, Optional, Union, Callable, Tuple
+from dataclasses import dataclass, field, asdict
+from pathlib import Path
+from enum import Enum
+import pickle
+from scipy import stats
+from scipy.stats import ks_2samp
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import IsolationForest
+import warnings
+
+logger = logging.getLogger(__name__)
+
+
+class DriftType(Enum):
+    DATA_DRIFT = "data_drift"
+    CONCEPT_DRIFT = "concept_drift"
+    PREDICTION_DRIFT = "prediction_drift"
+    MODEL_DEGRADATION = "model_degradation"
+
+
+class AlertSeverity(Enum):
+    LOW = "low"
+    MEDIUM = "medium"
+    HIGH = "high"
+    CRITICAL = "critical"
+
+
+@dataclass
+class DriftAlert:
+    """Drift detection alert"""
+    timestamp: datetime
+    drift_type: DriftType
+    severity: AlertSeverity
+    metric_name: str
+    value: float
+    threshold: float
+    description: str
+    metadata: Dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class ModelMetrics:
+    """Model performance metrics"""
+    timestamp: datetime
+    accuracy: float
+    precision: float
+    recall: float
+    f1_score: float
+    auc_roc: Optional[float] = None
+    log_loss: Optional[float] = None
+    mse: Optional[float] = None
+    mae: Optional[float] = None
+    custom_metrics: Dict[str, float] = field(default_factory=dict)
+
+
+@dataclass
+class DataProfile:
+    """Statistical profile of data"""
+    feature_means: Dict[str, float]
+    feature_stds: Dict[str, float]
+    feature_mins: Dict[str, float]
+    feature_maxs: Dict[str, float]
+    feature_nulls: Dict[str, float]
+    correlation_matrix: np.ndarray
+    timestamp: datetime
+
+
+class StatisticalDriftDetector:
+    """Detect statistical drift in data distributions"""
+
+    def __init__(self, reference_data: pd.DataFrame,
+                 significance_level: float = 0.05,
+                 min_samples: int = 100):
+        self.reference_data = reference_data
+        self.reference_profile = self._create_data_profile(reference_data)
+        self.significance_level = significance_level
+        self.min_samples = min_samples
+
+    def detect_drift(self, current_data: pd.DataFrame) -> Dict[str, Any]:
+        """Detect drift between reference and current data"""
+        if len(current_data) < self.min_samples:
+            return {"drift_detected": False, "message": "Insufficient samples"}
+
+        drift_results = {}
+        current_profile = self._create_data_profile(current_data)
+
+        # Kolmogorov-Smirnov test for each feature
+        ks_results = {}
+        for feature in self.reference_data.columns:
+            if feature in current_data.columns and pd.api.types.is_numeric_dtype(current_data[feature]):
+                ref_values = self.reference_data[feature].dropna()
+                curr_values = current_data[feature].dropna()
+
+                if len(ref_values) > 0 and len(curr_values) > 0:
+                    ks_stat, p_value = ks_2samp(ref_values, curr_values)
+                    ks_results[feature] = {
+                        "ks_statistic": ks_stat,
+                        "p_value": p_value,
+                        "drift_detected": p_value < self.significance_level
+                    }
+
+        # Population Stability Index (PSI)
+        psi_results = self._calculate_psi(self.reference_data, current_data)
+
+        # Feature distribution comparisons
+        feature_comparisons = self._compare_feature_distributions(
+            self.reference_profile, current_profile
+        )
+
+        drift_results = {
+            "timestamp": datetime.now(),
+            "ks_tests": ks_results,
+            "psi_scores": psi_results,
+            "feature_comparisons": feature_comparisons,
+            "overall_drift_detected": any(
+                result.get("drift_detected", False) for result in ks_results.values()
+            ) or any(score > 0.25 for score in psi_results.values()),
+            "reference_profile": asdict(self.reference_profile),
+            "current_profile": asdict(current_profile)
+        }
+
+        return drift_results
+
+    def _create_data_profile(self, data: pd.DataFrame) -> DataProfile:
+        """Create statistical profile of data"""
+        numeric_data = data.select_dtypes(include=[np.number])
+
+        return DataProfile(
+            feature_means=numeric_data.mean().to_dict(),
+            feature_stds=numeric_data.std().to_dict(),
+            feature_mins=numeric_data.min().to_dict(),
+            feature_maxs=numeric_data.max().to_dict(),
+            feature_nulls=data.isnull().sum().to_dict(),
+            correlation_matrix=numeric_data.corr().values if len(numeric_data.columns) > 1 else np.array([]),
+            timestamp=datetime.now()
+        )
+
+    def _calculate_psi(self, reference_data: pd.DataFrame,
+                       current_data: pd.DataFrame) -> Dict[str, float]:
+        """Calculate Population Stability Index for each feature"""
+        psi_scores = {}
+
+        for feature in reference_data.columns:
+            if feature in current_data.columns and pd.api.types.is_numeric_dtype(reference_data[feature]):
+                ref_values = reference_data[feature].dropna()
+                curr_values = current_data[feature].dropna()
+
+                if len(ref_values) > 0 and len(curr_values) > 0:
+                    psi_score = self._psi_score(ref_values, curr_values)
+                    psi_scores[feature] = psi_score
+
+        return psi_scores
+
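+    # A common rule of thumb for interpreting PSI: values below 0.1 indicate a
+    # stable distribution, 0.1-0.25 a moderate shift, and above 0.25 a major
+    # shift -- which is the PSI > 0.25 cutoff used by detect_drift() above.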
+    def _psi_score(self, reference: pd.Series, current: pd.Series,
+                   bins: int = 10) -> float:
+        """Calculate PSI score between two distributions"""
+        try:
+            # Create bins based on reference data
+            ref_min, ref_max = reference.min(), reference.max()
+            bin_edges = np.linspace(ref_min, ref_max, bins + 1)
+
+            # Calculate frequencies
+            ref_freq, _ = np.histogram(reference, bins=bin_edges)
+            curr_freq, _ = np.histogram(current, bins=bin_edges)
+
+            # Convert to proportions
+            ref_prop = ref_freq / len(reference)
+            curr_prop = curr_freq / len(current)
+
+            # Add small epsilon to avoid log(0)
+            epsilon = 1e-10
+            ref_prop = np.maximum(ref_prop, epsilon)
+            curr_prop = np.maximum(curr_prop, epsilon)
+
+            # Calculate PSI
+            psi = np.sum((curr_prop - ref_prop) * np.log(curr_prop / ref_prop))
+            return psi
+
+        except Exception as e:
+            logger.warning(f"Failed to calculate PSI: {e}")
+            return 0.0
+
+    def _compare_feature_distributions(self, ref_profile: DataProfile,
+                                       curr_profile: DataProfile) -> Dict[str, Dict[str, float]]:
+        """Compare feature distributions between profiles"""
+        comparisons = {}
+
+        for feature in ref_profile.feature_means.keys():
+            if feature in curr_profile.feature_means:
+                ref_mean = ref_profile.feature_means[feature]
+                curr_mean = curr_profile.feature_means[feature]
+                ref_std = ref_profile.feature_stds[feature]
+
+                # Calculate z-score for mean shift
+                z_score = abs(curr_mean - ref_mean) / ref_std if ref_std > 0 else 0
+
+                # Calculate coefficient of variation change
+                ref_cv = ref_std / ref_mean if ref_mean != 0 else 0
+                curr_cv = curr_profile.feature_stds[feature] / curr_mean if curr_mean != 0 else 0
+                cv_change = abs(curr_cv - ref_cv) / ref_cv if ref_cv > 0 else 0
+
+                comparisons[feature] = {
+                    "mean_z_score": z_score,
+                    "cv_change": cv_change,
+                    "mean_shift_detected": z_score > 2.0,
+                    "variance_change_detected": cv_change > 0.5
+                }
+
+        return comparisons
+
+
+class ConceptDriftDetector:
+    """Detect concept drift in model predictions"""
+
+    def __init__(self, window_size: int = 1000,
+                 detection_threshold: float = 0.05):
+        self.window_size = window_size
+        self.detection_threshold = detection_threshold
+        self.historical_metrics = []
+
+    def add_batch_metrics(self, metrics: ModelMetrics):
+        """Add batch metrics for drift detection"""
+        self.historical_metrics.append(metrics)
+
+        # Keep only recent metrics
+        if len(self.historical_metrics) > self.window_size * 2:
+            self.historical_metrics = self.historical_metrics[-self.window_size:]
+
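+    # Concept drift is inferred here from performance degradation alone: the
+    # metric history is split into an early and a recent window, and each
+    # window's mean metrics are compared against the relative-change threshold.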
+    def detect_concept_drift(self) -> Dict[str, Any]:
+        """Detect concept drift using model performance degradation"""
+        if len(self.historical_metrics) < self.window_size:
+            return {"drift_detected": False, "message": "Insufficient historical data"}
+
+        # Split metrics into two windows
+        mid_point = len(self.historical_metrics) // 2
+        early_metrics = self.historical_metrics[:mid_point]
+        recent_metrics = self.historical_metrics[mid_point:]
+
+        # Calculate average performance for each window
+        early_performance = self._calculate_average_performance(early_metrics)
+        recent_performance = self._calculate_average_performance(recent_metrics)
+
+        # Detect significant performance degradation
+        drift_detected = False
+        degraded_metrics = []
+
+        for metric_name in ["accuracy", "precision", "recall", "f1_score"]:
+            if metric_name in early_performance and metric_name in recent_performance:
+                early_value = early_performance[metric_name]
+                recent_value = recent_performance[metric_name]
+
+                # Check for significant decrease
+                if early_value > 0:
+                    relative_change = (recent_value - early_value) / early_value
+                    if relative_change < -self.detection_threshold:
+                        drift_detected = True
+                        degraded_metrics.append({
+                            "metric": metric_name,
+                            "early_value": early_value,
+                            "recent_value": recent_value,
+                            "relative_change": relative_change
+                        })
+
+        return {
+            "drift_detected": drift_detected,
+            "degraded_metrics": degraded_metrics,
+            "early_performance": early_performance,
+            "recent_performance": recent_performance,
+            "timestamp": datetime.now()
+        }
+
+    def _calculate_average_performance(self, metrics_list: List[ModelMetrics]) -> Dict[str, float]:
+        """Calculate average performance metrics"""
+        if not metrics_list:
+            return {}
+
+        performance = {
+            "accuracy": np.mean([m.accuracy for m in metrics_list]),
+            "precision": np.mean([m.precision for m in metrics_list]),
+            "recall": np.mean([m.recall for m in metrics_list]),
+            "f1_score": np.mean([m.f1_score for m in metrics_list])
+        }
+
+        # Add optional metrics if available
+        auc_scores = [m.auc_roc for m in metrics_list if m.auc_roc is not None]
+        if auc_scores:
+            performance["auc_roc"] = np.mean(auc_scores)
+
+        return performance
+
+
+class OutlierDetector:
+    """Detect outliers in incoming data"""
+
+    def __init__(self, contamination: float = 0.1):
+        self.contamination = contamination
+        self.detector = None
+        self.is_fitted = False
+
+    def fit(self, reference_data: pd.DataFrame):
+        """Fit outlier detector on reference data"""
+        numeric_data = reference_data.select_dtypes(include=[np.number])
+
+        if numeric_data.empty:
+            logger.warning("No numeric features found for outlier detection")
+            return
+
+        self.detector = IsolationForest(
+            contamination=self.contamination,
+            random_state=42
+        )
+        self.detector.fit(numeric_data.fillna(0))
+        self.is_fitted = True
+
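+    # sklearn's IsolationForest labels inliers as 1 and outliers as -1, and
+    # decision_function() returns an anomaly score where lower values are more
+    # anomalous; detect_outliers() below relies on both behaviors.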
+    def detect_outliers(self, data: pd.DataFrame) -> Dict[str, Any]:
+        """Detect outliers in new data"""
+        if not self.is_fitted:
+            return {"outliers_detected": False, "message": "Detector not fitted"}
+
+        numeric_data = data.select_dtypes(include=[np.number])
+
+        if numeric_data.empty:
+            return {"outliers_detected": False, "message": "No numeric features"}
+
+        # Predict outliers
+        outlier_scores = self.detector.decision_function(numeric_data.fillna(0))
+        outlier_labels = self.detector.predict(numeric_data.fillna(0))
+
+        outliers_mask = outlier_labels == -1
+        outlier_ratio = np.mean(outliers_mask)
+
+        return {
+            "outliers_detected": outlier_ratio > self.contamination * 2,  # Alert if 2x expected
+            "outlier_ratio": outlier_ratio,
+            "outlier_scores": outlier_scores.tolist(),
+            "outlier_indices": np.where(outliers_mask)[0].tolist(),
+            "timestamp": datetime.now()
+        }
+
+
+class ModelMonitor:
+    """Comprehensive model monitoring system"""
+
+    def __init__(self, model_name: str, storage_path: Path = Path("monitoring")):
+        self.model_name = model_name
+        self.storage_path = storage_path / model_name
+        self.storage_path.mkdir(parents=True, exist_ok=True)
+
+        # Initialize detectors
+        self.statistical_detector = None
+        self.concept_detector = ConceptDriftDetector()
+        self.outlier_detector = OutlierDetector()
+
+        # Monitoring configuration
+        self.thresholds = {
+            "data_drift_psi": 0.25,
+            "concept_drift_threshold": 0.05,
+            "outlier_ratio_threshold": 0.2,
+            "performance_degradation": 0.1
+        }
+
+        # Alert handlers
+        self.alert_handlers = []
+
+        # Monitoring history
+        self.monitoring_history = []
+
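+    # Until setup_reference_data() is called, monitor_batch() skips the
+    # data-drift check and the outlier detector reports "Detector not fitted".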
+    def setup_reference_data(self, reference_data: pd.DataFrame):
+        """Set up reference data for drift detection"""
+        self.statistical_detector = StatisticalDriftDetector(reference_data)
+        self.outlier_detector.fit(reference_data)
+
+        # Save reference data profile
+        self._save_reference_profile(reference_data)
+
+    def monitor_batch(self, current_data: pd.DataFrame,
+                      predictions: np.ndarray,
+                      true_labels: Optional[np.ndarray] = None) -> Dict[str, Any]:
+        """Monitor a batch of data and predictions"""
+        monitoring_result = {
+            "timestamp": datetime.now(),
+            "batch_size": len(current_data),
+            "alerts": [],
+            "metrics": {}
+        }
+
+        # Data drift detection
+        if self.statistical_detector:
+            drift_result = self.statistical_detector.detect_drift(current_data)
+            monitoring_result["data_drift"] = drift_result
+
+            if drift_result.get("overall_drift_detected", False):
+                alert = DriftAlert(
+                    timestamp=datetime.now(),
+                    drift_type=DriftType.DATA_DRIFT,
+                    severity=AlertSeverity.MEDIUM,
+                    metric_name="overall_data_drift",
+                    value=1.0,
+                    threshold=0.5,
+                    description="Statistical drift detected in input features",
+                    metadata=drift_result
+                )
+                monitoring_result["alerts"].append(alert)
+
+        # Outlier detection
+        outlier_result = self.outlier_detector.detect_outliers(current_data)
+        monitoring_result["outliers"] = outlier_result
+
+        if outlier_result.get("outliers_detected", False):
+            alert = DriftAlert(
+                timestamp=datetime.now(),
+                drift_type=DriftType.DATA_DRIFT,
+                severity=AlertSeverity.LOW,
+                metric_name="outlier_ratio",
+                value=outlier_result["outlier_ratio"],
+                threshold=self.thresholds["outlier_ratio_threshold"],
+                description=f"High outlier ratio detected: {outlier_result['outlier_ratio']:.3f}",
+                metadata=outlier_result
+            )
+            monitoring_result["alerts"].append(alert)
+
+        # Prediction drift analysis
+        prediction_stats = self._analyze_predictions(predictions)
+        monitoring_result["prediction_stats"] = prediction_stats
+
+        # Model performance monitoring (if true labels available)
+        if true_labels is not None:
+            performance_metrics = self._calculate_performance_metrics(predictions, true_labels)
+            monitoring_result["performance"] = performance_metrics
+
+            # Add to concept drift detector
+            self.concept_detector.add_batch_metrics(performance_metrics)
+
+            # Check for concept drift
+            concept_drift_result = self.concept_detector.detect_concept_drift()
+            monitoring_result["concept_drift"] = concept_drift_result
+
+            if concept_drift_result.get("drift_detected", False):
+                alert = DriftAlert(
+                    timestamp=datetime.now(),
+                    drift_type=DriftType.CONCEPT_DRIFT,
+                    severity=AlertSeverity.HIGH,
+                    metric_name="model_performance",
+                    value=performance_metrics.accuracy,
+                    threshold=self.thresholds["performance_degradation"],
+                    description="Model performance degradation detected",
+                    metadata=concept_drift_result
+                )
+                monitoring_result["alerts"].append(alert)
+
+        # Process alerts
+        for alert in monitoring_result["alerts"]:
+            self._handle_alert(alert)
+
+        # Save monitoring result
+        self._save_monitoring_result(monitoring_result)
+
+        return monitoring_result
+
+    def add_alert_handler(self, handler: Callable[[DriftAlert], None]):
+        """Add alert handler function"""
+        self.alert_handlers.append(handler)
+
+    def get_monitoring_summary(self, days: int = 7) -> Dict[str, Any]:
+        """Get monitoring summary for the last N days"""
+        cutoff_date = datetime.now() - timedelta(days=days)
+        recent_results = [
+            result for result in self.monitoring_history
+            if result["timestamp"] >= cutoff_date
+        ]
+
+        if not recent_results:
+            return {"message": "No monitoring data available"}
+
+        # Count alerts by type and severity
+        alert_counts = {}
+        for result in recent_results:
+            for alert in result.get("alerts", []):
+                key = f"{alert.drift_type.value}_{alert.severity.value}"
+                alert_counts[key] = alert_counts.get(key, 0) + 1
+
+        # Calculate average metrics
+        avg_metrics = {}
+        if recent_results and "performance" in recent_results[0]:
+            performance_data = [r["performance"] for r in recent_results if "performance" in r]
+            if performance_data:
+                avg_metrics = {
+                    "avg_accuracy": np.mean([p.accuracy for p in performance_data]),
+                    "avg_precision": np.mean([p.precision for p in performance_data]),
+                    "avg_recall": np.mean([p.recall for p in performance_data]),
+                    "avg_f1_score": np.mean([p.f1_score for p in performance_data])
+                }
+
+        return {
+            "period_days": days,
+            "total_batches": len(recent_results),
+            "alert_counts": alert_counts,
+            "average_metrics": avg_metrics,
+            "latest_timestamp": recent_results[-1]["timestamp"] if recent_results else None
+        }
+
+    def _analyze_predictions(self, predictions: np.ndarray) -> Dict[str, Any]:
+        """Analyze prediction distribution"""
+        return {
+            "mean": float(np.mean(predictions)),
+            "std": float(np.std(predictions)),
+            "min": float(np.min(predictions)),
+            "max": float(np.max(predictions)),
+            "unique_values": len(np.unique(predictions))
+        }
+
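+    # Labels with exactly two unique values are treated as binary
+    # classification (predictions thresholded at 0.5); any other label set
+    # falls through to the regression branch (MSE/MAE).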
+    def _calculate_performance_metrics(self, predictions: np.ndarray,
+                                       true_labels: np.ndarray) -> ModelMetrics:
+        """Calculate model performance metrics"""
+        # Convert to binary if needed
+        if len(np.unique(true_labels)) == 2:
+            # Binary classification
+            pred_binary = (predictions > 0.5).astype(int)
+            true_binary = true_labels.astype(int)
+
+            tp = np.sum((pred_binary == 1) & (true_binary == 1))
+            fp = np.sum((pred_binary == 1) & (true_binary == 0))
+            tn = np.sum((pred_binary == 0) & (true_binary == 0))
+            fn = np.sum((pred_binary == 0) & (true_binary == 1))
+
+            accuracy = (tp + tn) / len(true_labels) if len(true_labels) > 0 else 0
+            precision = tp / (tp + fp) if (tp + fp) > 0 else 0
+            recall = tp / (tp + fn) if (tp + fn) > 0 else 0
+            f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
+
+            return ModelMetrics(
+                timestamp=datetime.now(),
+                accuracy=accuracy,
+                precision=precision,
+                recall=recall,
+                f1_score=f1_score
+            )
+        else:
+            # Regression metrics
+            mse = np.mean((predictions - true_labels) ** 2)
+            mae = np.mean(np.abs(predictions - true_labels))
+
+            return ModelMetrics(
+                timestamp=datetime.now(),
+                accuracy=0.0,  # Not applicable for regression
+                precision=0.0,
+                recall=0.0,
+                f1_score=0.0,
+                mse=mse,
+                mae=mae
+            )
+
+    def _handle_alert(self, alert: DriftAlert):
+        """Handle drift alert"""
+        logger.warning(f"DRIFT ALERT: {alert.description} "
+                       f"(Type: {alert.drift_type.value}, Severity: {alert.severity.value})")
+
+        # Call registered alert handlers
+        for handler in self.alert_handlers:
+            try:
+                handler(alert)
+            except Exception as e:
+                logger.error(f"Alert handler failed: {e}")
+
+    def _save_monitoring_result(self, result: Dict[str, Any]):
+        """Save monitoring result to storage"""
+        timestamp_str = result["timestamp"].strftime("%Y%m%d_%H%M%S")
+        filename = self.storage_path / f"monitoring_{timestamp_str}.json"
+
+        # Convert non-serializable objects
+        serializable_result = self._make_serializable(result)
+
+        with open(filename, 'w') as f:
+            json.dump(serializable_result, f, indent=2, default=str)
+
+        self.monitoring_history.append(result)
+
+        # Keep only recent history in memory
+        if len(self.monitoring_history) > 1000:
+            self.monitoring_history = self.monitoring_history[-500:]
+
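+    # Note: despite the "profile" filename, this persists the full reference
+    # DataFrame via pickle, not just the derived DataProfile.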
+    def _save_reference_profile(self, reference_data: pd.DataFrame):
+        """Save reference data profile"""
+        profile_file = self.storage_path / "reference_profile.pkl"
+
+        with open(profile_file, 'wb') as f:
+            pickle.dump(reference_data, f)
+
+    def _make_serializable(self, obj: Any) -> Any:
+        """Convert object to JSON-serializable format"""
+        if isinstance(obj, np.ndarray):
+            return obj.tolist()
+        elif isinstance(obj, np.integer):
+            return int(obj)
+        elif isinstance(obj, np.floating):
+            return float(obj)
+        elif isinstance(obj, datetime):
+            return obj.isoformat()
+        elif isinstance(obj, DriftAlert):
+            return asdict(obj)
+        elif isinstance(obj, ModelMetrics):
+            return asdict(obj)
+        elif isinstance(obj, (DriftType, AlertSeverity)):
+            return obj.value
+        elif isinstance(obj, dict):
+            return {k: self._make_serializable(v) for k, v in obj.items()}
+        elif isinstance(obj, list):
+            return [self._make_serializable(item) for item in obj]
+        else:
+            return obj
+
+
+# Example alert handlers
+def email_alert_handler(alert: DriftAlert):
+    """Example email alert handler"""
+    logger.info(f"EMAIL ALERT: {alert.description}")
+    # In production, would send actual email
+
+
+def slack_alert_handler(alert: DriftAlert):
+    """Example Slack alert handler"""
+    logger.info(f"SLACK ALERT: {alert.description}")
+    # In production, would send to Slack
+
+
+# Example usage
+if __name__ == "__main__":
+    # Generate sample data
+    np.random.seed(42)
+    reference_data = pd.DataFrame({
+        'feature1': np.random.normal(0, 1, 1000),
+        'feature2': np.random.normal(5, 2, 1000),
+        'feature3': np.random.uniform(0, 10, 1000)
+    })
+
+    # Initialize monitor
+    monitor = ModelMonitor("stock_recommendation_model")
+    monitor.setup_reference_data(reference_data)
+
+    # Add alert handlers
+    monitor.add_alert_handler(email_alert_handler)
+    monitor.add_alert_handler(slack_alert_handler)
+
+    # Simulate monitoring batches
+    for i in range(10):
+        # Generate current data (with some drift)
+        drift_factor = i * 0.1
+        current_data = pd.DataFrame({
+            'feature1': np.random.normal(drift_factor, 1, 100),
+            'feature2': np.random.normal(5 + drift_factor, 2, 100),
+            'feature3': np.random.uniform(0, 10 + drift_factor, 100)
+        })
+
+        # Generate predictions and labels
+        predictions = np.random.uniform(0, 1, 100)
+        true_labels = (predictions + np.random.normal(0, 0.1, 100) > 0.5).astype(int)
+
+        # Monitor batch
+        result = monitor.monitor_batch(current_data, predictions, true_labels)
+
+        print(f"Batch {i}: {len(result['alerts'])} alerts generated")
+
+    # Get monitoring summary
+    summary = monitor.get_monitoring_summary(days=1)
+    print(f"Monitoring Summary: {json.dumps(summary, indent=2, default=str)}")
+
+    logger.info("Model monitoring demo completed")