mcli-framework 7.0.0 (mcli_framework-7.0.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mcli-framework might be problematic.

Files changed (186)
  1. mcli/app/chat_cmd.py +42 -0
  2. mcli/app/commands_cmd.py +226 -0
  3. mcli/app/completion_cmd.py +216 -0
  4. mcli/app/completion_helpers.py +288 -0
  5. mcli/app/cron_test_cmd.py +697 -0
  6. mcli/app/logs_cmd.py +419 -0
  7. mcli/app/main.py +492 -0
  8. mcli/app/model/model.py +1060 -0
  9. mcli/app/model_cmd.py +227 -0
  10. mcli/app/redis_cmd.py +269 -0
  11. mcli/app/video/video.py +1114 -0
  12. mcli/app/visual_cmd.py +303 -0
  13. mcli/chat/chat.py +2409 -0
  14. mcli/chat/command_rag.py +514 -0
  15. mcli/chat/enhanced_chat.py +652 -0
  16. mcli/chat/system_controller.py +1010 -0
  17. mcli/chat/system_integration.py +1016 -0
  18. mcli/cli.py +25 -0
  19. mcli/config.toml +20 -0
  20. mcli/lib/api/api.py +586 -0
  21. mcli/lib/api/daemon_client.py +203 -0
  22. mcli/lib/api/daemon_client_local.py +44 -0
  23. mcli/lib/api/daemon_decorator.py +217 -0
  24. mcli/lib/api/mcli_decorators.py +1032 -0
  25. mcli/lib/auth/auth.py +85 -0
  26. mcli/lib/auth/aws_manager.py +85 -0
  27. mcli/lib/auth/azure_manager.py +91 -0
  28. mcli/lib/auth/credential_manager.py +192 -0
  29. mcli/lib/auth/gcp_manager.py +93 -0
  30. mcli/lib/auth/key_manager.py +117 -0
  31. mcli/lib/auth/mcli_manager.py +93 -0
  32. mcli/lib/auth/token_manager.py +75 -0
  33. mcli/lib/auth/token_util.py +1011 -0
  34. mcli/lib/config/config.py +47 -0
  35. mcli/lib/discovery/__init__.py +1 -0
  36. mcli/lib/discovery/command_discovery.py +274 -0
  37. mcli/lib/erd/erd.py +1345 -0
  38. mcli/lib/erd/generate_graph.py +453 -0
  39. mcli/lib/files/files.py +76 -0
  40. mcli/lib/fs/fs.py +109 -0
  41. mcli/lib/lib.py +29 -0
  42. mcli/lib/logger/logger.py +611 -0
  43. mcli/lib/performance/optimizer.py +409 -0
  44. mcli/lib/performance/rust_bridge.py +502 -0
  45. mcli/lib/performance/uvloop_config.py +154 -0
  46. mcli/lib/pickles/pickles.py +50 -0
  47. mcli/lib/search/cached_vectorizer.py +479 -0
  48. mcli/lib/services/data_pipeline.py +460 -0
  49. mcli/lib/services/lsh_client.py +441 -0
  50. mcli/lib/services/redis_service.py +387 -0
  51. mcli/lib/shell/shell.py +137 -0
  52. mcli/lib/toml/toml.py +33 -0
  53. mcli/lib/ui/styling.py +47 -0
  54. mcli/lib/ui/visual_effects.py +634 -0
  55. mcli/lib/watcher/watcher.py +185 -0
  56. mcli/ml/api/app.py +215 -0
  57. mcli/ml/api/middleware.py +224 -0
  58. mcli/ml/api/routers/admin_router.py +12 -0
  59. mcli/ml/api/routers/auth_router.py +244 -0
  60. mcli/ml/api/routers/backtest_router.py +12 -0
  61. mcli/ml/api/routers/data_router.py +12 -0
  62. mcli/ml/api/routers/model_router.py +302 -0
  63. mcli/ml/api/routers/monitoring_router.py +12 -0
  64. mcli/ml/api/routers/portfolio_router.py +12 -0
  65. mcli/ml/api/routers/prediction_router.py +267 -0
  66. mcli/ml/api/routers/trade_router.py +12 -0
  67. mcli/ml/api/routers/websocket_router.py +76 -0
  68. mcli/ml/api/schemas.py +64 -0
  69. mcli/ml/auth/auth_manager.py +425 -0
  70. mcli/ml/auth/models.py +154 -0
  71. mcli/ml/auth/permissions.py +302 -0
  72. mcli/ml/backtesting/backtest_engine.py +502 -0
  73. mcli/ml/backtesting/performance_metrics.py +393 -0
  74. mcli/ml/cache.py +400 -0
  75. mcli/ml/cli/main.py +398 -0
  76. mcli/ml/config/settings.py +394 -0
  77. mcli/ml/configs/dvc_config.py +230 -0
  78. mcli/ml/configs/mlflow_config.py +131 -0
  79. mcli/ml/configs/mlops_manager.py +293 -0
  80. mcli/ml/dashboard/app.py +532 -0
  81. mcli/ml/dashboard/app_integrated.py +738 -0
  82. mcli/ml/dashboard/app_supabase.py +560 -0
  83. mcli/ml/dashboard/app_training.py +615 -0
  84. mcli/ml/dashboard/cli.py +51 -0
  85. mcli/ml/data_ingestion/api_connectors.py +501 -0
  86. mcli/ml/data_ingestion/data_pipeline.py +567 -0
  87. mcli/ml/data_ingestion/stream_processor.py +512 -0
  88. mcli/ml/database/migrations/env.py +94 -0
  89. mcli/ml/database/models.py +667 -0
  90. mcli/ml/database/session.py +200 -0
  91. mcli/ml/experimentation/ab_testing.py +845 -0
  92. mcli/ml/features/ensemble_features.py +607 -0
  93. mcli/ml/features/political_features.py +676 -0
  94. mcli/ml/features/recommendation_engine.py +809 -0
  95. mcli/ml/features/stock_features.py +573 -0
  96. mcli/ml/features/test_feature_engineering.py +346 -0
  97. mcli/ml/logging.py +85 -0
  98. mcli/ml/mlops/data_versioning.py +518 -0
  99. mcli/ml/mlops/experiment_tracker.py +377 -0
  100. mcli/ml/mlops/model_serving.py +481 -0
  101. mcli/ml/mlops/pipeline_orchestrator.py +614 -0
  102. mcli/ml/models/base_models.py +324 -0
  103. mcli/ml/models/ensemble_models.py +675 -0
  104. mcli/ml/models/recommendation_models.py +474 -0
  105. mcli/ml/models/test_models.py +487 -0
  106. mcli/ml/monitoring/drift_detection.py +676 -0
  107. mcli/ml/monitoring/metrics.py +45 -0
  108. mcli/ml/optimization/portfolio_optimizer.py +834 -0
  109. mcli/ml/preprocessing/data_cleaners.py +451 -0
  110. mcli/ml/preprocessing/feature_extractors.py +491 -0
  111. mcli/ml/preprocessing/ml_pipeline.py +382 -0
  112. mcli/ml/preprocessing/politician_trading_preprocessor.py +569 -0
  113. mcli/ml/preprocessing/test_preprocessing.py +294 -0
  114. mcli/ml/scripts/populate_sample_data.py +200 -0
  115. mcli/ml/tasks.py +400 -0
  116. mcli/ml/tests/test_integration.py +429 -0
  117. mcli/ml/tests/test_training_dashboard.py +387 -0
  118. mcli/public/oi/oi.py +15 -0
  119. mcli/public/public.py +4 -0
  120. mcli/self/self_cmd.py +1246 -0
  121. mcli/workflow/daemon/api_daemon.py +800 -0
  122. mcli/workflow/daemon/async_command_database.py +681 -0
  123. mcli/workflow/daemon/async_process_manager.py +591 -0
  124. mcli/workflow/daemon/client.py +530 -0
  125. mcli/workflow/daemon/commands.py +1196 -0
  126. mcli/workflow/daemon/daemon.py +905 -0
  127. mcli/workflow/daemon/daemon_api.py +59 -0
  128. mcli/workflow/daemon/enhanced_daemon.py +571 -0
  129. mcli/workflow/daemon/process_cli.py +244 -0
  130. mcli/workflow/daemon/process_manager.py +439 -0
  131. mcli/workflow/daemon/test_daemon.py +275 -0
  132. mcli/workflow/dashboard/dashboard_cmd.py +113 -0
  133. mcli/workflow/docker/docker.py +0 -0
  134. mcli/workflow/file/file.py +100 -0
  135. mcli/workflow/gcloud/config.toml +21 -0
  136. mcli/workflow/gcloud/gcloud.py +58 -0
  137. mcli/workflow/git_commit/ai_service.py +328 -0
  138. mcli/workflow/git_commit/commands.py +430 -0
  139. mcli/workflow/lsh_integration.py +355 -0
  140. mcli/workflow/model_service/client.py +594 -0
  141. mcli/workflow/model_service/download_and_run_efficient_models.py +288 -0
  142. mcli/workflow/model_service/lightweight_embedder.py +397 -0
  143. mcli/workflow/model_service/lightweight_model_server.py +714 -0
  144. mcli/workflow/model_service/lightweight_test.py +241 -0
  145. mcli/workflow/model_service/model_service.py +1955 -0
  146. mcli/workflow/model_service/ollama_efficient_runner.py +425 -0
  147. mcli/workflow/model_service/pdf_processor.py +386 -0
  148. mcli/workflow/model_service/test_efficient_runner.py +234 -0
  149. mcli/workflow/model_service/test_example.py +315 -0
  150. mcli/workflow/model_service/test_integration.py +131 -0
  151. mcli/workflow/model_service/test_new_features.py +149 -0
  152. mcli/workflow/openai/openai.py +99 -0
  153. mcli/workflow/politician_trading/commands.py +1790 -0
  154. mcli/workflow/politician_trading/config.py +134 -0
  155. mcli/workflow/politician_trading/connectivity.py +490 -0
  156. mcli/workflow/politician_trading/data_sources.py +395 -0
  157. mcli/workflow/politician_trading/database.py +410 -0
  158. mcli/workflow/politician_trading/demo.py +248 -0
  159. mcli/workflow/politician_trading/models.py +165 -0
  160. mcli/workflow/politician_trading/monitoring.py +413 -0
  161. mcli/workflow/politician_trading/scrapers.py +966 -0
  162. mcli/workflow/politician_trading/scrapers_california.py +412 -0
  163. mcli/workflow/politician_trading/scrapers_eu.py +377 -0
  164. mcli/workflow/politician_trading/scrapers_uk.py +350 -0
  165. mcli/workflow/politician_trading/scrapers_us_states.py +438 -0
  166. mcli/workflow/politician_trading/supabase_functions.py +354 -0
  167. mcli/workflow/politician_trading/workflow.py +852 -0
  168. mcli/workflow/registry/registry.py +180 -0
  169. mcli/workflow/repo/repo.py +223 -0
  170. mcli/workflow/scheduler/commands.py +493 -0
  171. mcli/workflow/scheduler/cron_parser.py +238 -0
  172. mcli/workflow/scheduler/job.py +182 -0
  173. mcli/workflow/scheduler/monitor.py +139 -0
  174. mcli/workflow/scheduler/persistence.py +324 -0
  175. mcli/workflow/scheduler/scheduler.py +679 -0
  176. mcli/workflow/sync/sync_cmd.py +437 -0
  177. mcli/workflow/sync/test_cmd.py +314 -0
  178. mcli/workflow/videos/videos.py +242 -0
  179. mcli/workflow/wakatime/wakatime.py +11 -0
  180. mcli/workflow/workflow.py +37 -0
  181. mcli_framework-7.0.0.dist-info/METADATA +479 -0
  182. mcli_framework-7.0.0.dist-info/RECORD +186 -0
  183. mcli_framework-7.0.0.dist-info/WHEEL +5 -0
  184. mcli_framework-7.0.0.dist-info/entry_points.txt +7 -0
  185. mcli_framework-7.0.0.dist-info/licenses/LICENSE +21 -0
  186. mcli_framework-7.0.0.dist-info/top_level.txt +1 -0
mcli/ml/experimentation/ab_testing.py
@@ -0,0 +1,845 @@
+"""A/B Testing framework for ML model experiments"""
+
+import asyncio
+import hashlib
+import json
+import logging
+import random
+import uuid
+from dataclasses import dataclass, field, asdict
+from datetime import datetime, timedelta
+from enum import Enum
+from typing import Dict, Any, List, Optional, Union, Callable
+import pandas as pd
+import numpy as np
+from scipy import stats
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
+class ExperimentStatus(Enum):
+    DRAFT = "draft"
+    RUNNING = "running"
+    PAUSED = "paused"
+    COMPLETED = "completed"
+    CANCELLED = "cancelled"
+
+
+class VariantType(Enum):
+    CONTROL = "control"
+    TREATMENT = "treatment"
+
+
+@dataclass
+class Variant:
+    """A/B test variant configuration"""
+    id: str
+    name: str
+    type: VariantType
+    traffic_percentage: float
+    model_config: Dict[str, Any] = field(default_factory=dict)
+    feature_flags: Dict[str, Any] = field(default_factory=dict)
+    description: str = ""
+
+
+@dataclass
+class Metric:
+    """A/B test metric definition"""
+    name: str
+    type: str  # "binary", "continuous", "count"
+    aggregation: str  # "mean", "sum", "count", "rate"
+    goal: str  # "increase", "decrease", "maintain"
+    statistical_power: float = 0.8
+    min_detectable_effect: float = 0.05
+    primary: bool = False
+
+
+@dataclass
+class ExperimentConfig:
+    """A/B test experiment configuration"""
+    id: str
+    name: str
+    description: str
+    variants: List[Variant]
+    metrics: List[Metric]
+
+    # Traffic configuration
+    traffic_percentage: float = 100.0  # Percentage of traffic to include
+
+    # Duration configuration
+    start_date: Optional[datetime] = None
+    end_date: Optional[datetime] = None
+    min_duration_days: int = 7
+    max_duration_days: int = 30
+
+    # Statistical configuration
+    significance_level: float = 0.05
+    statistical_power: float = 0.8
+    min_sample_size: int = 1000
+
+    # Guardrail metrics
+    guardrail_metrics: List[str] = field(default_factory=list)
+
+    # Feature flags
+    feature_flags: Dict[str, Any] = field(default_factory=dict)
+
+    status: ExperimentStatus = ExperimentStatus.DRAFT
+
+
+@dataclass
+class UserAssignment:
+    """User assignment to experiment variant"""
+    user_id: str
+    experiment_id: str
+    variant_id: str
+    assigned_at: datetime
+    metadata: Dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class ExperimentResult:
+    """Results of an A/B test experiment"""
+    experiment_id: str
+    variant_results: Dict[str, Dict[str, Any]]
+    statistical_tests: Dict[str, Dict[str, Any]]
+    confidence_intervals: Dict[str, Dict[str, tuple]]
+    recommendations: List[str]
+    created_at: datetime
+
+    # Overall experiment stats
+    total_users: int = 0
+    duration_days: int = 0
+    statistical_significance: bool = False
+    winner_variant: Optional[str] = None
+
+
+class TrafficSplitter:
+    """Handle traffic splitting for A/B tests"""
+
+    def __init__(self):
+        self.assignments = {}
+
+    def assign_variant(self, user_id: str, experiment: ExperimentConfig) -> str:
+        """Assign user to experiment variant"""
+        # Check if user already assigned
+        cache_key = f"{user_id}:{experiment.id}"
+        if cache_key in self.assignments:
+            return self.assignments[cache_key]
+
+        # Hash user ID for consistent assignment
+        hash_input = f"{user_id}:{experiment.id}".encode()
+        hash_value = int(hashlib.md5(hash_input).hexdigest(), 16)
+        hash_ratio = (hash_value % 10000) / 10000.0
+
+        # Check if user should be included in experiment
+        if hash_ratio * 100 > experiment.traffic_percentage:
+            return "control"  # Not in experiment
+
+        # Assign to variant based on traffic split
+        cumulative_percentage = 0
+        for variant in experiment.variants:
+            cumulative_percentage += variant.traffic_percentage
+            if hash_ratio * 100 <= cumulative_percentage:
+                self.assignments[cache_key] = variant.id
+                return variant.id
+
+        # Default to control
+        control_variant = next((v for v in experiment.variants if v.type == VariantType.CONTROL), experiment.variants[0])
+        self.assignments[cache_key] = control_variant.id
+        return control_variant.id
+
+    def get_assignment(self, user_id: str, experiment_id: str) -> Optional[str]:
+        """Get existing assignment"""
+        cache_key = f"{user_id}:{experiment_id}"
+        return self.assignments.get(cache_key)
+
+
+class MetricsCollector:
+    """Collect and store experiment metrics"""
+
+    def __init__(self, storage_path: Path = Path("experiments/metrics")):
+        self.storage_path = storage_path
+        self.storage_path.mkdir(parents=True, exist_ok=True)
+        self.metrics_buffer = []
+
+    def record_metric(self, user_id: str, experiment_id: str, variant_id: str,
+                      metric_name: str, value: Union[float, int, bool],
+                      timestamp: Optional[datetime] = None):
+        """Record a metric value for a user"""
+        if timestamp is None:
+            timestamp = datetime.now()
+
+        metric_record = {
+            "user_id": user_id,
+            "experiment_id": experiment_id,
+            "variant_id": variant_id,
+            "metric_name": metric_name,
+            "value": value,
+            "timestamp": timestamp.isoformat()
+        }
+
+        self.metrics_buffer.append(metric_record)
+
+        # Flush buffer if it gets too large
+        if len(self.metrics_buffer) >= 1000:
+            self.flush_metrics()
+
+    def flush_metrics(self):
+        """Flush metrics buffer to storage"""
+        if not self.metrics_buffer:
+            return
+
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        filename = self.storage_path / f"metrics_{timestamp}.json"
+
+        with open(filename, 'w') as f:
+            json.dump(self.metrics_buffer, f, indent=2)
+
+        logger.info(f"Flushed {len(self.metrics_buffer)} metrics to {filename}")
+        self.metrics_buffer.clear()
+
+    def get_experiment_metrics(self, experiment_id: str) -> pd.DataFrame:
+        """Get all metrics for an experiment"""
+        all_metrics = []
+
+        # Load from all metric files
+        for file_path in self.storage_path.glob("metrics_*.json"):
+            with open(file_path, 'r') as f:
+                metrics = json.load(f)
+                experiment_metrics = [m for m in metrics if m["experiment_id"] == experiment_id]
+                all_metrics.extend(experiment_metrics)
+
+        if not all_metrics:
+            return pd.DataFrame()
+
+        df = pd.DataFrame(all_metrics)
+        df['timestamp'] = pd.to_datetime(df['timestamp'])
+        return df
+
+
+class StatisticalAnalyzer:
+    """Perform statistical analysis on A/B test results"""
+
+    def __init__(self, significance_level: float = 0.05):
+        self.significance_level = significance_level
+
+    def analyze_experiment(self, experiment: ExperimentConfig,
+                           metrics_df: pd.DataFrame) -> ExperimentResult:
+        """Analyze experiment results"""
+        if metrics_df.empty:
+            return self._empty_result(experiment.id)
+
+        # Group metrics by variant
+        variant_data = {}
+        for variant in experiment.variants:
+            variant_metrics = metrics_df[metrics_df['variant_id'] == variant.id]
+            variant_data[variant.id] = self._analyze_variant_metrics(variant_metrics, experiment.metrics)
+
+        # Perform statistical tests
+        statistical_tests = {}
+        confidence_intervals = {}
+
+        control_variant = next((v for v in experiment.variants if v.type == VariantType.CONTROL), None)
+        if control_variant:
+            for variant in experiment.variants:
+                if variant.type == VariantType.TREATMENT:
+                    tests, intervals = self._compare_variants(
+                        metrics_df, control_variant.id, variant.id, experiment.metrics
+                    )
+                    statistical_tests[variant.id] = tests
+                    confidence_intervals[variant.id] = intervals
+
+        # Generate recommendations
+        recommendations = self._generate_recommendations(
+            variant_data, statistical_tests, experiment.metrics
+        )
+
+        # Determine winner
+        winner = self._determine_winner(statistical_tests, experiment.metrics)
+
+        return ExperimentResult(
+            experiment_id=experiment.id,
+            variant_results=variant_data,
+            statistical_tests=statistical_tests,
+            confidence_intervals=confidence_intervals,
+            recommendations=recommendations,
+            created_at=datetime.now(),
+            total_users=len(metrics_df['user_id'].unique()),
+            duration_days=(datetime.now() - experiment.start_date).days if experiment.start_date else 0,
+            statistical_significance=any(test.get('significant', False) for test in statistical_tests.values()),
+            winner_variant=winner
+        )
+
+    def _analyze_variant_metrics(self, variant_df: pd.DataFrame,
+                                 metrics_config: List[Metric]) -> Dict[str, Any]:
+        """Analyze metrics for a single variant"""
+        if variant_df.empty:
+            return {}
+
+        results = {}
+        for metric in metrics_config:
+            metric_data = variant_df[variant_df['metric_name'] == metric.name]['value']
+
+            if metric_data.empty:
+                continue
+
+            if metric.type == "binary":
+                results[metric.name] = {
+                    "count": len(metric_data),
+                    "success_rate": metric_data.mean(),
+                    "std": metric_data.std(),
+                    "confidence_interval": self._binary_confidence_interval(metric_data)
+                }
+            elif metric.type == "continuous":
+                results[metric.name] = {
+                    "count": len(metric_data),
+                    "mean": metric_data.mean(),
+                    "std": metric_data.std(),
+                    "median": metric_data.median(),
+                    "confidence_interval": self._continuous_confidence_interval(metric_data)
+                }
+            elif metric.type == "count":
+                results[metric.name] = {
+                    "count": len(metric_data),
+                    "sum": metric_data.sum(),
+                    "mean": metric_data.mean(),
+                    "rate_per_user": metric_data.sum() / len(variant_df['user_id'].unique())
+                }
+
+        return results
+
+    def _compare_variants(self, metrics_df: pd.DataFrame, control_id: str,
+                          treatment_id: str, metrics_config: List[Metric]) -> tuple:
+        """Compare treatment variant against control"""
+        tests = {}
+        intervals = {}
+
+        for metric in metrics_config:
+            control_data = metrics_df[
+                (metrics_df['variant_id'] == control_id) &
+                (metrics_df['metric_name'] == metric.name)
+            ]['value']
+
+            treatment_data = metrics_df[
+                (metrics_df['variant_id'] == treatment_id) &
+                (metrics_df['metric_name'] == metric.name)
+            ]['value']
+
+            if control_data.empty or treatment_data.empty:
+                continue
+
+            if metric.type == "binary":
+                test_result = self._binary_test(control_data, treatment_data)
+            elif metric.type == "continuous":
+                test_result = self._continuous_test(control_data, treatment_data)
+            else:
+                test_result = self._count_test(control_data, treatment_data)
+
+            tests[metric.name] = test_result
+
+            # Calculate effect size confidence interval
+            if metric.type == "binary":
+                intervals[metric.name] = self._binary_effect_interval(control_data, treatment_data)
+            else:
+                intervals[metric.name] = self._continuous_effect_interval(control_data, treatment_data)
+
+        return tests, intervals
+
+    def _binary_test(self, control: pd.Series, treatment: pd.Series) -> Dict[str, Any]:
+        """Perform statistical test for binary metric"""
+        control_success = control.sum()
+        control_total = len(control)
+        treatment_success = treatment.sum()
+        treatment_total = len(treatment)
+
+        # Chi-square test
+        observed = [[control_success, control_total - control_success],
+                    [treatment_success, treatment_total - treatment_success]]
+
+        chi2, p_value, _, _ = stats.chi2_contingency(observed)
+
+        # Effect size (difference in rates)
+        control_rate = control_success / control_total
+        treatment_rate = treatment_success / treatment_total
+        effect_size = treatment_rate - control_rate
+
+        return {
+            "test_type": "chi_square",
+            "statistic": chi2,
+            "p_value": p_value,
+            "significant": p_value < self.significance_level,
+            "effect_size": effect_size,
+            "control_rate": control_rate,
+            "treatment_rate": treatment_rate
+        }
+
+    def _continuous_test(self, control: pd.Series, treatment: pd.Series) -> Dict[str, Any]:
+        """Perform statistical test for continuous metric"""
+        # Two-sample t-test
+        statistic, p_value = stats.ttest_ind(treatment, control)
+
+        # Effect size (Cohen's d)
+        pooled_std = np.sqrt(((len(control) - 1) * control.std()**2 +
+                              (len(treatment) - 1) * treatment.std()**2) /
+                             (len(control) + len(treatment) - 2))
+
+        cohens_d = (treatment.mean() - control.mean()) / pooled_std if pooled_std > 0 else 0
+
+        return {
+            "test_type": "t_test",
+            "statistic": statistic,
+            "p_value": p_value,
+            "significant": p_value < self.significance_level,
+            "effect_size": cohens_d,
+            "control_mean": control.mean(),
+            "treatment_mean": treatment.mean(),
+            "relative_change": (treatment.mean() - control.mean()) / control.mean() if control.mean() != 0 else 0
+        }
+
+    def _count_test(self, control: pd.Series, treatment: pd.Series) -> Dict[str, Any]:
+        """Perform statistical test for count metric"""
+        # Poisson test (approximated with normal for large samples)
+        control_sum = control.sum()
+        treatment_sum = treatment.sum()
+
+        # Rate comparison
+        control_rate = control_sum / len(control)
+        treatment_rate = treatment_sum / len(treatment)
+
+        # Use two-sample Poisson test approximation
+        if control_rate > 0 and treatment_rate > 0:
+            statistic, p_value = stats.ttest_ind(treatment, control)
+        else:
+            statistic, p_value = 0, 1
+
+        return {
+            "test_type": "poisson_approximation",
+            "statistic": statistic,
+            "p_value": p_value,
+            "significant": p_value < self.significance_level,
+            "control_rate": control_rate,
+            "treatment_rate": treatment_rate,
+            "rate_ratio": treatment_rate / control_rate if control_rate > 0 else float('inf')
+        }
+
+    def _binary_confidence_interval(self, data: pd.Series, confidence: float = 0.95) -> tuple:
+        """Calculate confidence interval for binary metric"""
+        n = len(data)
+        p = data.mean()
+        z = stats.norm.ppf(1 - (1 - confidence) / 2)
+        margin = z * np.sqrt(p * (1 - p) / n) if n > 0 else 0
+        return (max(0, p - margin), min(1, p + margin))
+
+    def _continuous_confidence_interval(self, data: pd.Series, confidence: float = 0.95) -> tuple:
+        """Calculate confidence interval for continuous metric"""
+        n = len(data)
+        mean = data.mean()
+        sem = data.std() / np.sqrt(n) if n > 0 else 0
+        t_value = stats.t.ppf(1 - (1 - confidence) / 2, n - 1) if n > 1 else 0
+        margin = t_value * sem
+        return (mean - margin, mean + margin)
+
+    def _binary_effect_interval(self, control: pd.Series, treatment: pd.Series) -> tuple:
+        """Calculate confidence interval for binary effect size"""
+        p1 = control.mean()
+        p2 = treatment.mean()
+        n1 = len(control)
+        n2 = len(treatment)
+
+        diff = p2 - p1
+        se = np.sqrt(p1 * (1 - p1) / n1 + p2 * (1 - p2) / n2) if n1 > 0 and n2 > 0 else 0
+        z = stats.norm.ppf(0.975)
+        margin = z * se
+
+        return (diff - margin, diff + margin)
+
+    def _continuous_effect_interval(self, control: pd.Series, treatment: pd.Series) -> tuple:
+        """Calculate confidence interval for continuous effect size"""
+        diff = treatment.mean() - control.mean()
+        n1 = len(control)
+        n2 = len(treatment)
+
+        if n1 > 1 and n2 > 1:
+            pooled_var = ((n1 - 1) * control.var() + (n2 - 1) * treatment.var()) / (n1 + n2 - 2)
+            se = np.sqrt(pooled_var * (1/n1 + 1/n2))
+            t_value = stats.t.ppf(0.975, n1 + n2 - 2)
+            margin = t_value * se
+        else:
+            margin = 0
+
+        return (diff - margin, diff + margin)
+
+    def _generate_recommendations(self, variant_data: Dict, statistical_tests: Dict,
+                                  metrics_config: List[Metric]) -> List[str]:
+        """Generate recommendations based on results"""
+        recommendations = []
+
+        primary_metrics = [m for m in metrics_config if m.primary]
+
+        for variant_id, tests in statistical_tests.items():
+            significant_improvements = []
+            significant_degradations = []
+
+            for metric_name, test in tests.items():
+                if test.get('significant', False):
+                    metric_config = next((m for m in metrics_config if m.name == metric_name), None)
+
+                    if metric_config:
+                        if metric_config.goal == "increase":
+                            if test.get('effect_size', 0) > 0:
+                                significant_improvements.append(metric_name)
+                            else:
+                                significant_degradations.append(metric_name)
+                        elif metric_config.goal == "decrease":
+                            if test.get('effect_size', 0) < 0:
+                                significant_improvements.append(metric_name)
+                            else:
+                                significant_degradations.append(metric_name)
+
+            if significant_improvements:
+                recommendations.append(
+                    f"Variant {variant_id} shows significant improvement in: {', '.join(significant_improvements)}"
+                )
+
+            if significant_degradations:
+                recommendations.append(
+                    f"Variant {variant_id} shows significant degradation in: {', '.join(significant_degradations)}"
+                )
+
+        if not any(test.get('significant', False) for tests in statistical_tests.values() for test in tests.values()):
+            recommendations.append("No statistically significant differences detected. Consider running experiment longer.")
+
+        return recommendations
+
+    def _determine_winner(self, statistical_tests: Dict, metrics_config: List[Metric]) -> Optional[str]:
+        """Determine winning variant based on primary metrics"""
+        primary_metrics = [m for m in metrics_config if m.primary]
+
+        if not primary_metrics:
+            return None
+
+        variant_scores = {}
+
+        for variant_id, tests in statistical_tests.items():
+            score = 0
+
+            for metric in primary_metrics:
+                test = tests.get(metric.name)
+                if test and test.get('significant', False):
+                    effect_size = test.get('effect_size', 0)
+
+                    if metric.goal == "increase" and effect_size > 0:
+                        score += 1
+                    elif metric.goal == "decrease" and effect_size < 0:
+                        score += 1
+                    else:
+                        score -= 1
+
+            variant_scores[variant_id] = score
+
+        if variant_scores:
+            winner = max(variant_scores.items(), key=lambda x: x[1])
+            return winner[0] if winner[1] > 0 else None
+
+        return None
+
+    def _empty_result(self, experiment_id: str) -> ExperimentResult:
+        """Return empty result for experiments with no data"""
+        return ExperimentResult(
+            experiment_id=experiment_id,
+            variant_results={},
+            statistical_tests={},
+            confidence_intervals={},
+            recommendations=["No data available for analysis"],
+            created_at=datetime.now()
+        )
+
+
+class ABTestingFramework:
+    """Main A/B testing framework orchestrator"""
+
+    def __init__(self, storage_path: Path = Path("experiments")):
+        self.storage_path = storage_path
+        self.storage_path.mkdir(parents=True, exist_ok=True)
+
+        self.traffic_splitter = TrafficSplitter()
+        self.metrics_collector = MetricsCollector(storage_path / "metrics")
+        self.analyzer = StatisticalAnalyzer()
+
+        self.experiments = {}
+        self.load_experiments()
+
+    def create_experiment(self, config: ExperimentConfig) -> str:
+        """Create new A/B test experiment"""
+        # Validate configuration
+        self._validate_experiment_config(config)
+
+        # Generate ID if not provided
+        if not config.id:
+            config.id = str(uuid.uuid4())
+
+        # Set start date if not provided
+        if not config.start_date:
+            config.start_date = datetime.now()
+
+        # Store experiment
+        self.experiments[config.id] = config
+        self.save_experiment(config)
+
+        logger.info(f"Created experiment: {config.name} ({config.id})")
+        return config.id
+
+    def start_experiment(self, experiment_id: str):
+        """Start an experiment"""
+        if experiment_id not in self.experiments:
+            raise ValueError(f"Experiment {experiment_id} not found")
+
+        experiment = self.experiments[experiment_id]
+        experiment.status = ExperimentStatus.RUNNING
+        experiment.start_date = datetime.now()
+
+        self.save_experiment(experiment)
+        logger.info(f"Started experiment: {experiment.name}")
+
+    def stop_experiment(self, experiment_id: str):
+        """Stop an experiment"""
+        if experiment_id not in self.experiments:
+            raise ValueError(f"Experiment {experiment_id} not found")
+
+        experiment = self.experiments[experiment_id]
+        experiment.status = ExperimentStatus.COMPLETED
+        experiment.end_date = datetime.now()
+
+        self.save_experiment(experiment)
+        logger.info(f"Stopped experiment: {experiment.name}")
+
+    def assign_user(self, user_id: str, experiment_id: str) -> str:
+        """Assign user to experiment variant"""
+        if experiment_id not in self.experiments:
+            return "control"
+
+        experiment = self.experiments[experiment_id]
+
+        # Check experiment status
+        if experiment.status != ExperimentStatus.RUNNING:
+            return "control"
+
+        # Check date range
+        now = datetime.now()
+        if experiment.start_date and now < experiment.start_date:
+            return "control"
+        if experiment.end_date and now > experiment.end_date:
+            return "control"
+
+        return self.traffic_splitter.assign_variant(user_id, experiment)
+
+    def record_metric(self, user_id: str, experiment_id: str, metric_name: str,
+                      value: Union[float, int, bool]):
+        """Record metric for user"""
+        # Get user's variant assignment
+        variant_id = self.traffic_splitter.get_assignment(user_id, experiment_id)
+        if not variant_id:
+            variant_id = self.assign_user(user_id, experiment_id)
+
+        # Record metric
+        self.metrics_collector.record_metric(
+            user_id, experiment_id, variant_id, metric_name, value
+        )
+
+    def analyze_experiment(self, experiment_id: str) -> ExperimentResult:
+        """Analyze experiment results"""
+        if experiment_id not in self.experiments:
+            raise ValueError(f"Experiment {experiment_id} not found")
+
+        experiment = self.experiments[experiment_id]
+        metrics_df = self.metrics_collector.get_experiment_metrics(experiment_id)
+
+        return self.analyzer.analyze_experiment(experiment, metrics_df)
+
+    def get_experiment_summary(self, experiment_id: str) -> Dict[str, Any]:
+        """Get experiment summary"""
+        if experiment_id not in self.experiments:
+            raise ValueError(f"Experiment {experiment_id} not found")
+
+        experiment = self.experiments[experiment_id]
+        metrics_df = self.metrics_collector.get_experiment_metrics(experiment_id)
+
+        summary = {
+            "experiment": asdict(experiment),
+            "total_users": len(metrics_df['user_id'].unique()) if not metrics_df.empty else 0,
+            "total_events": len(metrics_df) if not metrics_df.empty else 0,
+            "variant_distribution": metrics_df['variant_id'].value_counts().to_dict() if not metrics_df.empty else {}
+        }
+
+        return summary
+
+    def list_experiments(self) -> List[Dict[str, Any]]:
+        """List all experiments"""
+        return [
+            {
+                "id": exp.id,
+                "name": exp.name,
+                "status": exp.status.value,
+                "start_date": exp.start_date.isoformat() if exp.start_date else None,
+                "end_date": exp.end_date.isoformat() if exp.end_date else None,
+                "variants": len(exp.variants),
+                "metrics": len(exp.metrics)
+            }
+            for exp in self.experiments.values()
+        ]
+
+    def save_experiment(self, experiment: ExperimentConfig):
+        """Save experiment to storage"""
+        experiment_file = self.storage_path / f"experiment_{experiment.id}.json"
+
+        # Convert to dict and handle non-serializable types
+        experiment_dict = asdict(experiment)
+
+        # Convert datetime objects to ISO strings
+        if experiment_dict.get('start_date'):
+            experiment_dict['start_date'] = experiment.start_date.isoformat()
+        if experiment_dict.get('end_date'):
+            experiment_dict['end_date'] = experiment.end_date.isoformat()
+
+        # Convert enums to strings
+        experiment_dict['status'] = experiment.status.value
+        for variant in experiment_dict['variants']:
+            variant['type'] = variant['type'].value if hasattr(variant['type'], 'value') else variant['type']
+
+        with open(experiment_file, 'w') as f:
+            json.dump(experiment_dict, f, indent=2)
+
+    def load_experiments(self):
+        """Load experiments from storage"""
+        for experiment_file in self.storage_path.glob("experiment_*.json"):
+            try:
+                with open(experiment_file, 'r') as f:
+                    experiment_dict = json.load(f)
+
+                # Convert back from dict to objects
+                experiment = self._dict_to_experiment(experiment_dict)
+                self.experiments[experiment.id] = experiment
+
+            except Exception as e:
+                logger.error(f"Failed to load experiment from {experiment_file}: {e}")
+
+    def _dict_to_experiment(self, experiment_dict: Dict) -> ExperimentConfig:
+        """Convert dictionary back to ExperimentConfig"""
+        # Convert datetime strings back to objects
+        if experiment_dict.get('start_date'):
+            experiment_dict['start_date'] = datetime.fromisoformat(experiment_dict['start_date'])
+        if experiment_dict.get('end_date'):
+            experiment_dict['end_date'] = datetime.fromisoformat(experiment_dict['end_date'])
+
+        # Convert status string back to enum
+        experiment_dict['status'] = ExperimentStatus(experiment_dict['status'])
+
+        # Convert variants
+        variants = []
+        for variant_dict in experiment_dict['variants']:
+            variant_dict['type'] = VariantType(variant_dict['type'])
+            variants.append(Variant(**variant_dict))
+        experiment_dict['variants'] = variants
+
+        # Convert metrics
+        metrics = []
+        for metric_dict in experiment_dict['metrics']:
+            metrics.append(Metric(**metric_dict))
+        experiment_dict['metrics'] = metrics
+
+        return ExperimentConfig(**experiment_dict)
+
+    def _validate_experiment_config(self, config: ExperimentConfig):
+        """Validate experiment configuration"""
+        # Check traffic percentages sum to 100%
+        total_traffic = sum(v.traffic_percentage for v in config.variants)
+        if abs(total_traffic - 100.0) > 0.01:
+            raise ValueError(f"Variant traffic percentages must sum to 100%, got {total_traffic}")
+
+        # Check at least one control variant
+        control_variants = [v for v in config.variants if v.type == VariantType.CONTROL]
+        if not control_variants:
+            raise ValueError("At least one control variant is required")
+
+        # Check at least one primary metric
+        primary_metrics = [m for m in config.metrics if m.primary]
+        if not primary_metrics:
+            logger.warning("No primary metrics defined")
+
+
+# Example usage
+if __name__ == "__main__":
+    # Initialize framework
+    framework = ABTestingFramework(Path("experiments"))
+
+    # Create experiment configuration
+    config = ExperimentConfig(
+        id="model_comparison_v1",
+        name="Stock Recommendation Model A/B Test",
+        description="Compare ensemble model vs single model performance",
+        variants=[
+            Variant(
+                id="control",
+                name="Single Model",
+                type=VariantType.CONTROL,
+                traffic_percentage=50.0,
+                model_config={"model_type": "single_mlp"}
+            ),
+            Variant(
+                id="treatment",
+                name="Ensemble Model",
+                type=VariantType.TREATMENT,
+                traffic_percentage=50.0,
+                model_config={"model_type": "ensemble"}
+            )
+        ],
+        metrics=[
+            Metric(
+                name="prediction_accuracy",
+                type="continuous",
+                aggregation="mean",
+                goal="increase",
+                primary=True
+            ),
+            Metric(
+                name="recommendation_click_rate",
+                type="binary",
+                aggregation="mean",
+                goal="increase",
+                primary=True
+            ),
+            Metric(
+                name="portfolio_return",
+                type="continuous",
+                aggregation="mean",
+                goal="increase"
+            )
+        ],
+        min_sample_size=1000
+    )
+
+    # Create and start experiment
+    experiment_id = framework.create_experiment(config)
+    framework.start_experiment(experiment_id)
+
+    # Simulate user assignments and metrics
+    for i in range(100):
+        user_id = f"user_{i}"
+        variant = framework.assign_user(user_id, experiment_id)
+
+        # Simulate metrics
+        framework.record_metric(user_id, experiment_id, "prediction_accuracy", random.uniform(0.6, 0.9))
+        framework.record_metric(user_id, experiment_id, "recommendation_click_rate", random.choice([0, 1]))
+        framework.record_metric(user_id, experiment_id, "portfolio_return", random.uniform(-0.1, 0.15))
+
+    # Analyze results
+    results = framework.analyze_experiment(experiment_id)
+
+    print(f"Experiment Results:")
+    print(f"Total Users: {results.total_users}")
+    print(f"Statistical Significance: {results.statistical_significance}")
+    print(f"Winner: {results.winner_variant}")
+    print(f"Recommendations: {results.recommendations}")
+
+    logger.info("A/B testing framework demo completed")
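For reference, a minimal sketch of how a downstream consumer of this wheel might drive the module above. The import path is taken from the file listing; the experiment id, names, and metric values are illustrative placeholders, not part of the package. Note that, as written, MetricsCollector.record_metric only buffers records in memory and writes them to disk in batches of 1000, while analyze_experiment reads the persisted metrics_*.json files, so a small run appears to need an explicit flush_metrics() call before analysis.

from pathlib import Path
from mcli.ml.experimentation.ab_testing import (
    ABTestingFramework, ExperimentConfig, Variant, Metric, VariantType,
)

framework = ABTestingFramework(Path("experiments"))

# Hypothetical two-variant experiment; all field values here are illustrative.
config = ExperimentConfig(
    id="ranker_ab_v2",
    name="Ranker comparison",
    description="Illustrative experiment",
    variants=[
        Variant(id="control", name="Current ranker", type=VariantType.CONTROL, traffic_percentage=50.0),
        Variant(id="treatment", name="Candidate ranker", type=VariantType.TREATMENT, traffic_percentage=50.0),
    ],
    metrics=[Metric(name="click", type="binary", aggregation="mean", goal="increase", primary=True)],
)

experiment_id = framework.create_experiment(config)
framework.start_experiment(experiment_id)

# Assignment is deterministic (MD5 hash of "user_id:experiment_id"), so repeat
# calls for the same user return the same variant.
variant = framework.assign_user("user_42", experiment_id)
framework.record_metric("user_42", experiment_id, "click", 1)

# record_metric buffers in memory; analyze_experiment reads metrics_*.json from
# disk, so flush explicitly before analysing small volumes of data.
framework.metrics_collector.flush_metrics()
result = framework.analyze_experiment(experiment_id)
print(result.recommendations)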