mcli_framework-7.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mcli-framework might be problematic.
- mcli/app/chat_cmd.py +42 -0
- mcli/app/commands_cmd.py +226 -0
- mcli/app/completion_cmd.py +216 -0
- mcli/app/completion_helpers.py +288 -0
- mcli/app/cron_test_cmd.py +697 -0
- mcli/app/logs_cmd.py +419 -0
- mcli/app/main.py +492 -0
- mcli/app/model/model.py +1060 -0
- mcli/app/model_cmd.py +227 -0
- mcli/app/redis_cmd.py +269 -0
- mcli/app/video/video.py +1114 -0
- mcli/app/visual_cmd.py +303 -0
- mcli/chat/chat.py +2409 -0
- mcli/chat/command_rag.py +514 -0
- mcli/chat/enhanced_chat.py +652 -0
- mcli/chat/system_controller.py +1010 -0
- mcli/chat/system_integration.py +1016 -0
- mcli/cli.py +25 -0
- mcli/config.toml +20 -0
- mcli/lib/api/api.py +586 -0
- mcli/lib/api/daemon_client.py +203 -0
- mcli/lib/api/daemon_client_local.py +44 -0
- mcli/lib/api/daemon_decorator.py +217 -0
- mcli/lib/api/mcli_decorators.py +1032 -0
- mcli/lib/auth/auth.py +85 -0
- mcli/lib/auth/aws_manager.py +85 -0
- mcli/lib/auth/azure_manager.py +91 -0
- mcli/lib/auth/credential_manager.py +192 -0
- mcli/lib/auth/gcp_manager.py +93 -0
- mcli/lib/auth/key_manager.py +117 -0
- mcli/lib/auth/mcli_manager.py +93 -0
- mcli/lib/auth/token_manager.py +75 -0
- mcli/lib/auth/token_util.py +1011 -0
- mcli/lib/config/config.py +47 -0
- mcli/lib/discovery/__init__.py +1 -0
- mcli/lib/discovery/command_discovery.py +274 -0
- mcli/lib/erd/erd.py +1345 -0
- mcli/lib/erd/generate_graph.py +453 -0
- mcli/lib/files/files.py +76 -0
- mcli/lib/fs/fs.py +109 -0
- mcli/lib/lib.py +29 -0
- mcli/lib/logger/logger.py +611 -0
- mcli/lib/performance/optimizer.py +409 -0
- mcli/lib/performance/rust_bridge.py +502 -0
- mcli/lib/performance/uvloop_config.py +154 -0
- mcli/lib/pickles/pickles.py +50 -0
- mcli/lib/search/cached_vectorizer.py +479 -0
- mcli/lib/services/data_pipeline.py +460 -0
- mcli/lib/services/lsh_client.py +441 -0
- mcli/lib/services/redis_service.py +387 -0
- mcli/lib/shell/shell.py +137 -0
- mcli/lib/toml/toml.py +33 -0
- mcli/lib/ui/styling.py +47 -0
- mcli/lib/ui/visual_effects.py +634 -0
- mcli/lib/watcher/watcher.py +185 -0
- mcli/ml/api/app.py +215 -0
- mcli/ml/api/middleware.py +224 -0
- mcli/ml/api/routers/admin_router.py +12 -0
- mcli/ml/api/routers/auth_router.py +244 -0
- mcli/ml/api/routers/backtest_router.py +12 -0
- mcli/ml/api/routers/data_router.py +12 -0
- mcli/ml/api/routers/model_router.py +302 -0
- mcli/ml/api/routers/monitoring_router.py +12 -0
- mcli/ml/api/routers/portfolio_router.py +12 -0
- mcli/ml/api/routers/prediction_router.py +267 -0
- mcli/ml/api/routers/trade_router.py +12 -0
- mcli/ml/api/routers/websocket_router.py +76 -0
- mcli/ml/api/schemas.py +64 -0
- mcli/ml/auth/auth_manager.py +425 -0
- mcli/ml/auth/models.py +154 -0
- mcli/ml/auth/permissions.py +302 -0
- mcli/ml/backtesting/backtest_engine.py +502 -0
- mcli/ml/backtesting/performance_metrics.py +393 -0
- mcli/ml/cache.py +400 -0
- mcli/ml/cli/main.py +398 -0
- mcli/ml/config/settings.py +394 -0
- mcli/ml/configs/dvc_config.py +230 -0
- mcli/ml/configs/mlflow_config.py +131 -0
- mcli/ml/configs/mlops_manager.py +293 -0
- mcli/ml/dashboard/app.py +532 -0
- mcli/ml/dashboard/app_integrated.py +738 -0
- mcli/ml/dashboard/app_supabase.py +560 -0
- mcli/ml/dashboard/app_training.py +615 -0
- mcli/ml/dashboard/cli.py +51 -0
- mcli/ml/data_ingestion/api_connectors.py +501 -0
- mcli/ml/data_ingestion/data_pipeline.py +567 -0
- mcli/ml/data_ingestion/stream_processor.py +512 -0
- mcli/ml/database/migrations/env.py +94 -0
- mcli/ml/database/models.py +667 -0
- mcli/ml/database/session.py +200 -0
- mcli/ml/experimentation/ab_testing.py +845 -0
- mcli/ml/features/ensemble_features.py +607 -0
- mcli/ml/features/political_features.py +676 -0
- mcli/ml/features/recommendation_engine.py +809 -0
- mcli/ml/features/stock_features.py +573 -0
- mcli/ml/features/test_feature_engineering.py +346 -0
- mcli/ml/logging.py +85 -0
- mcli/ml/mlops/data_versioning.py +518 -0
- mcli/ml/mlops/experiment_tracker.py +377 -0
- mcli/ml/mlops/model_serving.py +481 -0
- mcli/ml/mlops/pipeline_orchestrator.py +614 -0
- mcli/ml/models/base_models.py +324 -0
- mcli/ml/models/ensemble_models.py +675 -0
- mcli/ml/models/recommendation_models.py +474 -0
- mcli/ml/models/test_models.py +487 -0
- mcli/ml/monitoring/drift_detection.py +676 -0
- mcli/ml/monitoring/metrics.py +45 -0
- mcli/ml/optimization/portfolio_optimizer.py +834 -0
- mcli/ml/preprocessing/data_cleaners.py +451 -0
- mcli/ml/preprocessing/feature_extractors.py +491 -0
- mcli/ml/preprocessing/ml_pipeline.py +382 -0
- mcli/ml/preprocessing/politician_trading_preprocessor.py +569 -0
- mcli/ml/preprocessing/test_preprocessing.py +294 -0
- mcli/ml/scripts/populate_sample_data.py +200 -0
- mcli/ml/tasks.py +400 -0
- mcli/ml/tests/test_integration.py +429 -0
- mcli/ml/tests/test_training_dashboard.py +387 -0
- mcli/public/oi/oi.py +15 -0
- mcli/public/public.py +4 -0
- mcli/self/self_cmd.py +1246 -0
- mcli/workflow/daemon/api_daemon.py +800 -0
- mcli/workflow/daemon/async_command_database.py +681 -0
- mcli/workflow/daemon/async_process_manager.py +591 -0
- mcli/workflow/daemon/client.py +530 -0
- mcli/workflow/daemon/commands.py +1196 -0
- mcli/workflow/daemon/daemon.py +905 -0
- mcli/workflow/daemon/daemon_api.py +59 -0
- mcli/workflow/daemon/enhanced_daemon.py +571 -0
- mcli/workflow/daemon/process_cli.py +244 -0
- mcli/workflow/daemon/process_manager.py +439 -0
- mcli/workflow/daemon/test_daemon.py +275 -0
- mcli/workflow/dashboard/dashboard_cmd.py +113 -0
- mcli/workflow/docker/docker.py +0 -0
- mcli/workflow/file/file.py +100 -0
- mcli/workflow/gcloud/config.toml +21 -0
- mcli/workflow/gcloud/gcloud.py +58 -0
- mcli/workflow/git_commit/ai_service.py +328 -0
- mcli/workflow/git_commit/commands.py +430 -0
- mcli/workflow/lsh_integration.py +355 -0
- mcli/workflow/model_service/client.py +594 -0
- mcli/workflow/model_service/download_and_run_efficient_models.py +288 -0
- mcli/workflow/model_service/lightweight_embedder.py +397 -0
- mcli/workflow/model_service/lightweight_model_server.py +714 -0
- mcli/workflow/model_service/lightweight_test.py +241 -0
- mcli/workflow/model_service/model_service.py +1955 -0
- mcli/workflow/model_service/ollama_efficient_runner.py +425 -0
- mcli/workflow/model_service/pdf_processor.py +386 -0
- mcli/workflow/model_service/test_efficient_runner.py +234 -0
- mcli/workflow/model_service/test_example.py +315 -0
- mcli/workflow/model_service/test_integration.py +131 -0
- mcli/workflow/model_service/test_new_features.py +149 -0
- mcli/workflow/openai/openai.py +99 -0
- mcli/workflow/politician_trading/commands.py +1790 -0
- mcli/workflow/politician_trading/config.py +134 -0
- mcli/workflow/politician_trading/connectivity.py +490 -0
- mcli/workflow/politician_trading/data_sources.py +395 -0
- mcli/workflow/politician_trading/database.py +410 -0
- mcli/workflow/politician_trading/demo.py +248 -0
- mcli/workflow/politician_trading/models.py +165 -0
- mcli/workflow/politician_trading/monitoring.py +413 -0
- mcli/workflow/politician_trading/scrapers.py +966 -0
- mcli/workflow/politician_trading/scrapers_california.py +412 -0
- mcli/workflow/politician_trading/scrapers_eu.py +377 -0
- mcli/workflow/politician_trading/scrapers_uk.py +350 -0
- mcli/workflow/politician_trading/scrapers_us_states.py +438 -0
- mcli/workflow/politician_trading/supabase_functions.py +354 -0
- mcli/workflow/politician_trading/workflow.py +852 -0
- mcli/workflow/registry/registry.py +180 -0
- mcli/workflow/repo/repo.py +223 -0
- mcli/workflow/scheduler/commands.py +493 -0
- mcli/workflow/scheduler/cron_parser.py +238 -0
- mcli/workflow/scheduler/job.py +182 -0
- mcli/workflow/scheduler/monitor.py +139 -0
- mcli/workflow/scheduler/persistence.py +324 -0
- mcli/workflow/scheduler/scheduler.py +679 -0
- mcli/workflow/sync/sync_cmd.py +437 -0
- mcli/workflow/sync/test_cmd.py +314 -0
- mcli/workflow/videos/videos.py +242 -0
- mcli/workflow/wakatime/wakatime.py +11 -0
- mcli/workflow/workflow.py +37 -0
- mcli_framework-7.0.0.dist-info/METADATA +479 -0
- mcli_framework-7.0.0.dist-info/RECORD +186 -0
- mcli_framework-7.0.0.dist-info/WHEEL +5 -0
- mcli_framework-7.0.0.dist-info/entry_points.txt +7 -0
- mcli_framework-7.0.0.dist-info/licenses/LICENSE +21 -0
- mcli_framework-7.0.0.dist-info/top_level.txt +1 -0
mcli/ml/experimentation/ab_testing.py
@@ -0,0 +1,845 @@
"""A/B Testing framework for ML model experiments"""

import asyncio
import hashlib
import json
import logging
import random
import uuid
from dataclasses import dataclass, field, asdict
from datetime import datetime, timedelta
from enum import Enum
from typing import Dict, Any, List, Optional, Union, Callable
import pandas as pd
import numpy as np
from scipy import stats
from pathlib import Path

logger = logging.getLogger(__name__)


class ExperimentStatus(Enum):
    DRAFT = "draft"
    RUNNING = "running"
    PAUSED = "paused"
    COMPLETED = "completed"
    CANCELLED = "cancelled"


class VariantType(Enum):
    CONTROL = "control"
    TREATMENT = "treatment"


@dataclass
class Variant:
    """A/B test variant configuration"""
    id: str
    name: str
    type: VariantType
    traffic_percentage: float
    model_config: Dict[str, Any] = field(default_factory=dict)
    feature_flags: Dict[str, Any] = field(default_factory=dict)
    description: str = ""


@dataclass
class Metric:
    """A/B test metric definition"""
    name: str
    type: str  # "binary", "continuous", "count"
    aggregation: str  # "mean", "sum", "count", "rate"
    goal: str  # "increase", "decrease", "maintain"
    statistical_power: float = 0.8
    min_detectable_effect: float = 0.05
    primary: bool = False


@dataclass
class ExperimentConfig:
    """A/B test experiment configuration"""
    id: str
    name: str
    description: str
    variants: List[Variant]
    metrics: List[Metric]

    # Traffic configuration
    traffic_percentage: float = 100.0  # Percentage of traffic to include

    # Duration configuration
    start_date: Optional[datetime] = None
    end_date: Optional[datetime] = None
    min_duration_days: int = 7
    max_duration_days: int = 30

    # Statistical configuration
    significance_level: float = 0.05
    statistical_power: float = 0.8
    min_sample_size: int = 1000

    # Guardrail metrics
    guardrail_metrics: List[str] = field(default_factory=list)

    # Feature flags
    feature_flags: Dict[str, Any] = field(default_factory=dict)

    status: ExperimentStatus = ExperimentStatus.DRAFT


@dataclass
class UserAssignment:
    """User assignment to experiment variant"""
    user_id: str
    experiment_id: str
    variant_id: str
    assigned_at: datetime
    metadata: Dict[str, Any] = field(default_factory=dict)


@dataclass
class ExperimentResult:
    """Results of an A/B test experiment"""
    experiment_id: str
    variant_results: Dict[str, Dict[str, Any]]
    statistical_tests: Dict[str, Dict[str, Any]]
    confidence_intervals: Dict[str, Dict[str, tuple]]
    recommendations: List[str]
    created_at: datetime

    # Overall experiment stats
    total_users: int = 0
    duration_days: int = 0
    statistical_significance: bool = False
    winner_variant: Optional[str] = None


class TrafficSplitter:
    """Handle traffic splitting for A/B tests"""

    def __init__(self):
        self.assignments = {}

    def assign_variant(self, user_id: str, experiment: ExperimentConfig) -> str:
        """Assign user to experiment variant"""
        # Check if user already assigned
        cache_key = f"{user_id}:{experiment.id}"
        if cache_key in self.assignments:
            return self.assignments[cache_key]

        # Hash user ID for consistent assignment
        hash_input = f"{user_id}:{experiment.id}".encode()
        hash_value = int(hashlib.md5(hash_input).hexdigest(), 16)
        hash_ratio = (hash_value % 10000) / 10000.0

        # Check if user should be included in experiment
        if hash_ratio * 100 > experiment.traffic_percentage:
            return "control"  # Not in experiment

        # Assign to variant based on traffic split
        cumulative_percentage = 0
        for variant in experiment.variants:
            cumulative_percentage += variant.traffic_percentage
            if hash_ratio * 100 <= cumulative_percentage:
                self.assignments[cache_key] = variant.id
                return variant.id

        # Default to control
        control_variant = next((v for v in experiment.variants if v.type == VariantType.CONTROL), experiment.variants[0])
        self.assignments[cache_key] = control_variant.id
        return control_variant.id

    def get_assignment(self, user_id: str, experiment_id: str) -> Optional[str]:
        """Get existing assignment"""
        cache_key = f"{user_id}:{experiment_id}"
        return self.assignments.get(cache_key)


class MetricsCollector:
    """Collect and store experiment metrics"""

    def __init__(self, storage_path: Path = Path("experiments/metrics")):
        self.storage_path = storage_path
        self.storage_path.mkdir(parents=True, exist_ok=True)
        self.metrics_buffer = []

    def record_metric(self, user_id: str, experiment_id: str, variant_id: str,
                      metric_name: str, value: Union[float, int, bool],
                      timestamp: Optional[datetime] = None):
        """Record a metric value for a user"""
        if timestamp is None:
            timestamp = datetime.now()

        metric_record = {
            "user_id": user_id,
            "experiment_id": experiment_id,
            "variant_id": variant_id,
            "metric_name": metric_name,
            "value": value,
            "timestamp": timestamp.isoformat()
        }

        self.metrics_buffer.append(metric_record)

        # Flush buffer if it gets too large
        if len(self.metrics_buffer) >= 1000:
            self.flush_metrics()

    def flush_metrics(self):
        """Flush metrics buffer to storage"""
        if not self.metrics_buffer:
            return

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = self.storage_path / f"metrics_{timestamp}.json"

        with open(filename, 'w') as f:
            json.dump(self.metrics_buffer, f, indent=2)

        logger.info(f"Flushed {len(self.metrics_buffer)} metrics to {filename}")
        self.metrics_buffer.clear()

    def get_experiment_metrics(self, experiment_id: str) -> pd.DataFrame:
        """Get all metrics for an experiment"""
        all_metrics = []

        # Load from all metric files
        for file_path in self.storage_path.glob("metrics_*.json"):
            with open(file_path, 'r') as f:
                metrics = json.load(f)
                experiment_metrics = [m for m in metrics if m["experiment_id"] == experiment_id]
                all_metrics.extend(experiment_metrics)

        if not all_metrics:
            return pd.DataFrame()

        df = pd.DataFrame(all_metrics)
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        return df


class StatisticalAnalyzer:
    """Perform statistical analysis on A/B test results"""

    def __init__(self, significance_level: float = 0.05):
        self.significance_level = significance_level

    def analyze_experiment(self, experiment: ExperimentConfig,
                           metrics_df: pd.DataFrame) -> ExperimentResult:
        """Analyze experiment results"""
        if metrics_df.empty:
            return self._empty_result(experiment.id)

        # Group metrics by variant
        variant_data = {}
        for variant in experiment.variants:
            variant_metrics = metrics_df[metrics_df['variant_id'] == variant.id]
            variant_data[variant.id] = self._analyze_variant_metrics(variant_metrics, experiment.metrics)

        # Perform statistical tests
        statistical_tests = {}
        confidence_intervals = {}

        control_variant = next((v for v in experiment.variants if v.type == VariantType.CONTROL), None)
        if control_variant:
            for variant in experiment.variants:
                if variant.type == VariantType.TREATMENT:
                    tests, intervals = self._compare_variants(
                        metrics_df, control_variant.id, variant.id, experiment.metrics
                    )
                    statistical_tests[variant.id] = tests
                    confidence_intervals[variant.id] = intervals

        # Generate recommendations
        recommendations = self._generate_recommendations(
            variant_data, statistical_tests, experiment.metrics
        )

        # Determine winner
        winner = self._determine_winner(statistical_tests, experiment.metrics)

        return ExperimentResult(
            experiment_id=experiment.id,
            variant_results=variant_data,
            statistical_tests=statistical_tests,
            confidence_intervals=confidence_intervals,
            recommendations=recommendations,
            created_at=datetime.now(),
            total_users=len(metrics_df['user_id'].unique()),
            duration_days=(datetime.now() - experiment.start_date).days if experiment.start_date else 0,
            statistical_significance=any(test.get('significant', False) for test in statistical_tests.values()),
            winner_variant=winner
        )

    def _analyze_variant_metrics(self, variant_df: pd.DataFrame,
                                 metrics_config: List[Metric]) -> Dict[str, Any]:
        """Analyze metrics for a single variant"""
        if variant_df.empty:
            return {}

        results = {}
        for metric in metrics_config:
            metric_data = variant_df[variant_df['metric_name'] == metric.name]['value']

            if metric_data.empty:
                continue

            if metric.type == "binary":
                results[metric.name] = {
                    "count": len(metric_data),
                    "success_rate": metric_data.mean(),
                    "std": metric_data.std(),
                    "confidence_interval": self._binary_confidence_interval(metric_data)
                }
            elif metric.type == "continuous":
                results[metric.name] = {
                    "count": len(metric_data),
                    "mean": metric_data.mean(),
                    "std": metric_data.std(),
                    "median": metric_data.median(),
                    "confidence_interval": self._continuous_confidence_interval(metric_data)
                }
            elif metric.type == "count":
                results[metric.name] = {
                    "count": len(metric_data),
                    "sum": metric_data.sum(),
                    "mean": metric_data.mean(),
                    "rate_per_user": metric_data.sum() / len(variant_df['user_id'].unique())
                }

        return results

    def _compare_variants(self, metrics_df: pd.DataFrame, control_id: str,
                          treatment_id: str, metrics_config: List[Metric]) -> tuple:
        """Compare treatment variant against control"""
        tests = {}
        intervals = {}

        for metric in metrics_config:
            control_data = metrics_df[
                (metrics_df['variant_id'] == control_id) &
                (metrics_df['metric_name'] == metric.name)
            ]['value']

            treatment_data = metrics_df[
                (metrics_df['variant_id'] == treatment_id) &
                (metrics_df['metric_name'] == metric.name)
            ]['value']

            if control_data.empty or treatment_data.empty:
                continue

            if metric.type == "binary":
                test_result = self._binary_test(control_data, treatment_data)
            elif metric.type == "continuous":
                test_result = self._continuous_test(control_data, treatment_data)
            else:
                test_result = self._count_test(control_data, treatment_data)

            tests[metric.name] = test_result

            # Calculate effect size confidence interval
            if metric.type == "binary":
                intervals[metric.name] = self._binary_effect_interval(control_data, treatment_data)
            else:
                intervals[metric.name] = self._continuous_effect_interval(control_data, treatment_data)

        return tests, intervals

    def _binary_test(self, control: pd.Series, treatment: pd.Series) -> Dict[str, Any]:
        """Perform statistical test for binary metric"""
        control_success = control.sum()
        control_total = len(control)
        treatment_success = treatment.sum()
        treatment_total = len(treatment)

        # Chi-square test
        observed = [[control_success, control_total - control_success],
                    [treatment_success, treatment_total - treatment_success]]

        chi2, p_value, _, _ = stats.chi2_contingency(observed)

        # Effect size (difference in rates)
        control_rate = control_success / control_total
        treatment_rate = treatment_success / treatment_total
        effect_size = treatment_rate - control_rate

        return {
            "test_type": "chi_square",
            "statistic": chi2,
            "p_value": p_value,
            "significant": p_value < self.significance_level,
            "effect_size": effect_size,
            "control_rate": control_rate,
            "treatment_rate": treatment_rate
        }

    def _continuous_test(self, control: pd.Series, treatment: pd.Series) -> Dict[str, Any]:
        """Perform statistical test for continuous metric"""
        # Two-sample t-test
        statistic, p_value = stats.ttest_ind(treatment, control)

        # Effect size (Cohen's d)
        pooled_std = np.sqrt(((len(control) - 1) * control.std()**2 +
                              (len(treatment) - 1) * treatment.std()**2) /
                             (len(control) + len(treatment) - 2))

        cohens_d = (treatment.mean() - control.mean()) / pooled_std if pooled_std > 0 else 0

        return {
            "test_type": "t_test",
            "statistic": statistic,
            "p_value": p_value,
            "significant": p_value < self.significance_level,
            "effect_size": cohens_d,
            "control_mean": control.mean(),
            "treatment_mean": treatment.mean(),
            "relative_change": (treatment.mean() - control.mean()) / control.mean() if control.mean() != 0 else 0
        }

    def _count_test(self, control: pd.Series, treatment: pd.Series) -> Dict[str, Any]:
        """Perform statistical test for count metric"""
        # Poisson test (approximated with normal for large samples)
        control_sum = control.sum()
        treatment_sum = treatment.sum()

        # Rate comparison
        control_rate = control_sum / len(control)
        treatment_rate = treatment_sum / len(treatment)

        # Use two-sample Poisson test approximation
        if control_rate > 0 and treatment_rate > 0:
            statistic, p_value = stats.ttest_ind(treatment, control)
        else:
            statistic, p_value = 0, 1

        return {
            "test_type": "poisson_approximation",
            "statistic": statistic,
            "p_value": p_value,
            "significant": p_value < self.significance_level,
            "control_rate": control_rate,
            "treatment_rate": treatment_rate,
            "rate_ratio": treatment_rate / control_rate if control_rate > 0 else float('inf')
        }

    def _binary_confidence_interval(self, data: pd.Series, confidence: float = 0.95) -> tuple:
        """Calculate confidence interval for binary metric"""
        n = len(data)
        p = data.mean()
        z = stats.norm.ppf(1 - (1 - confidence) / 2)
        margin = z * np.sqrt(p * (1 - p) / n) if n > 0 else 0
        return (max(0, p - margin), min(1, p + margin))

    def _continuous_confidence_interval(self, data: pd.Series, confidence: float = 0.95) -> tuple:
        """Calculate confidence interval for continuous metric"""
        n = len(data)
        mean = data.mean()
        sem = data.std() / np.sqrt(n) if n > 0 else 0
        t_value = stats.t.ppf(1 - (1 - confidence) / 2, n - 1) if n > 1 else 0
        margin = t_value * sem
        return (mean - margin, mean + margin)

    def _binary_effect_interval(self, control: pd.Series, treatment: pd.Series) -> tuple:
        """Calculate confidence interval for binary effect size"""
        p1 = control.mean()
        p2 = treatment.mean()
        n1 = len(control)
        n2 = len(treatment)

        diff = p2 - p1
        se = np.sqrt(p1 * (1 - p1) / n1 + p2 * (1 - p2) / n2) if n1 > 0 and n2 > 0 else 0
        z = stats.norm.ppf(0.975)
        margin = z * se

        return (diff - margin, diff + margin)

    def _continuous_effect_interval(self, control: pd.Series, treatment: pd.Series) -> tuple:
        """Calculate confidence interval for continuous effect size"""
        diff = treatment.mean() - control.mean()
        n1 = len(control)
        n2 = len(treatment)

        if n1 > 1 and n2 > 1:
            pooled_var = ((n1 - 1) * control.var() + (n2 - 1) * treatment.var()) / (n1 + n2 - 2)
            se = np.sqrt(pooled_var * (1/n1 + 1/n2))
            t_value = stats.t.ppf(0.975, n1 + n2 - 2)
            margin = t_value * se
        else:
            margin = 0

        return (diff - margin, diff + margin)

    def _generate_recommendations(self, variant_data: Dict, statistical_tests: Dict,
                                  metrics_config: List[Metric]) -> List[str]:
        """Generate recommendations based on results"""
        recommendations = []

        primary_metrics = [m for m in metrics_config if m.primary]

        for variant_id, tests in statistical_tests.items():
            significant_improvements = []
            significant_degradations = []

            for metric_name, test in tests.items():
                if test.get('significant', False):
                    metric_config = next((m for m in metrics_config if m.name == metric_name), None)

                    if metric_config:
                        if metric_config.goal == "increase":
                            if test.get('effect_size', 0) > 0:
                                significant_improvements.append(metric_name)
                            else:
                                significant_degradations.append(metric_name)
                        elif metric_config.goal == "decrease":
                            if test.get('effect_size', 0) < 0:
                                significant_improvements.append(metric_name)
                            else:
                                significant_degradations.append(metric_name)

            if significant_improvements:
                recommendations.append(
                    f"Variant {variant_id} shows significant improvement in: {', '.join(significant_improvements)}"
                )

            if significant_degradations:
                recommendations.append(
                    f"Variant {variant_id} shows significant degradation in: {', '.join(significant_degradations)}"
                )

        if not any(test.get('significant', False) for tests in statistical_tests.values() for test in tests.values()):
            recommendations.append("No statistically significant differences detected. Consider running experiment longer.")

        return recommendations

    def _determine_winner(self, statistical_tests: Dict, metrics_config: List[Metric]) -> Optional[str]:
        """Determine winning variant based on primary metrics"""
        primary_metrics = [m for m in metrics_config if m.primary]

        if not primary_metrics:
            return None

        variant_scores = {}

        for variant_id, tests in statistical_tests.items():
            score = 0

            for metric in primary_metrics:
                test = tests.get(metric.name)
                if test and test.get('significant', False):
                    effect_size = test.get('effect_size', 0)

                    if metric.goal == "increase" and effect_size > 0:
                        score += 1
                    elif metric.goal == "decrease" and effect_size < 0:
                        score += 1
                    else:
                        score -= 1

            variant_scores[variant_id] = score

        if variant_scores:
            winner = max(variant_scores.items(), key=lambda x: x[1])
            return winner[0] if winner[1] > 0 else None

        return None

    def _empty_result(self, experiment_id: str) -> ExperimentResult:
        """Return empty result for experiments with no data"""
        return ExperimentResult(
            experiment_id=experiment_id,
            variant_results={},
            statistical_tests={},
            confidence_intervals={},
            recommendations=["No data available for analysis"],
            created_at=datetime.now()
        )


class ABTestingFramework:
    """Main A/B testing framework orchestrator"""

    def __init__(self, storage_path: Path = Path("experiments")):
        self.storage_path = storage_path
        self.storage_path.mkdir(parents=True, exist_ok=True)

        self.traffic_splitter = TrafficSplitter()
        self.metrics_collector = MetricsCollector(storage_path / "metrics")
        self.analyzer = StatisticalAnalyzer()

        self.experiments = {}
        self.load_experiments()

    def create_experiment(self, config: ExperimentConfig) -> str:
        """Create new A/B test experiment"""
        # Validate configuration
        self._validate_experiment_config(config)

        # Generate ID if not provided
        if not config.id:
            config.id = str(uuid.uuid4())

        # Set start date if not provided
        if not config.start_date:
            config.start_date = datetime.now()

        # Store experiment
        self.experiments[config.id] = config
        self.save_experiment(config)

        logger.info(f"Created experiment: {config.name} ({config.id})")
        return config.id

    def start_experiment(self, experiment_id: str):
        """Start an experiment"""
        if experiment_id not in self.experiments:
            raise ValueError(f"Experiment {experiment_id} not found")

        experiment = self.experiments[experiment_id]
        experiment.status = ExperimentStatus.RUNNING
        experiment.start_date = datetime.now()

        self.save_experiment(experiment)
        logger.info(f"Started experiment: {experiment.name}")

    def stop_experiment(self, experiment_id: str):
        """Stop an experiment"""
        if experiment_id not in self.experiments:
            raise ValueError(f"Experiment {experiment_id} not found")

        experiment = self.experiments[experiment_id]
        experiment.status = ExperimentStatus.COMPLETED
        experiment.end_date = datetime.now()

        self.save_experiment(experiment)
        logger.info(f"Stopped experiment: {experiment.name}")

    def assign_user(self, user_id: str, experiment_id: str) -> str:
        """Assign user to experiment variant"""
        if experiment_id not in self.experiments:
            return "control"

        experiment = self.experiments[experiment_id]

        # Check experiment status
        if experiment.status != ExperimentStatus.RUNNING:
            return "control"

        # Check date range
        now = datetime.now()
        if experiment.start_date and now < experiment.start_date:
            return "control"
        if experiment.end_date and now > experiment.end_date:
            return "control"

        return self.traffic_splitter.assign_variant(user_id, experiment)

    def record_metric(self, user_id: str, experiment_id: str, metric_name: str,
                      value: Union[float, int, bool]):
        """Record metric for user"""
        # Get user's variant assignment
        variant_id = self.traffic_splitter.get_assignment(user_id, experiment_id)
        if not variant_id:
            variant_id = self.assign_user(user_id, experiment_id)

        # Record metric
        self.metrics_collector.record_metric(
            user_id, experiment_id, variant_id, metric_name, value
        )

    def analyze_experiment(self, experiment_id: str) -> ExperimentResult:
        """Analyze experiment results"""
        if experiment_id not in self.experiments:
            raise ValueError(f"Experiment {experiment_id} not found")

        experiment = self.experiments[experiment_id]
        metrics_df = self.metrics_collector.get_experiment_metrics(experiment_id)

        return self.analyzer.analyze_experiment(experiment, metrics_df)

    def get_experiment_summary(self, experiment_id: str) -> Dict[str, Any]:
        """Get experiment summary"""
        if experiment_id not in self.experiments:
            raise ValueError(f"Experiment {experiment_id} not found")

        experiment = self.experiments[experiment_id]
        metrics_df = self.metrics_collector.get_experiment_metrics(experiment_id)

        summary = {
            "experiment": asdict(experiment),
            "total_users": len(metrics_df['user_id'].unique()) if not metrics_df.empty else 0,
            "total_events": len(metrics_df) if not metrics_df.empty else 0,
            "variant_distribution": metrics_df['variant_id'].value_counts().to_dict() if not metrics_df.empty else {}
        }

        return summary

    def list_experiments(self) -> List[Dict[str, Any]]:
        """List all experiments"""
        return [
            {
                "id": exp.id,
                "name": exp.name,
                "status": exp.status.value,
                "start_date": exp.start_date.isoformat() if exp.start_date else None,
                "end_date": exp.end_date.isoformat() if exp.end_date else None,
                "variants": len(exp.variants),
                "metrics": len(exp.metrics)
            }
            for exp in self.experiments.values()
        ]

    def save_experiment(self, experiment: ExperimentConfig):
        """Save experiment to storage"""
        experiment_file = self.storage_path / f"experiment_{experiment.id}.json"

        # Convert to dict and handle non-serializable types
        experiment_dict = asdict(experiment)

        # Convert datetime objects to ISO strings
        if experiment_dict.get('start_date'):
            experiment_dict['start_date'] = experiment.start_date.isoformat()
        if experiment_dict.get('end_date'):
            experiment_dict['end_date'] = experiment.end_date.isoformat()

        # Convert enums to strings
        experiment_dict['status'] = experiment.status.value
        for variant in experiment_dict['variants']:
            variant['type'] = variant['type'].value if hasattr(variant['type'], 'value') else variant['type']

        with open(experiment_file, 'w') as f:
            json.dump(experiment_dict, f, indent=2)

    def load_experiments(self):
        """Load experiments from storage"""
        for experiment_file in self.storage_path.glob("experiment_*.json"):
            try:
                with open(experiment_file, 'r') as f:
                    experiment_dict = json.load(f)

                # Convert back from dict to objects
                experiment = self._dict_to_experiment(experiment_dict)
                self.experiments[experiment.id] = experiment

            except Exception as e:
                logger.error(f"Failed to load experiment from {experiment_file}: {e}")

    def _dict_to_experiment(self, experiment_dict: Dict) -> ExperimentConfig:
        """Convert dictionary back to ExperimentConfig"""
        # Convert datetime strings back to objects
        if experiment_dict.get('start_date'):
            experiment_dict['start_date'] = datetime.fromisoformat(experiment_dict['start_date'])
        if experiment_dict.get('end_date'):
            experiment_dict['end_date'] = datetime.fromisoformat(experiment_dict['end_date'])

        # Convert status string back to enum
        experiment_dict['status'] = ExperimentStatus(experiment_dict['status'])

        # Convert variants
        variants = []
        for variant_dict in experiment_dict['variants']:
            variant_dict['type'] = VariantType(variant_dict['type'])
            variants.append(Variant(**variant_dict))
        experiment_dict['variants'] = variants

        # Convert metrics
        metrics = []
        for metric_dict in experiment_dict['metrics']:
            metrics.append(Metric(**metric_dict))
        experiment_dict['metrics'] = metrics

        return ExperimentConfig(**experiment_dict)

    def _validate_experiment_config(self, config: ExperimentConfig):
        """Validate experiment configuration"""
        # Check traffic percentages sum to 100%
        total_traffic = sum(v.traffic_percentage for v in config.variants)
        if abs(total_traffic - 100.0) > 0.01:
            raise ValueError(f"Variant traffic percentages must sum to 100%, got {total_traffic}")

        # Check at least one control variant
        control_variants = [v for v in config.variants if v.type == VariantType.CONTROL]
        if not control_variants:
            raise ValueError("At least one control variant is required")

        # Check at least one primary metric
        primary_metrics = [m for m in config.metrics if m.primary]
        if not primary_metrics:
            logger.warning("No primary metrics defined")


# Example usage
if __name__ == "__main__":
    # Initialize framework
    framework = ABTestingFramework(Path("experiments"))

    # Create experiment configuration
    config = ExperimentConfig(
        id="model_comparison_v1",
        name="Stock Recommendation Model A/B Test",
        description="Compare ensemble model vs single model performance",
        variants=[
            Variant(
                id="control",
                name="Single Model",
                type=VariantType.CONTROL,
                traffic_percentage=50.0,
                model_config={"model_type": "single_mlp"}
            ),
            Variant(
                id="treatment",
                name="Ensemble Model",
                type=VariantType.TREATMENT,
                traffic_percentage=50.0,
                model_config={"model_type": "ensemble"}
            )
        ],
        metrics=[
            Metric(
                name="prediction_accuracy",
                type="continuous",
                aggregation="mean",
                goal="increase",
                primary=True
            ),
            Metric(
                name="recommendation_click_rate",
                type="binary",
                aggregation="mean",
                goal="increase",
                primary=True
            ),
            Metric(
                name="portfolio_return",
                type="continuous",
                aggregation="mean",
                goal="increase"
            )
        ],
        min_sample_size=1000
    )

    # Create and start experiment
    experiment_id = framework.create_experiment(config)
    framework.start_experiment(experiment_id)

    # Simulate user assignments and metrics
    for i in range(100):
        user_id = f"user_{i}"
        variant = framework.assign_user(user_id, experiment_id)

        # Simulate metrics
        framework.record_metric(user_id, experiment_id, "prediction_accuracy", random.uniform(0.6, 0.9))
        framework.record_metric(user_id, experiment_id, "recommendation_click_rate", random.choice([0, 1]))
        framework.record_metric(user_id, experiment_id, "portfolio_return", random.uniform(-0.1, 0.15))

    # Analyze results
    results = framework.analyze_experiment(experiment_id)

    print(f"Experiment Results:")
    print(f"Total Users: {results.total_users}")
    print(f"Statistical Significance: {results.statistical_significance}")
    print(f"Winner: {results.winner_variant}")
    print(f"Recommendations: {results.recommendations}")

    logger.info("A/B testing framework demo completed")