empathy-framework 4.6.2__py3-none-any.whl → 4.6.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. {empathy_framework-4.6.2.dist-info → empathy_framework-4.6.3.dist-info}/METADATA +1 -1
  2. {empathy_framework-4.6.2.dist-info → empathy_framework-4.6.3.dist-info}/RECORD +53 -20
  3. {empathy_framework-4.6.2.dist-info → empathy_framework-4.6.3.dist-info}/WHEEL +1 -1
  4. empathy_os/__init__.py +1 -1
  5. empathy_os/cli.py +361 -32
  6. empathy_os/config/xml_config.py +8 -3
  7. empathy_os/core.py +37 -4
  8. empathy_os/leverage_points.py +2 -1
  9. empathy_os/memory/short_term.py +45 -1
  10. empathy_os/meta_workflows/agent_creator 2.py +254 -0
  11. empathy_os/meta_workflows/builtin_templates 2.py +567 -0
  12. empathy_os/meta_workflows/cli_meta_workflows 2.py +1551 -0
  13. empathy_os/meta_workflows/form_engine 2.py +304 -0
  14. empathy_os/meta_workflows/intent_detector 2.py +298 -0
  15. empathy_os/meta_workflows/pattern_learner 2.py +754 -0
  16. empathy_os/meta_workflows/session_context 2.py +398 -0
  17. empathy_os/meta_workflows/template_registry 2.py +229 -0
  18. empathy_os/meta_workflows/workflow 2.py +980 -0
  19. empathy_os/models/token_estimator.py +16 -9
  20. empathy_os/models/validation.py +7 -1
  21. empathy_os/orchestration/pattern_learner 2.py +699 -0
  22. empathy_os/orchestration/real_tools 2.py +938 -0
  23. empathy_os/orchestration/real_tools.py +4 -2
  24. empathy_os/socratic/__init__ 2.py +273 -0
  25. empathy_os/socratic/ab_testing 2.py +969 -0
  26. empathy_os/socratic/blueprint 2.py +532 -0
  27. empathy_os/socratic/cli 2.py +689 -0
  28. empathy_os/socratic/collaboration 2.py +1112 -0
  29. empathy_os/socratic/domain_templates 2.py +916 -0
  30. empathy_os/socratic/embeddings 2.py +734 -0
  31. empathy_os/socratic/engine 2.py +729 -0
  32. empathy_os/socratic/explainer 2.py +663 -0
  33. empathy_os/socratic/feedback 2.py +767 -0
  34. empathy_os/socratic/forms 2.py +624 -0
  35. empathy_os/socratic/generator 2.py +716 -0
  36. empathy_os/socratic/llm_analyzer 2.py +635 -0
  37. empathy_os/socratic/mcp_server 2.py +751 -0
  38. empathy_os/socratic/session 2.py +306 -0
  39. empathy_os/socratic/storage 2.py +635 -0
  40. empathy_os/socratic/storage.py +2 -1
  41. empathy_os/socratic/success 2.py +719 -0
  42. empathy_os/socratic/visual_editor 2.py +812 -0
  43. empathy_os/socratic/web_ui 2.py +925 -0
  44. empathy_os/tier_recommender.py +5 -2
  45. empathy_os/workflow_commands.py +11 -6
  46. empathy_os/workflows/base.py +1 -1
  47. empathy_os/workflows/batch_processing 2.py +310 -0
  48. empathy_os/workflows/release_prep_crew 2.py +968 -0
  49. empathy_os/workflows/test_coverage_boost_crew 2.py +848 -0
  50. empathy_os/workflows/test_maintenance.py +3 -2
  51. {empathy_framework-4.6.2.dist-info → empathy_framework-4.6.3.dist-info}/entry_points.txt +0 -0
  52. {empathy_framework-4.6.2.dist-info → empathy_framework-4.6.3.dist-info}/licenses/LICENSE +0 -0
  53. {empathy_framework-4.6.2.dist-info → empathy_framework-4.6.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,969 @@
1
+ """A/B Testing for Workflow Optimization
2
+
3
+ Enables controlled experiments to compare different workflow configurations
4
+ and determine which performs better for specific goals or domains.
5
+
6
+ Key Features:
7
+ - Experiment definition with control and variant groups
8
+ - Statistical significance testing
9
+ - Automatic traffic allocation
10
+ - Multi-armed bandit for adaptive optimization
11
+ - Integration with feedback loop
12
+
13
+ Copyright 2026 Smart-AI-Memory
14
+ Licensed under Fair Source License 0.9
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import hashlib
20
+ import json
21
+ import logging
22
+ import math
23
+ import random
24
+ import time
25
+ from dataclasses import dataclass, field
26
+ from datetime import datetime
27
+ from enum import Enum
28
+ from pathlib import Path
29
+ from typing import Any
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+
34
+ # =============================================================================
35
+ # DATA STRUCTURES
36
+ # =============================================================================
37
+
38
+
39
class ExperimentStatus(Enum):
    """Lifecycle states an A/B experiment moves through."""

    DRAFT = "draft"          # created but not yet started
    RUNNING = "running"      # actively allocating traffic
    PAUSED = "paused"        # temporarily halted
    COMPLETED = "completed"  # finished and eligible for analysis
    STOPPED = "stopped"      # ended early
47
+
48
+
49
class AllocationStrategy(Enum):
    """How incoming traffic is split between experiment variants."""

    FIXED = "fixed"                          # deterministic percentage split
    EPSILON_GREEDY = "epsilon_greedy"        # mostly exploit, occasionally explore
    THOMPSON_SAMPLING = "thompson_sampling"  # Bayesian bandit via Beta sampling
    UCB = "ucb"                              # upper confidence bound bandit
56
+
57
+
58
@dataclass
class Variant:
    """One arm of an A/B experiment: its configuration plus running tallies."""

    variant_id: str
    name: str
    description: str
    config: dict[str, Any]
    is_control: bool = False
    traffic_percentage: float = 50.0

    # Running statistics, updated as traffic is allocated and outcomes recorded.
    impressions: int = 0
    conversions: int = 0
    total_success_score: float = 0.0

    @property
    def conversion_rate(self) -> float:
        """Conversions per impression; 0.0 before any impressions."""
        return self.conversions / self.impressions if self.impressions else 0.0

    @property
    def avg_success_score(self) -> float:
        """Mean success score per impression; 0.0 before any impressions."""
        return self.total_success_score / self.impressions if self.impressions else 0.0

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-compatible dictionary (inverse of from_dict)."""
        return {
            "variant_id": self.variant_id,
            "name": self.name,
            "description": self.description,
            "config": self.config,
            "is_control": self.is_control,
            "traffic_percentage": self.traffic_percentage,
            "impressions": self.impressions,
            "conversions": self.conversions,
            "total_success_score": self.total_success_score,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> Variant:
        """Rebuild a Variant from a to_dict() dictionary.

        Missing optional keys fall back to the dataclass defaults.
        """
        optional = {
            "is_control": data.get("is_control", False),
            "traffic_percentage": data.get("traffic_percentage", 50.0),
            "impressions": data.get("impressions", 0),
            "conversions": data.get("conversions", 0),
            "total_success_score": data.get("total_success_score", 0.0),
        }
        return cls(
            data["variant_id"],
            data["name"],
            data["description"],
            data["config"],
            **optional,
        )
116
+
117
+
118
@dataclass
class Experiment:
    """Definition and live state of one A/B experiment."""

    experiment_id: str
    name: str
    description: str
    hypothesis: str
    variants: list[Variant]
    domain_filter: str | None = None
    goal_filter: str | None = None
    allocation_strategy: AllocationStrategy = AllocationStrategy.FIXED
    min_sample_size: int = 100
    max_duration_days: int = 30
    confidence_level: float = 0.95
    status: ExperimentStatus = ExperimentStatus.DRAFT
    created_at: datetime = field(default_factory=datetime.now)
    started_at: datetime | None = None
    ended_at: datetime | None = None

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-compatible dictionary (inverse of from_dict)."""
        started = self.started_at.isoformat() if self.started_at else None
        ended = self.ended_at.isoformat() if self.ended_at else None
        return {
            "experiment_id": self.experiment_id,
            "name": self.name,
            "description": self.description,
            "hypothesis": self.hypothesis,
            "variants": [v.to_dict() for v in self.variants],
            "domain_filter": self.domain_filter,
            "goal_filter": self.goal_filter,
            "allocation_strategy": self.allocation_strategy.value,
            "min_sample_size": self.min_sample_size,
            "max_duration_days": self.max_duration_days,
            "confidence_level": self.confidence_level,
            "status": self.status.value,
            "created_at": self.created_at.isoformat(),
            "started_at": started,
            "ended_at": ended,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> Experiment:
        """Rebuild an Experiment from a to_dict() dictionary."""

        def parse_dt(value: str | None) -> datetime | None:
            # Optional timestamps are stored as ISO strings or None.
            return datetime.fromisoformat(value) if value else None

        return cls(
            experiment_id=data["experiment_id"],
            name=data["name"],
            description=data["description"],
            hypothesis=data["hypothesis"],
            variants=[Variant.from_dict(v) for v in data["variants"]],
            domain_filter=data.get("domain_filter"),
            goal_filter=data.get("goal_filter"),
            allocation_strategy=AllocationStrategy(
                data.get("allocation_strategy", "fixed")
            ),
            min_sample_size=data.get("min_sample_size", 100),
            max_duration_days=data.get("max_duration_days", 30),
            confidence_level=data.get("confidence_level", 0.95),
            status=ExperimentStatus(data.get("status", "draft")),
            created_at=datetime.fromisoformat(data["created_at"]),
            started_at=parse_dt(data.get("started_at")),
            ended_at=parse_dt(data.get("ended_at")),
        )

    @property
    def total_impressions(self) -> int:
        """Sum of impressions across every variant."""
        return sum(v.impressions for v in self.variants)

    @property
    def control(self) -> Variant | None:
        """The control variant, or None if no variant is marked as control."""
        return next((v for v in self.variants if v.is_control), None)

    @property
    def treatments(self) -> list[Variant]:
        """All non-control variants."""
        return [v for v in self.variants if not v.is_control]
206
+
207
+
208
@dataclass
class ExperimentResult:
    """Outcome of analyzing a completed (or stopped) experiment."""

    experiment: Experiment                    # the experiment that was analyzed
    winner: Variant | None                    # winning variant, None if inconclusive
    is_significant: bool                      # did the test reach significance?
    p_value: float                            # two-sided p-value of the comparison
    confidence_interval: tuple[float, float]  # bounds for the best treatment's rate
    lift: float                               # percent improvement over control
    recommendation: str                       # human-readable suggested action
219
+
220
+
221
+ # =============================================================================
222
+ # STATISTICAL ANALYSIS
223
+ # =============================================================================
224
+
225
+
226
class StatisticalAnalyzer:
    """Frequentist significance tests used to evaluate A/B experiments.

    All methods are pure static helpers. Distributions are approximated
    with closed-form formulas rather than an external stats package.
    """

    @staticmethod
    def z_test_proportions(
        n1: int,
        c1: int,
        n2: int,
        c2: int,
    ) -> tuple[float, float]:
        """Two-proportion z-test.

        Args:
            n1: Sample size for group 1
            c1: Conversions for group 1
            n2: Sample size for group 2
            c2: Conversions for group 2

        Returns:
            (z_score, p_value); (0.0, 1.0) when the test is undefined.
        """
        undefined = (0.0, 1.0)
        if n1 == 0 or n2 == 0:
            return undefined

        rate_1 = c1 / n1
        rate_2 = c2 / n2
        pooled = (c1 + c2) / (n1 + n2)

        # A pooled rate of exactly 0 or 1 gives zero variance — no test possible.
        if pooled == 0 or pooled == 1:
            return undefined

        std_err = math.sqrt(pooled * (1 - pooled) * (1 / n1 + 1 / n2))
        if std_err == 0:
            return undefined

        z = (rate_1 - rate_2) / std_err

        # Two-sided p-value from the standard normal approximation.
        p_value = 2 * (1 - StatisticalAnalyzer._normal_cdf(abs(z)))

        return z, p_value

    @staticmethod
    def t_test_means(
        n1: int,
        mean1: float,
        var1: float,
        n2: int,
        mean2: float,
        var2: float,
    ) -> tuple[float, float]:
        """Welch's t-test for a difference of means.

        Args:
            n1, mean1, var1: Sample size, mean, and variance of group 1
            n2, mean2, var2: Sample size, mean, and variance of group 2

        Returns:
            (t_score, p_value); (0.0, 1.0) when either sample is too small.
        """
        if n1 < 2 or n2 < 2:
            return 0.0, 1.0

        std_err = math.sqrt(var1 / n1 + var2 / n2)
        if std_err == 0:
            return 0.0, 1.0

        t = (mean1 - mean2) / std_err

        # Welch–Satterthwaite approximation of the degrees of freedom.
        numerator = (var1 / n1 + var2 / n2) ** 2
        denominator = (var1 / n1) ** 2 / (n1 - 1) + (var2 / n2) ** 2 / (n2 - 1)
        df = numerator / denominator if denominator > 0 else 1

        p_value = 2 * StatisticalAnalyzer._t_cdf(-abs(t), df)

        return t, p_value

    @staticmethod
    def confidence_interval(
        n: int,
        successes: int,
        confidence: float = 0.95,
    ) -> tuple[float, float]:
        """Wilson score interval for a binomial proportion.

        Args:
            n: Sample size
            successes: Number of successes
            confidence: Confidence level (0.90 / 0.95 / 0.99 supported)

        Returns:
            (lower, upper) bounds clamped to [0, 1]; (0.0, 1.0) when n == 0.
        """
        if n == 0:
            return 0.0, 1.0

        z = StatisticalAnalyzer._z_score(confidence)
        p = successes / n

        denominator = 1 + z * z / n
        centre = p + z * z / (2 * n)
        adjustment = z * math.sqrt((p * (1 - p) + z * z / (4 * n)) / n)

        lower = max(0, (centre - adjustment) / denominator)
        upper = min(1, (centre + adjustment) / denominator)

        return lower, upper

    @staticmethod
    def _normal_cdf(x: float) -> float:
        """Standard normal CDF expressed through the error function."""
        return 0.5 * (1 + math.erf(x / math.sqrt(2)))

    @staticmethod
    def _t_cdf(t: float, df: float) -> float:
        """Approximate Student-t CDF.

        Uses the normal approximation for df > 30; otherwise maps to the
        incomplete beta function.
        """
        if df > 30:
            return StatisticalAnalyzer._normal_cdf(t)

        # NOTE(review): the small-df path relies on the series approximation
        # below, whose accuracy has not been validated against a reference
        # implementation — confirm before trusting small-sample p-values.
        x = df / (df + t * t)
        return 0.5 * StatisticalAnalyzer._incomplete_beta(df / 2, 0.5, x)

    @staticmethod
    def _incomplete_beta(a: float, b: float, x: float) -> float:
        """Series approximation of the regularized incomplete beta function.

        Terminates after 100 terms or when the running term drops below 1e-10.
        """
        if x == 0:
            return 0
        if x == 1:
            return 1

        accumulated = 0.0
        for k in range(100):
            term = (x ** k) * math.gamma(a + k) / (math.gamma(k + 1) * math.gamma(a))
            accumulated += term * ((1 - x) ** b) / (a + k)
            if abs(term) < 1e-10:
                break

        return accumulated * math.gamma(a + b) / (math.gamma(a) * math.gamma(b))

    @staticmethod
    def _z_score(confidence: float) -> float:
        """Critical z value for a handful of common confidence levels.

        Unknown levels fall back to 1.96 (95%).
        """
        return {0.90: 1.645, 0.95: 1.96, 0.99: 2.576}.get(confidence, 1.96)
380
+
381
+
382
+ # =============================================================================
383
+ # TRAFFIC ALLOCATOR
384
+ # =============================================================================
385
+
386
+
387
class TrafficAllocator:
    """Chooses which variant of an experiment a given user/session sees."""

    def __init__(self, experiment: Experiment):
        """Bind the allocator to one experiment.

        Args:
            experiment: The experiment whose variants are allocated between
        """
        self.experiment = experiment
        self._random = random.Random()

    def allocate(self, user_id: str) -> Variant:
        """Pick a variant for a user according to the experiment's strategy.

        Args:
            user_id: Unique user/session identifier

        Returns:
            The chosen variant
        """
        dispatch = {
            AllocationStrategy.EPSILON_GREEDY: lambda: self._epsilon_greedy(epsilon=0.1),
            AllocationStrategy.THOMPSON_SAMPLING: self._thompson_sampling,
            AllocationStrategy.UCB: self._ucb_allocation,
        }
        handler = dispatch.get(self.experiment.allocation_strategy)
        if handler is not None:
            return handler()
        # FIXED — and any unrecognized strategy — uses the deterministic split.
        return self._fixed_allocation(user_id)

    def _fixed_allocation(self, user_id: str) -> Variant:
        """Hash the user into a stable 0-99 bucket, mapped to a variant.

        The same (experiment, user) pair always lands in the same bucket.
        """
        # MD5 is used only for stable bucketing, not for security.
        key = f"{self.experiment.experiment_id}:{user_id}".encode()
        digest = hashlib.md5(key, usedforsecurity=False).hexdigest()
        bucket = int(digest, 16) % 100

        threshold = 0.0
        for candidate in self.experiment.variants:
            threshold += candidate.traffic_percentage
            if bucket < threshold:
                return candidate

        # Percentage rounding can leave the last sliver of buckets unassigned.
        return self.experiment.variants[-1]

    def _epsilon_greedy(self, epsilon: float = 0.1) -> Variant:
        """Explore a random variant with probability epsilon, else exploit."""
        if self._random.random() < epsilon:
            return self._random.choice(self.experiment.variants)
        # Exploit: the variant with the best average success score so far.
        return max(self.experiment.variants, key=lambda v: v.avg_success_score)

    def _thompson_sampling(self) -> Variant:
        """Bayesian bandit: sample each variant's Beta posterior, take the max.

        Posterior per variant is Beta(conversions + 1, failures + 1).
        """
        draws = [
            (
                self._random.betavariate(
                    variant.conversions + 1,
                    (variant.impressions - variant.conversions) + 1,
                ),
                variant,
            )
            for variant in self.experiment.variants
        ]
        return max(draws, key=lambda pair: pair[0])[1]

    def _ucb_allocation(self) -> Variant:
        """UCB1: mean score plus an exploration bonus that shrinks with visits."""
        total = self.experiment.total_impressions or 1

        def score(variant: Variant) -> float:
            if variant.impressions == 0:
                # Unvisited variants always win, so each gets tried at least once.
                return float('inf')
            bonus = math.sqrt(2 * math.log(total) / variant.impressions)
            return variant.avg_success_score + bonus

        ranked = [(score(variant), variant) for variant in self.experiment.variants]
        return max(ranked, key=lambda pair: pair[0])[1]
484
+
485
+
486
+ # =============================================================================
487
+ # EXPERIMENT MANAGER
488
+ # =============================================================================
489
+
490
+
491
class ExperimentManager:
    """Manages A/B experiments lifecycle.

    Experiments are held in memory and persisted to a single JSON file on
    every mutation (create/start/stop/record). One TrafficAllocator is kept
    per running experiment.
    """

    def __init__(self, storage_path: Path | str | None = None) -> None:
        """Initialize experiment manager.

        Args:
            storage_path: Path to persist experiments. Defaults to
                ~/.empathy/socratic/experiments.json when None.
        """
        if storage_path is None:
            storage_path = Path.home() / ".empathy" / "socratic" / "experiments.json"
        self.storage_path = Path(storage_path)
        self._experiments: dict[str, Experiment] = {}
        self._allocators: dict[str, TrafficAllocator] = {}

        # Load existing experiments (best-effort; failures are logged, not raised).
        self._load()

    def create_experiment(
        self,
        name: str,
        description: str,
        hypothesis: str,
        control_config: dict[str, Any],
        treatment_configs: list[dict[str, Any]],
        domain_filter: str | None = None,
        allocation_strategy: AllocationStrategy = AllocationStrategy.FIXED,
        min_sample_size: int = 100,
    ) -> Experiment:
        """Create a new experiment in DRAFT status and persist it.

        Args:
            name: Experiment name
            description: Description
            hypothesis: What we're testing
            control_config: Configuration for control group
            treatment_configs: Configurations for treatment groups; each entry
                may carry "name"/"description"/"config" keys, or be used
                directly as the config dict
            domain_filter: Optional domain to filter
            allocation_strategy: How to allocate traffic
            min_sample_size: Minimum samples before analysis

        Returns:
            Created experiment
        """
        # Short stable ID derived from name + creation timestamp.
        experiment_id = hashlib.sha256(
            f"{name}:{time.time()}".encode()
        ).hexdigest()[:12]

        # Create variants: traffic is split evenly across control + treatments.
        num_variants = 1 + len(treatment_configs)
        traffic_each = 100.0 / num_variants

        variants = [
            Variant(
                variant_id=f"{experiment_id}_control",
                name="Control",
                description="Control group with existing configuration",
                config=control_config,
                is_control=True,
                traffic_percentage=traffic_each,
            )
        ]

        for i, config in enumerate(treatment_configs):
            variants.append(Variant(
                variant_id=f"{experiment_id}_treatment_{i}",
                name=config.get("name", f"Treatment {i + 1}"),
                description=config.get("description", ""),
                # Entries without a nested "config" key are used as-is.
                config=config.get("config", config),
                is_control=False,
                traffic_percentage=traffic_each,
            ))

        experiment = Experiment(
            experiment_id=experiment_id,
            name=name,
            description=description,
            hypothesis=hypothesis,
            variants=variants,
            domain_filter=domain_filter,
            allocation_strategy=allocation_strategy,
            min_sample_size=min_sample_size,
        )

        self._experiments[experiment_id] = experiment
        # Persist immediately so drafts survive process restarts.
        self._save()

        return experiment

    def start_experiment(self, experiment_id: str) -> bool:
        """Start an experiment.

        Only DRAFT experiments can be started; anything else is rejected.

        Args:
            experiment_id: ID of experiment to start

        Returns:
            True if started successfully
        """
        experiment = self._experiments.get(experiment_id)
        if not experiment:
            return False

        if experiment.status != ExperimentStatus.DRAFT:
            return False

        experiment.status = ExperimentStatus.RUNNING
        experiment.started_at = datetime.now()
        self._allocators[experiment_id] = TrafficAllocator(experiment)
        self._save()

        return True

    def stop_experiment(self, experiment_id: str) -> ExperimentResult | None:
        """Stop an experiment and analyze results.

        Note: marks the experiment COMPLETED regardless of its prior status
        (including DRAFT experiments that never ran).

        Args:
            experiment_id: ID of experiment to stop

        Returns:
            Experiment results with analysis, or None for an unknown ID
        """
        experiment = self._experiments.get(experiment_id)
        if not experiment:
            return None

        experiment.status = ExperimentStatus.COMPLETED
        experiment.ended_at = datetime.now()
        self._save()

        return self.analyze_experiment(experiment_id)

    def allocate_variant(
        self,
        experiment_id: str,
        user_id: str,
    ) -> Variant | None:
        """Allocate a user to a variant.

        Args:
            experiment_id: Experiment ID
            user_id: User/session ID

        Returns:
            Allocated variant, or None if the experiment is unknown or
            not currently RUNNING
        """
        experiment = self._experiments.get(experiment_id)
        if not experiment or experiment.status != ExperimentStatus.RUNNING:
            return None

        # Lazily recreate the allocator (e.g. after a fresh load from disk).
        allocator = self._allocators.get(experiment_id)
        if not allocator:
            allocator = TrafficAllocator(experiment)
            self._allocators[experiment_id] = allocator

        return allocator.allocate(user_id)

    def record_impression(self, experiment_id: str, variant_id: str) -> None:
        """Record an impression for a variant.

        Silently ignores unknown experiment IDs; persists on every call.

        Args:
            experiment_id: Experiment ID
            variant_id: Variant ID
        """
        experiment = self._experiments.get(experiment_id)
        if not experiment:
            return

        for variant in experiment.variants:
            if variant.variant_id == variant_id:
                variant.impressions += 1
                break

        self._save()

    def record_conversion(
        self,
        experiment_id: str,
        variant_id: str,
        success_score: float = 1.0,
    ) -> None:
        """Record a conversion for a variant.

        Silently ignores unknown experiment IDs; persists on every call.

        Args:
            experiment_id: Experiment ID
            variant_id: Variant ID
            success_score: Score from 0-1, accumulated into the variant's total
        """
        experiment = self._experiments.get(experiment_id)
        if not experiment:
            return

        for variant in experiment.variants:
            if variant.variant_id == variant_id:
                variant.conversions += 1
                variant.total_success_score += success_score
                break

        self._save()

    def analyze_experiment(self, experiment_id: str) -> ExperimentResult | None:
        """Analyze experiment results.

        Compares the control against the best treatment (by conversion rate)
        using a two-proportion z-test.

        Args:
            experiment_id: Experiment ID

        Returns:
            Analysis results, or None when the experiment is unknown or has
            no control / no treatments
        """
        experiment = self._experiments.get(experiment_id)
        if not experiment:
            return None

        control = experiment.control
        if not control:
            return None

        treatments = experiment.treatments
        if not treatments:
            return None

        # Find best treatment by raw conversion rate.
        best_treatment = max(treatments, key=lambda v: v.conversion_rate)

        # Statistical test (z_score is computed but only p_value is used below).
        z_score, p_value = StatisticalAnalyzer.z_test_proportions(
            control.impressions,
            control.conversions,
            best_treatment.impressions,
            best_treatment.conversions,
        )

        # Significant when p is below alpha = 1 - confidence_level.
        is_significant = p_value < (1 - experiment.confidence_level)

        # Calculate lift as percent change relative to the control rate.
        if control.conversion_rate > 0:
            lift = (
                (best_treatment.conversion_rate - control.conversion_rate)
                / control.conversion_rate
            ) * 100
        else:
            lift = 0.0

        # Confidence interval for treatment
        ci = StatisticalAnalyzer.confidence_interval(
            best_treatment.impressions,
            best_treatment.conversions,
            experiment.confidence_level,
        )

        # Determine winner
        winner = None
        recommendation = ""

        if is_significant:
            if best_treatment.conversion_rate > control.conversion_rate:
                winner = best_treatment
                recommendation = (
                    f"Adopt {best_treatment.name}. It shows {lift:.1f}% improvement "
                    f"over control with p-value {p_value:.4f}."
                )
            else:
                winner = control
                recommendation = (
                    "Keep control. Treatment did not show improvement."
                )
        else:
            recommendation = (
                f"No significant difference detected (p={p_value:.4f}). "
                f"Consider running longer or increasing sample size."
            )

        return ExperimentResult(
            experiment=experiment,
            winner=winner,
            is_significant=is_significant,
            p_value=p_value,
            confidence_interval=ci,
            lift=lift,
            recommendation=recommendation,
        )

    def get_running_experiments(
        self,
        domain: str | None = None,
    ) -> list[Experiment]:
        """Get all running experiments.

        Args:
            domain: Optional domain filter. Experiments with no domain_filter
                match any domain.

        Returns:
            List of running experiments
        """
        running = []
        for exp in self._experiments.values():
            if exp.status != ExperimentStatus.RUNNING:
                continue
            if domain and exp.domain_filter and exp.domain_filter != domain:
                continue
            running.append(exp)
        return running

    def get_experiment(self, experiment_id: str) -> Experiment | None:
        """Get experiment by ID, or None when unknown."""
        return self._experiments.get(experiment_id)

    def list_experiments(self) -> list[Experiment]:
        """List all experiments regardless of status."""
        return list(self._experiments.values())

    def _save(self) -> None:
        """Save all experiments to the JSON storage file.

        Creates parent directories on demand; overwrites the file in full.
        """
        self.storage_path.parent.mkdir(parents=True, exist_ok=True)

        data = {
            "version": 1,
            "experiments": [e.to_dict() for e in self._experiments.values()],
        }

        with self.storage_path.open("w") as f:
            json.dump(data, f, indent=2)

    def _load(self) -> None:
        """Load experiments from storage.

        Best-effort: a missing file is fine, and any parse/validation error
        is logged and leaves the manager empty rather than raising.
        """
        if not self.storage_path.exists():
            return

        try:
            with self.storage_path.open("r") as f:
                data = json.load(f)

            for exp_data in data.get("experiments", []):
                exp = Experiment.from_dict(exp_data)
                self._experiments[exp.experiment_id] = exp

                # Restore allocators for running experiments
                if exp.status == ExperimentStatus.RUNNING:
                    self._allocators[exp.experiment_id] = TrafficAllocator(exp)

        except Exception as e:
            logger.warning(f"Failed to load experiments: {e}")
832
+
833
+
834
+ # =============================================================================
835
+ # WORKFLOW A/B TESTING INTEGRATION
836
+ # =============================================================================
837
+
838
+
839
class WorkflowABTester:
    """High-level facade for A/B testing workflow configurations.

    Wraps ExperimentManager with workflow-specific helpers: creating
    agent-list experiments, resolving a config for a session, recording
    outcomes, and surfacing the best configuration found so far.
    """

    def __init__(self, storage_path: Path | str | None = None):
        """Create the tester.

        Args:
            storage_path: Path to persist data
        """
        self.manager = ExperimentManager(storage_path)

    def create_workflow_experiment(
        self,
        name: str,
        hypothesis: str,
        control_agents: list[str],
        treatment_agents_list: list[list[str]],
        domain: str | None = None,
    ) -> str:
        """Create an experiment comparing workflow agent configurations.

        Args:
            name: Experiment name
            hypothesis: What we're testing
            control_agents: Agent list for control
            treatment_agents_list: Agent lists for treatments
            domain: Domain filter

        Returns:
            Experiment ID
        """
        treatments = []
        for index, agents in enumerate(treatment_agents_list):
            treatments.append({
                "name": f"Treatment {index + 1}",
                "config": {"agents": agents},
            })

        experiment = self.manager.create_experiment(
            name=name,
            description=f"Testing different agent configurations for {domain or 'general'} workflows",
            hypothesis=hypothesis,
            control_config={"agents": control_agents},
            treatment_configs=treatments,
            domain_filter=domain,
            allocation_strategy=AllocationStrategy.THOMPSON_SAMPLING,
        )

        return experiment.experiment_id

    def get_workflow_config(
        self,
        session_id: str,
        domain: str | None = None,
    ) -> tuple[dict[str, Any], str | None, str | None]:
        """Resolve the workflow configuration for a session.

        If a running experiment matches, the session is allocated to one of
        its variants (and the impression is recorded immediately); otherwise
        an empty default config is returned.

        Args:
            session_id: Session ID for allocation
            domain: Optional domain filter

        Returns:
            (config, experiment_id, variant_id) or ({}, None, None)
        """
        for experiment in self.manager.get_running_experiments(domain):
            variant = self.manager.allocate_variant(experiment.experiment_id, session_id)
            if variant is None:
                continue
            self.manager.record_impression(experiment.experiment_id, variant.variant_id)
            return (variant.config, experiment.experiment_id, variant.variant_id)

        # No applicable experiment: fall back to the default (empty) config.
        return ({}, None, None)

    def record_workflow_result(
        self,
        experiment_id: str,
        variant_id: str,
        success: bool,
        success_score: float = 0.0,
    ):
        """Record the result of a workflow execution.

        Only successes are recorded as conversions; failures leave the
        variant's tallies untouched.

        Args:
            experiment_id: Experiment ID
            variant_id: Variant ID
            success: Whether workflow succeeded
            success_score: Success score (0-1)
        """
        if not success:
            return
        self.manager.record_conversion(experiment_id, variant_id, success_score)

    def get_best_config(self, domain: str | None = None) -> dict[str, Any]:
        """Get the best known configuration for a domain.

        Args:
            domain: Domain filter

        Returns:
            The winning config with the highest average success score among
            completed experiments, or {} when none qualifies
        """
        best: dict[str, Any] = {}
        top_score = 0.0

        for experiment in self.manager.list_experiments():
            if experiment.status != ExperimentStatus.COMPLETED:
                continue
            if domain and experiment.domain_filter != domain:
                continue

            result = self.manager.analyze_experiment(experiment.experiment_id)
            if result is None or result.winner is None:
                continue
            if result.winner.avg_success_score > top_score:
                top_score = result.winner.avg_success_score
                best = result.winner.config

        return best