tech-hub-skills 1.5.1 → 1.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/LICENSE +21 -21
- package/.claude/README.md +291 -291
- package/.claude/bin/cli.js +266 -266
- package/.claude/bin/copilot.js +182 -182
- package/.claude/bin/postinstall.js +42 -42
- package/.claude/commands/README.md +336 -336
- package/.claude/commands/ai-engineer.md +104 -104
- package/.claude/commands/aws.md +143 -143
- package/.claude/commands/azure.md +149 -149
- package/.claude/commands/backend-developer.md +108 -108
- package/.claude/commands/code-review.md +399 -399
- package/.claude/commands/compliance-automation.md +747 -747
- package/.claude/commands/compliance-officer.md +108 -108
- package/.claude/commands/data-engineer.md +113 -113
- package/.claude/commands/data-governance.md +102 -102
- package/.claude/commands/data-scientist.md +123 -123
- package/.claude/commands/database-admin.md +109 -109
- package/.claude/commands/devops.md +160 -160
- package/.claude/commands/docker.md +160 -160
- package/.claude/commands/enterprise-dashboard.md +613 -613
- package/.claude/commands/finops.md +184 -184
- package/.claude/commands/frontend-developer.md +108 -108
- package/.claude/commands/gcp.md +143 -143
- package/.claude/commands/ml-engineer.md +115 -115
- package/.claude/commands/mlops.md +187 -187
- package/.claude/commands/network-engineer.md +109 -109
- package/.claude/commands/optimization-advisor.md +329 -329
- package/.claude/commands/orchestrator.md +623 -623
- package/.claude/commands/platform-engineer.md +102 -102
- package/.claude/commands/process-automation.md +226 -226
- package/.claude/commands/process-changelog.md +184 -184
- package/.claude/commands/process-documentation.md +484 -484
- package/.claude/commands/process-kanban.md +324 -324
- package/.claude/commands/process-versioning.md +214 -214
- package/.claude/commands/product-designer.md +104 -104
- package/.claude/commands/project-starter.md +443 -443
- package/.claude/commands/qa-engineer.md +109 -109
- package/.claude/commands/security-architect.md +135 -135
- package/.claude/commands/sre.md +109 -109
- package/.claude/commands/system-design.md +126 -126
- package/.claude/commands/technical-writer.md +101 -101
- package/.claude/package.json +46 -46
- package/.claude/roles/ai-engineer/skills/01-prompt-engineering/README.md +252 -252
- package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_ab_tester.py +356 -356
- package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_template_manager.py +274 -274
- package/.claude/roles/ai-engineer/skills/01-prompt-engineering/token_cost_estimator.py +324 -324
- package/.claude/roles/ai-engineer/skills/02-rag-pipeline/README.md +448 -448
- package/.claude/roles/ai-engineer/skills/02-rag-pipeline/document_chunker.py +336 -336
- package/.claude/roles/ai-engineer/skills/02-rag-pipeline/rag_pipeline.sql +213 -213
- package/.claude/roles/ai-engineer/skills/03-agent-orchestration/README.md +599 -599
- package/.claude/roles/ai-engineer/skills/04-llm-guardrails/README.md +735 -735
- package/.claude/roles/ai-engineer/skills/05-vector-embeddings/README.md +711 -711
- package/.claude/roles/ai-engineer/skills/06-llm-evaluation/README.md +777 -777
- package/.claude/roles/azure/skills/01-infrastructure-fundamentals/README.md +264 -264
- package/.claude/roles/azure/skills/02-data-factory/README.md +264 -264
- package/.claude/roles/azure/skills/03-synapse-analytics/README.md +264 -264
- package/.claude/roles/azure/skills/04-databricks/README.md +264 -264
- package/.claude/roles/azure/skills/05-functions/README.md +264 -264
- package/.claude/roles/azure/skills/06-kubernetes-service/README.md +264 -264
- package/.claude/roles/azure/skills/07-openai-service/README.md +264 -264
- package/.claude/roles/azure/skills/08-machine-learning/README.md +264 -264
- package/.claude/roles/azure/skills/09-storage-adls/README.md +264 -264
- package/.claude/roles/azure/skills/10-networking/README.md +264 -264
- package/.claude/roles/azure/skills/11-sql-cosmos/README.md +264 -264
- package/.claude/roles/azure/skills/12-event-hubs/README.md +264 -264
- package/.claude/roles/code-review/skills/01-automated-code-review/README.md +394 -394
- package/.claude/roles/code-review/skills/02-pr-review-workflow/README.md +427 -427
- package/.claude/roles/code-review/skills/03-code-quality-gates/README.md +518 -518
- package/.claude/roles/code-review/skills/04-reviewer-assignment/README.md +504 -504
- package/.claude/roles/code-review/skills/05-review-analytics/README.md +540 -540
- package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/README.md +550 -550
- package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/bronze_ingestion.py +337 -337
- package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/medallion_queries.sql +300 -300
- package/.claude/roles/data-engineer/skills/02-etl-pipeline/README.md +580 -580
- package/.claude/roles/data-engineer/skills/03-data-quality/README.md +579 -579
- package/.claude/roles/data-engineer/skills/04-streaming-pipelines/README.md +608 -608
- package/.claude/roles/data-engineer/skills/05-performance-optimization/README.md +547 -547
- package/.claude/roles/data-governance/skills/01-data-catalog/README.md +112 -112
- package/.claude/roles/data-governance/skills/02-data-lineage/README.md +129 -129
- package/.claude/roles/data-governance/skills/03-data-quality-framework/README.md +182 -182
- package/.claude/roles/data-governance/skills/04-access-control/README.md +39 -39
- package/.claude/roles/data-governance/skills/05-master-data-management/README.md +40 -40
- package/.claude/roles/data-governance/skills/06-compliance-privacy/README.md +46 -46
- package/.claude/roles/data-scientist/skills/01-eda-automation/README.md +230 -230
- package/.claude/roles/data-scientist/skills/01-eda-automation/eda_generator.py +446 -446
- package/.claude/roles/data-scientist/skills/02-statistical-modeling/README.md +264 -264
- package/.claude/roles/data-scientist/skills/03-feature-engineering/README.md +264 -264
- package/.claude/roles/data-scientist/skills/04-predictive-modeling/README.md +264 -264
- package/.claude/roles/data-scientist/skills/05-customer-analytics/README.md +264 -264
- package/.claude/roles/data-scientist/skills/06-campaign-analysis/README.md +264 -264
- package/.claude/roles/data-scientist/skills/07-experimentation/README.md +264 -264
- package/.claude/roles/data-scientist/skills/08-data-visualization/README.md +264 -264
- package/.claude/roles/devops/skills/01-cicd-pipeline/README.md +264 -264
- package/.claude/roles/devops/skills/02-container-orchestration/README.md +264 -264
- package/.claude/roles/devops/skills/03-infrastructure-as-code/README.md +264 -264
- package/.claude/roles/devops/skills/04-gitops/README.md +264 -264
- package/.claude/roles/devops/skills/05-environment-management/README.md +264 -264
- package/.claude/roles/devops/skills/06-automated-testing/README.md +264 -264
- package/.claude/roles/devops/skills/07-release-management/README.md +264 -264
- package/.claude/roles/devops/skills/08-monitoring-alerting/README.md +264 -264
- package/.claude/roles/devops/skills/09-devsecops/README.md +265 -265
- package/.claude/roles/finops/skills/01-cost-visibility/README.md +264 -264
- package/.claude/roles/finops/skills/02-resource-tagging/README.md +264 -264
- package/.claude/roles/finops/skills/03-budget-management/README.md +264 -264
- package/.claude/roles/finops/skills/04-reserved-instances/README.md +264 -264
- package/.claude/roles/finops/skills/05-spot-optimization/README.md +264 -264
- package/.claude/roles/finops/skills/06-storage-tiering/README.md +264 -264
- package/.claude/roles/finops/skills/07-compute-rightsizing/README.md +264 -264
- package/.claude/roles/finops/skills/08-chargeback/README.md +264 -264
- package/.claude/roles/ml-engineer/skills/01-mlops-pipeline/README.md +566 -566
- package/.claude/roles/ml-engineer/skills/02-feature-engineering/README.md +655 -655
- package/.claude/roles/ml-engineer/skills/03-model-training/README.md +704 -704
- package/.claude/roles/ml-engineer/skills/04-model-serving/README.md +845 -845
- package/.claude/roles/ml-engineer/skills/05-model-monitoring/README.md +874 -874
- package/.claude/roles/mlops/skills/01-ml-pipeline-orchestration/README.md +264 -264
- package/.claude/roles/mlops/skills/02-experiment-tracking/README.md +264 -264
- package/.claude/roles/mlops/skills/03-model-registry/README.md +264 -264
- package/.claude/roles/mlops/skills/04-feature-store/README.md +264 -264
- package/.claude/roles/mlops/skills/05-model-deployment/README.md +264 -264
- package/.claude/roles/mlops/skills/06-model-observability/README.md +264 -264
- package/.claude/roles/mlops/skills/07-data-versioning/README.md +264 -264
- package/.claude/roles/mlops/skills/08-ab-testing/README.md +264 -264
- package/.claude/roles/mlops/skills/09-automated-retraining/README.md +264 -264
- package/.claude/roles/platform-engineer/skills/01-internal-developer-platform/README.md +153 -153
- package/.claude/roles/platform-engineer/skills/02-self-service-infrastructure/README.md +57 -57
- package/.claude/roles/platform-engineer/skills/03-slo-sli-management/README.md +59 -59
- package/.claude/roles/platform-engineer/skills/04-developer-experience/README.md +57 -57
- package/.claude/roles/platform-engineer/skills/05-incident-management/README.md +73 -73
- package/.claude/roles/platform-engineer/skills/06-capacity-management/README.md +59 -59
- package/.claude/roles/product-designer/skills/01-requirements-discovery/README.md +407 -407
- package/.claude/roles/product-designer/skills/02-user-research/README.md +382 -382
- package/.claude/roles/product-designer/skills/03-brainstorming-ideation/README.md +437 -437
- package/.claude/roles/product-designer/skills/04-ux-design/README.md +496 -496
- package/.claude/roles/product-designer/skills/05-product-market-fit/README.md +376 -376
- package/.claude/roles/product-designer/skills/06-stakeholder-management/README.md +412 -412
- package/.claude/roles/security-architect/skills/01-pii-detection/README.md +319 -319
- package/.claude/roles/security-architect/skills/02-threat-modeling/README.md +264 -264
- package/.claude/roles/security-architect/skills/03-infrastructure-security/README.md +264 -264
- package/.claude/roles/security-architect/skills/04-iam/README.md +264 -264
- package/.claude/roles/security-architect/skills/05-application-security/README.md +264 -264
- package/.claude/roles/security-architect/skills/06-secrets-management/README.md +264 -264
- package/.claude/roles/security-architect/skills/07-security-monitoring/README.md +264 -264
- package/.claude/roles/system-design/skills/01-architecture-patterns/README.md +337 -337
- package/.claude/roles/system-design/skills/02-requirements-engineering/README.md +264 -264
- package/.claude/roles/system-design/skills/03-scalability/README.md +264 -264
- package/.claude/roles/system-design/skills/04-high-availability/README.md +264 -264
- package/.claude/roles/system-design/skills/05-cost-optimization-design/README.md +264 -264
- package/.claude/roles/system-design/skills/06-api-design/README.md +264 -264
- package/.claude/roles/system-design/skills/07-observability-architecture/README.md +264 -264
- package/.claude/roles/system-design/skills/08-process-automation/PROCESS_TEMPLATE.md +336 -336
- package/.claude/roles/system-design/skills/08-process-automation/README.md +521 -521
- package/.claude/roles/system-design/skills/08-process-automation/ai_prompt_generator.py +744 -744
- package/.claude/roles/system-design/skills/08-process-automation/automation_recommender.py +688 -688
- package/.claude/roles/system-design/skills/08-process-automation/plan_generator.py +679 -679
- package/.claude/roles/system-design/skills/08-process-automation/process_analyzer.py +528 -528
- package/.claude/roles/system-design/skills/08-process-automation/process_parser.py +684 -684
- package/.claude/roles/system-design/skills/08-process-automation/role_matcher.py +615 -615
- package/.claude/skills/README.md +336 -336
- package/.claude/skills/ai-engineer.md +104 -104
- package/.claude/skills/aws.md +143 -143
- package/.claude/skills/azure.md +149 -149
- package/.claude/skills/backend-developer.md +108 -108
- package/.claude/skills/code-review.md +399 -399
- package/.claude/skills/compliance-automation.md +747 -747
- package/.claude/skills/compliance-officer.md +108 -108
- package/.claude/skills/data-engineer.md +113 -113
- package/.claude/skills/data-governance.md +102 -102
- package/.claude/skills/data-scientist.md +123 -123
- package/.claude/skills/database-admin.md +109 -109
- package/.claude/skills/devops.md +160 -160
- package/.claude/skills/docker.md +160 -160
- package/.claude/skills/enterprise-dashboard.md +613 -613
- package/.claude/skills/finops.md +184 -184
- package/.claude/skills/frontend-developer.md +108 -108
- package/.claude/skills/gcp.md +143 -143
- package/.claude/skills/ml-engineer.md +115 -115
- package/.claude/skills/mlops.md +187 -187
- package/.claude/skills/network-engineer.md +109 -109
- package/.claude/skills/optimization-advisor.md +329 -329
- package/.claude/skills/orchestrator.md +623 -623
- package/.claude/skills/platform-engineer.md +102 -102
- package/.claude/skills/process-automation.md +226 -226
- package/.claude/skills/process-changelog.md +184 -184
- package/.claude/skills/process-documentation.md +484 -484
- package/.claude/skills/process-kanban.md +324 -324
- package/.claude/skills/process-versioning.md +214 -214
- package/.claude/skills/product-designer.md +104 -104
- package/.claude/skills/project-starter.md +443 -443
- package/.claude/skills/qa-engineer.md +109 -109
- package/.claude/skills/security-architect.md +135 -135
- package/.claude/skills/sre.md +109 -109
- package/.claude/skills/system-design.md +126 -126
- package/.claude/skills/technical-writer.md +101 -101
- package/.gitattributes +2 -2
- package/GITHUB_COPILOT.md +106 -106
- package/README.md +192 -184
- package/package.json +16 -8
package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_ab_tester.py

@@ -1,356 +1,356 @@
"""
Prompt A/B Testing Framework
Compare prompt variations with statistical significance testing.
"""

import asyncio
import json
from datetime import datetime
from typing import List, Dict, Any, Optional, Callable
from dataclasses import dataclass, asdict
from scipy import stats
import numpy as np
import pandas as pd


@dataclass
class PromptVariant:
    """A prompt variant for A/B testing."""
    id: str
    name: str
    template: str
    metadata: Dict[str, Any] = None

    def __post_init__(self):
        if self.metadata is None:
            self.metadata = {}


@dataclass
class TestResult:
    """Result from testing a single variant."""
    variant_id: str
    response: str
    latency_ms: float
    tokens_used: int
    cost: float
    quality_score: Optional[float] = None
    timestamp: str = None

    def __post_init__(self):
        if self.timestamp is None:
            self.timestamp = datetime.now().isoformat()


class ABTest:
    """A/B test experiment for prompt variants."""

    def __init__(
        self,
        name: str,
        variants: List[PromptVariant],
        evaluation_fn: Optional[Callable] = None
    ):
        self.name = name
        self.variants = {v.id: v for v in variants}
        self.evaluation_fn = evaluation_fn or self._default_evaluation
        self.results: List[TestResult] = []

    def add_result(self, result: TestResult) -> None:
        """Add a test result."""
        self.results.append(result)

    def get_variant_results(self, variant_id: str) -> List[TestResult]:
        """Get all results for a specific variant."""
        return [r for r in self.results if r.variant_id == variant_id]

    def calculate_metrics(self, variant_id: str) -> Dict[str, float]:
        """Calculate aggregate metrics for a variant."""
        results = self.get_variant_results(variant_id)

        if not results:
            return {}

        return {
            "sample_size": len(results),
            "avg_latency_ms": np.mean([r.latency_ms for r in results]),
            "avg_cost": np.mean([r.cost for r in results]),
            "avg_tokens": np.mean([r.tokens_used for r in results]),
            "avg_quality_score": np.mean([r.quality_score for r in results if r.quality_score is not None]),
            "p50_latency": np.percentile([r.latency_ms for r in results], 50),
            "p95_latency": np.percentile([r.latency_ms for r in results], 95),
            "p99_latency": np.percentile([r.latency_ms for r in results], 99),
        }

    def compare_variants(
        self,
        variant_a_id: str,
        variant_b_id: str,
        metric: str = "quality_score"
    ) -> Dict[str, Any]:
        """
        Compare two variants with statistical significance testing.

        Args:
            variant_a_id: First variant ID
            variant_b_id: Second variant ID
            metric: Metric to compare ('quality_score', 'latency_ms', 'cost')

        Returns:
            Comparison results with p-value and effect size
        """
        results_a = self.get_variant_results(variant_a_id)
        results_b = self.get_variant_results(variant_b_id)

        if not results_a or not results_b:
            return {"error": "Insufficient data for comparison"}

        # Extract metric values
        values_a = [getattr(r, metric) for r in results_a if getattr(r, metric) is not None]
        values_b = [getattr(r, metric) for r in results_b if getattr(r, metric) is not None]

        if not values_a or not values_b:
            return {"error": f"No valid {metric} data"}

        # Perform t-test
        t_stat, p_value = stats.ttest_ind(values_a, values_b)

        # Calculate effect size (Cohen's d)
        pooled_std = np.sqrt((np.std(values_a)**2 + np.std(values_b)**2) / 2)
        cohens_d = (np.mean(values_a) - np.mean(values_b)) / pooled_std if pooled_std > 0 else 0

        # Determine winner
        if p_value < 0.05:
            if metric in ["latency_ms", "cost"]:
                # Lower is better
                winner = variant_a_id if np.mean(values_a) < np.mean(values_b) else variant_b_id
            else:
                # Higher is better
                winner = variant_a_id if np.mean(values_a) > np.mean(values_b) else variant_b_id
            significant = True
        else:
            winner = "No significant difference"
            significant = False

        return {
            "variant_a": {
                "id": variant_a_id,
                "mean": float(np.mean(values_a)),
                "std": float(np.std(values_a)),
                "sample_size": len(values_a)
            },
            "variant_b": {
                "id": variant_b_id,
                "mean": float(np.mean(values_b)),
                "std": float(np.std(values_b)),
                "sample_size": len(values_b)
            },
            "t_statistic": float(t_stat),
            "p_value": float(p_value),
            "cohens_d": float(cohens_d),
            "significant": significant,
            "winner": winner,
            "metric": metric,
            "improvement": float((np.mean(values_b) - np.mean(values_a)) / np.mean(values_a) * 100)
        }

    def generate_report(self) -> str:
        """Generate a comprehensive A/B test report."""
        lines = [
            f"A/B Test Report: {self.name}",
            "=" * 80,
            f"Total Results: {len(self.results)}",
            f"Variants Tested: {len(self.variants)}",
            "\n"
        ]

        # Metrics for each variant
        lines.append("Variant Performance:")
        for variant_id, variant in self.variants.items():
            metrics = self.calculate_metrics(variant_id)
            if metrics:
                lines.append(f"\n  {variant.name} (ID: {variant_id})")
                lines.append(f"    Sample Size: {metrics['sample_size']}")
                lines.append(f"    Avg Quality Score: {metrics['avg_quality_score']:.2f}")
                lines.append(f"    Avg Latency: {metrics['avg_latency_ms']:.0f}ms (p95: {metrics['p95_latency']:.0f}ms)")
                lines.append(f"    Avg Cost: ${metrics['avg_cost']:.4f}")
                lines.append(f"    Avg Tokens: {metrics['avg_tokens']:.0f}")

        # Statistical comparisons
        if len(self.variants) == 2:
            variant_ids = list(self.variants.keys())
            comparison = self.compare_variants(variant_ids[0], variant_ids[1])

            lines.append("\n\nStatistical Comparison:")
            lines.append(f"  Metric: {comparison.get('metric', 'N/A')}")
            lines.append(f"  P-Value: {comparison.get('p_value', 0):.4f}")
            lines.append(f"  Cohen's d: {comparison.get('cohens_d', 0):.3f}")
            lines.append(f"  Significant: {'Yes' if comparison.get('significant') else 'No'}")
            lines.append(f"  Winner: {comparison.get('winner', 'N/A')}")
            lines.append(f"  Improvement: {comparison.get('improvement', 0):.1f}%")

        return "\n".join(lines)

    def _default_evaluation(self, response: str) -> float:
        """Default quality evaluation (length-based)."""
        # Simple heuristic: penalize very short or very long responses
        length = len(response)
        if length < 50:
            return 50.0
        elif length > 2000:
            return 70.0
        else:
            return 85.0

    def export_results(self, filepath: str) -> None:
        """Export results to JSON file."""
        data = {
            "name": self.name,
            "variants": {k: asdict(v) for k, v in self.variants.items()},
            "results": [asdict(r) for r in self.results],
            "summary": {
                variant_id: self.calculate_metrics(variant_id)
                for variant_id in self.variants.keys()
            }
        }

        with open(filepath, 'w') as f:
            json.dump(data, f, indent=2)

        print(f"Results exported to {filepath}")


class ABTestRunner:
    """Runner for executing A/B tests with actual LLM calls."""

    def __init__(self, llm_client: Optional[Any] = None):
        self.llm_client = llm_client

    async def run_test(
        self,
        test: ABTest,
        test_cases: List[Dict[str, Any]],
        samples_per_variant: int = 30
    ) -> ABTest:
        """
        Run an A/B test with multiple test cases.

        Args:
            test: The ABTest instance
            test_cases: List of test case dictionaries with template variables
            samples_per_variant: Number of samples to collect per variant

        Returns:
            The test instance with results
        """
        for variant_id, variant in test.variants.items():
            print(f"Testing variant: {variant.name} ({samples_per_variant} samples)")

            for i, test_case in enumerate(test_cases[:samples_per_variant]):
                # Render prompt with test case
                # In production, use actual template rendering
                prompt = variant.template.format(**test_case)

                # Simulate LLM call (replace with actual API call)
                result = await self._simulate_llm_call(prompt, variant_id)

                # Evaluate quality
                quality_score = test.evaluation_fn(result.response)
                result.quality_score = quality_score

                test.add_result(result)

                if (i + 1) % 10 == 0:
                    print(f"  Progress: {i + 1}/{samples_per_variant}")

        return test

    async def _simulate_llm_call(
        self,
        prompt: str,
        variant_id: str
    ) -> TestResult:
        """Simulate an LLM API call (replace with actual implementation)."""
        # In production, call actual LLM API here
        await asyncio.sleep(0.1)  # Simulate API latency

        # Generate mock response
        response = f"This is a simulated response for variant {variant_id}. " + "Sample content. " * 20

        return TestResult(
            variant_id=variant_id,
            response=response,
            latency_ms=np.random.uniform(200, 800),
            tokens_used=np.random.randint(100, 500),
            cost=np.random.uniform(0.001, 0.01)
        )


# Example usage
if __name__ == "__main__":
    # Define variants
    variant_a = PromptVariant(
        id="v1_concise",
        name="Concise Prompt",
        template="""Analyze this marketing campaign: {campaign_details}

Provide brief recommendations.""",
        metadata={"author": "Team A", "hypothesis": "Shorter prompts = faster responses"}
    )

    variant_b = PromptVariant(
        id="v2_detailed",
        name="Detailed Prompt",
        template="""You are a marketing analytics expert. Analyze the following campaign in detail.

Campaign Details: {campaign_details}

Please provide:
1. Performance assessment
2. Key insights
3. Specific recommendations
4. Action items

Be thorough and data-driven.""",
        metadata={"author": "Team B", "hypothesis": "Detailed prompts = better quality"}
    )

    # Create test
    test = ABTest(
        name="Marketing Prompt Optimization Q1 2025",
        variants=[variant_a, variant_b]
    )

    # Define test cases
    test_cases = [
        {
            "campaign_details": f"Campaign {i}: Budget $10K, CTR 2.5%, Conv 1.2%"
        }
        for i in range(50)
    ]

    # Run test
    runner = ABTestRunner()

    async def run():
        test_with_results = await runner.run_test(
            test=test,
            test_cases=test_cases,
            samples_per_variant=25
        )

        # Generate report
        print("\n" + test_with_results.generate_report())

        # Compare variants
        print("\n" + "=" * 80)
        comparison = test_with_results.compare_variants("v1_concise", "v2_detailed", "quality_score")
        print(f"\nWinner: {comparison['winner']}")
        print(f"Statistical Significance: {'Yes' if comparison['significant'] else 'No'}")
        print(f"Improvement: {comparison['improvement']:.1f}%")

        # Export results
        test_with_results.export_results("ab_test_results.json")

    # Run async test
    asyncio.run(run())
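The ABTestRunner above only simulates model calls, and its docstring notes the stub should be replaced with a real implementation. As a rough sketch of how that swap could look, the subclass below overrides _simulate_llm_call; the llm_client object, its complete() coroutine, and the response attributes are hypothetical placeholders, not part of this package.

import time

class LiveABTestRunner(ABTestRunner):
    """Sketch: send prompts to a real model instead of the built-in simulator."""

    async def _simulate_llm_call(self, prompt: str, variant_id: str) -> TestResult:
        # Hypothetical async client call; substitute whatever method your provider's SDK exposes.
        start = time.perf_counter()
        reply = await self.llm_client.complete(prompt)
        latency_ms = (time.perf_counter() - start) * 1000

        return TestResult(
            variant_id=variant_id,
            response=reply.text,               # assumed response attribute
            latency_ms=latency_ms,
            tokens_used=reply.total_tokens,    # assumed response attribute
            cost=reply.total_tokens * 2e-06,   # placeholder per-token rate
        )

A runner built as LiveABTestRunner(llm_client=client) could then be passed the same ABTest and test cases as in the example usage above.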