tech-hub-skills 1.5.1 → 1.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197)
  1. package/.claude/LICENSE +21 -21
  2. package/.claude/README.md +291 -291
  3. package/.claude/bin/cli.js +266 -266
  4. package/.claude/bin/copilot.js +182 -182
  5. package/.claude/bin/postinstall.js +42 -42
  6. package/.claude/commands/README.md +336 -336
  7. package/.claude/commands/ai-engineer.md +104 -104
  8. package/.claude/commands/aws.md +143 -143
  9. package/.claude/commands/azure.md +149 -149
  10. package/.claude/commands/backend-developer.md +108 -108
  11. package/.claude/commands/code-review.md +399 -399
  12. package/.claude/commands/compliance-automation.md +747 -747
  13. package/.claude/commands/compliance-officer.md +108 -108
  14. package/.claude/commands/data-engineer.md +113 -113
  15. package/.claude/commands/data-governance.md +102 -102
  16. package/.claude/commands/data-scientist.md +123 -123
  17. package/.claude/commands/database-admin.md +109 -109
  18. package/.claude/commands/devops.md +160 -160
  19. package/.claude/commands/docker.md +160 -160
  20. package/.claude/commands/enterprise-dashboard.md +613 -613
  21. package/.claude/commands/finops.md +184 -184
  22. package/.claude/commands/frontend-developer.md +108 -108
  23. package/.claude/commands/gcp.md +143 -143
  24. package/.claude/commands/ml-engineer.md +115 -115
  25. package/.claude/commands/mlops.md +187 -187
  26. package/.claude/commands/network-engineer.md +109 -109
  27. package/.claude/commands/optimization-advisor.md +329 -329
  28. package/.claude/commands/orchestrator.md +623 -623
  29. package/.claude/commands/platform-engineer.md +102 -102
  30. package/.claude/commands/process-automation.md +226 -226
  31. package/.claude/commands/process-changelog.md +184 -184
  32. package/.claude/commands/process-documentation.md +484 -484
  33. package/.claude/commands/process-kanban.md +324 -324
  34. package/.claude/commands/process-versioning.md +214 -214
  35. package/.claude/commands/product-designer.md +104 -104
  36. package/.claude/commands/project-starter.md +443 -443
  37. package/.claude/commands/qa-engineer.md +109 -109
  38. package/.claude/commands/security-architect.md +135 -135
  39. package/.claude/commands/sre.md +109 -109
  40. package/.claude/commands/system-design.md +126 -126
  41. package/.claude/commands/technical-writer.md +101 -101
  42. package/.claude/package.json +46 -46
  43. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/README.md +252 -252
  44. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_ab_tester.py +356 -356
  45. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_template_manager.py +274 -274
  46. package/.claude/roles/ai-engineer/skills/01-prompt-engineering/token_cost_estimator.py +324 -324
  47. package/.claude/roles/ai-engineer/skills/02-rag-pipeline/README.md +448 -448
  48. package/.claude/roles/ai-engineer/skills/02-rag-pipeline/document_chunker.py +336 -336
  49. package/.claude/roles/ai-engineer/skills/02-rag-pipeline/rag_pipeline.sql +213 -213
  50. package/.claude/roles/ai-engineer/skills/03-agent-orchestration/README.md +599 -599
  51. package/.claude/roles/ai-engineer/skills/04-llm-guardrails/README.md +735 -735
  52. package/.claude/roles/ai-engineer/skills/05-vector-embeddings/README.md +711 -711
  53. package/.claude/roles/ai-engineer/skills/06-llm-evaluation/README.md +777 -777
  54. package/.claude/roles/azure/skills/01-infrastructure-fundamentals/README.md +264 -264
  55. package/.claude/roles/azure/skills/02-data-factory/README.md +264 -264
  56. package/.claude/roles/azure/skills/03-synapse-analytics/README.md +264 -264
  57. package/.claude/roles/azure/skills/04-databricks/README.md +264 -264
  58. package/.claude/roles/azure/skills/05-functions/README.md +264 -264
  59. package/.claude/roles/azure/skills/06-kubernetes-service/README.md +264 -264
  60. package/.claude/roles/azure/skills/07-openai-service/README.md +264 -264
  61. package/.claude/roles/azure/skills/08-machine-learning/README.md +264 -264
  62. package/.claude/roles/azure/skills/09-storage-adls/README.md +264 -264
  63. package/.claude/roles/azure/skills/10-networking/README.md +264 -264
  64. package/.claude/roles/azure/skills/11-sql-cosmos/README.md +264 -264
  65. package/.claude/roles/azure/skills/12-event-hubs/README.md +264 -264
  66. package/.claude/roles/code-review/skills/01-automated-code-review/README.md +394 -394
  67. package/.claude/roles/code-review/skills/02-pr-review-workflow/README.md +427 -427
  68. package/.claude/roles/code-review/skills/03-code-quality-gates/README.md +518 -518
  69. package/.claude/roles/code-review/skills/04-reviewer-assignment/README.md +504 -504
  70. package/.claude/roles/code-review/skills/05-review-analytics/README.md +540 -540
  71. package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/README.md +550 -550
  72. package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/bronze_ingestion.py +337 -337
  73. package/.claude/roles/data-engineer/skills/01-lakehouse-architecture/medallion_queries.sql +300 -300
  74. package/.claude/roles/data-engineer/skills/02-etl-pipeline/README.md +580 -580
  75. package/.claude/roles/data-engineer/skills/03-data-quality/README.md +579 -579
  76. package/.claude/roles/data-engineer/skills/04-streaming-pipelines/README.md +608 -608
  77. package/.claude/roles/data-engineer/skills/05-performance-optimization/README.md +547 -547
  78. package/.claude/roles/data-governance/skills/01-data-catalog/README.md +112 -112
  79. package/.claude/roles/data-governance/skills/02-data-lineage/README.md +129 -129
  80. package/.claude/roles/data-governance/skills/03-data-quality-framework/README.md +182 -182
  81. package/.claude/roles/data-governance/skills/04-access-control/README.md +39 -39
  82. package/.claude/roles/data-governance/skills/05-master-data-management/README.md +40 -40
  83. package/.claude/roles/data-governance/skills/06-compliance-privacy/README.md +46 -46
  84. package/.claude/roles/data-scientist/skills/01-eda-automation/README.md +230 -230
  85. package/.claude/roles/data-scientist/skills/01-eda-automation/eda_generator.py +446 -446
  86. package/.claude/roles/data-scientist/skills/02-statistical-modeling/README.md +264 -264
  87. package/.claude/roles/data-scientist/skills/03-feature-engineering/README.md +264 -264
  88. package/.claude/roles/data-scientist/skills/04-predictive-modeling/README.md +264 -264
  89. package/.claude/roles/data-scientist/skills/05-customer-analytics/README.md +264 -264
  90. package/.claude/roles/data-scientist/skills/06-campaign-analysis/README.md +264 -264
  91. package/.claude/roles/data-scientist/skills/07-experimentation/README.md +264 -264
  92. package/.claude/roles/data-scientist/skills/08-data-visualization/README.md +264 -264
  93. package/.claude/roles/devops/skills/01-cicd-pipeline/README.md +264 -264
  94. package/.claude/roles/devops/skills/02-container-orchestration/README.md +264 -264
  95. package/.claude/roles/devops/skills/03-infrastructure-as-code/README.md +264 -264
  96. package/.claude/roles/devops/skills/04-gitops/README.md +264 -264
  97. package/.claude/roles/devops/skills/05-environment-management/README.md +264 -264
  98. package/.claude/roles/devops/skills/06-automated-testing/README.md +264 -264
  99. package/.claude/roles/devops/skills/07-release-management/README.md +264 -264
  100. package/.claude/roles/devops/skills/08-monitoring-alerting/README.md +264 -264
  101. package/.claude/roles/devops/skills/09-devsecops/README.md +265 -265
  102. package/.claude/roles/finops/skills/01-cost-visibility/README.md +264 -264
  103. package/.claude/roles/finops/skills/02-resource-tagging/README.md +264 -264
  104. package/.claude/roles/finops/skills/03-budget-management/README.md +264 -264
  105. package/.claude/roles/finops/skills/04-reserved-instances/README.md +264 -264
  106. package/.claude/roles/finops/skills/05-spot-optimization/README.md +264 -264
  107. package/.claude/roles/finops/skills/06-storage-tiering/README.md +264 -264
  108. package/.claude/roles/finops/skills/07-compute-rightsizing/README.md +264 -264
  109. package/.claude/roles/finops/skills/08-chargeback/README.md +264 -264
  110. package/.claude/roles/ml-engineer/skills/01-mlops-pipeline/README.md +566 -566
  111. package/.claude/roles/ml-engineer/skills/02-feature-engineering/README.md +655 -655
  112. package/.claude/roles/ml-engineer/skills/03-model-training/README.md +704 -704
  113. package/.claude/roles/ml-engineer/skills/04-model-serving/README.md +845 -845
  114. package/.claude/roles/ml-engineer/skills/05-model-monitoring/README.md +874 -874
  115. package/.claude/roles/mlops/skills/01-ml-pipeline-orchestration/README.md +264 -264
  116. package/.claude/roles/mlops/skills/02-experiment-tracking/README.md +264 -264
  117. package/.claude/roles/mlops/skills/03-model-registry/README.md +264 -264
  118. package/.claude/roles/mlops/skills/04-feature-store/README.md +264 -264
  119. package/.claude/roles/mlops/skills/05-model-deployment/README.md +264 -264
  120. package/.claude/roles/mlops/skills/06-model-observability/README.md +264 -264
  121. package/.claude/roles/mlops/skills/07-data-versioning/README.md +264 -264
  122. package/.claude/roles/mlops/skills/08-ab-testing/README.md +264 -264
  123. package/.claude/roles/mlops/skills/09-automated-retraining/README.md +264 -264
  124. package/.claude/roles/platform-engineer/skills/01-internal-developer-platform/README.md +153 -153
  125. package/.claude/roles/platform-engineer/skills/02-self-service-infrastructure/README.md +57 -57
  126. package/.claude/roles/platform-engineer/skills/03-slo-sli-management/README.md +59 -59
  127. package/.claude/roles/platform-engineer/skills/04-developer-experience/README.md +57 -57
  128. package/.claude/roles/platform-engineer/skills/05-incident-management/README.md +73 -73
  129. package/.claude/roles/platform-engineer/skills/06-capacity-management/README.md +59 -59
  130. package/.claude/roles/product-designer/skills/01-requirements-discovery/README.md +407 -407
  131. package/.claude/roles/product-designer/skills/02-user-research/README.md +382 -382
  132. package/.claude/roles/product-designer/skills/03-brainstorming-ideation/README.md +437 -437
  133. package/.claude/roles/product-designer/skills/04-ux-design/README.md +496 -496
  134. package/.claude/roles/product-designer/skills/05-product-market-fit/README.md +376 -376
  135. package/.claude/roles/product-designer/skills/06-stakeholder-management/README.md +412 -412
  136. package/.claude/roles/security-architect/skills/01-pii-detection/README.md +319 -319
  137. package/.claude/roles/security-architect/skills/02-threat-modeling/README.md +264 -264
  138. package/.claude/roles/security-architect/skills/03-infrastructure-security/README.md +264 -264
  139. package/.claude/roles/security-architect/skills/04-iam/README.md +264 -264
  140. package/.claude/roles/security-architect/skills/05-application-security/README.md +264 -264
  141. package/.claude/roles/security-architect/skills/06-secrets-management/README.md +264 -264
  142. package/.claude/roles/security-architect/skills/07-security-monitoring/README.md +264 -264
  143. package/.claude/roles/system-design/skills/01-architecture-patterns/README.md +337 -337
  144. package/.claude/roles/system-design/skills/02-requirements-engineering/README.md +264 -264
  145. package/.claude/roles/system-design/skills/03-scalability/README.md +264 -264
  146. package/.claude/roles/system-design/skills/04-high-availability/README.md +264 -264
  147. package/.claude/roles/system-design/skills/05-cost-optimization-design/README.md +264 -264
  148. package/.claude/roles/system-design/skills/06-api-design/README.md +264 -264
  149. package/.claude/roles/system-design/skills/07-observability-architecture/README.md +264 -264
  150. package/.claude/roles/system-design/skills/08-process-automation/PROCESS_TEMPLATE.md +336 -336
  151. package/.claude/roles/system-design/skills/08-process-automation/README.md +521 -521
  152. package/.claude/roles/system-design/skills/08-process-automation/ai_prompt_generator.py +744 -744
  153. package/.claude/roles/system-design/skills/08-process-automation/automation_recommender.py +688 -688
  154. package/.claude/roles/system-design/skills/08-process-automation/plan_generator.py +679 -679
  155. package/.claude/roles/system-design/skills/08-process-automation/process_analyzer.py +528 -528
  156. package/.claude/roles/system-design/skills/08-process-automation/process_parser.py +684 -684
  157. package/.claude/roles/system-design/skills/08-process-automation/role_matcher.py +615 -615
  158. package/.claude/skills/README.md +336 -336
  159. package/.claude/skills/ai-engineer.md +104 -104
  160. package/.claude/skills/aws.md +143 -143
  161. package/.claude/skills/azure.md +149 -149
  162. package/.claude/skills/backend-developer.md +108 -108
  163. package/.claude/skills/code-review.md +399 -399
  164. package/.claude/skills/compliance-automation.md +747 -747
  165. package/.claude/skills/compliance-officer.md +108 -108
  166. package/.claude/skills/data-engineer.md +113 -113
  167. package/.claude/skills/data-governance.md +102 -102
  168. package/.claude/skills/data-scientist.md +123 -123
  169. package/.claude/skills/database-admin.md +109 -109
  170. package/.claude/skills/devops.md +160 -160
  171. package/.claude/skills/docker.md +160 -160
  172. package/.claude/skills/enterprise-dashboard.md +613 -613
  173. package/.claude/skills/finops.md +184 -184
  174. package/.claude/skills/frontend-developer.md +108 -108
  175. package/.claude/skills/gcp.md +143 -143
  176. package/.claude/skills/ml-engineer.md +115 -115
  177. package/.claude/skills/mlops.md +187 -187
  178. package/.claude/skills/network-engineer.md +109 -109
  179. package/.claude/skills/optimization-advisor.md +329 -329
  180. package/.claude/skills/orchestrator.md +623 -623
  181. package/.claude/skills/platform-engineer.md +102 -102
  182. package/.claude/skills/process-automation.md +226 -226
  183. package/.claude/skills/process-changelog.md +184 -184
  184. package/.claude/skills/process-documentation.md +484 -484
  185. package/.claude/skills/process-kanban.md +324 -324
  186. package/.claude/skills/process-versioning.md +214 -214
  187. package/.claude/skills/product-designer.md +104 -104
  188. package/.claude/skills/project-starter.md +443 -443
  189. package/.claude/skills/qa-engineer.md +109 -109
  190. package/.claude/skills/security-architect.md +135 -135
  191. package/.claude/skills/sre.md +109 -109
  192. package/.claude/skills/system-design.md +126 -126
  193. package/.claude/skills/technical-writer.md +101 -101
  194. package/.gitattributes +2 -2
  195. package/GITHUB_COPILOT.md +106 -106
  196. package/README.md +192 -184
  197. package/package.json +16 -8
package/.claude/roles/ai-engineer/skills/01-prompt-engineering/prompt_ab_tester.py
@@ -1,356 +1,356 @@

Both sides of this hunk are line-for-line identical: 1.5.2 republishes prompt_ab_tester.py with no content changes, so the file is shown once below.

"""
Prompt A/B Testing Framework
Compare prompt variations with statistical significance testing.
"""

import asyncio
import json
from datetime import datetime
from typing import List, Dict, Any, Optional, Callable
from dataclasses import dataclass, asdict
from scipy import stats
import numpy as np
import pandas as pd


@dataclass
class PromptVariant:
    """A prompt variant for A/B testing."""
    id: str
    name: str
    template: str
    metadata: Dict[str, Any] = None

    def __post_init__(self):
        if self.metadata is None:
            self.metadata = {}


@dataclass
class TestResult:
    """Result from testing a single variant."""
    variant_id: str
    response: str
    latency_ms: float
    tokens_used: int
    cost: float
    quality_score: Optional[float] = None
    timestamp: str = None

    def __post_init__(self):
        if self.timestamp is None:
            self.timestamp = datetime.now().isoformat()


class ABTest:
    """A/B test experiment for prompt variants."""

    def __init__(
        self,
        name: str,
        variants: List[PromptVariant],
        evaluation_fn: Optional[Callable] = None
    ):
        self.name = name
        self.variants = {v.id: v for v in variants}
        self.evaluation_fn = evaluation_fn or self._default_evaluation
        self.results: List[TestResult] = []

    def add_result(self, result: TestResult) -> None:
        """Add a test result."""
        self.results.append(result)

    def get_variant_results(self, variant_id: str) -> List[TestResult]:
        """Get all results for a specific variant."""
        return [r for r in self.results if r.variant_id == variant_id]

    def calculate_metrics(self, variant_id: str) -> Dict[str, float]:
        """Calculate aggregate metrics for a variant."""
        results = self.get_variant_results(variant_id)

        if not results:
            return {}

        return {
            "sample_size": len(results),
            "avg_latency_ms": np.mean([r.latency_ms for r in results]),
            "avg_cost": np.mean([r.cost for r in results]),
            "avg_tokens": np.mean([r.tokens_used for r in results]),
            "avg_quality_score": np.mean([r.quality_score for r in results if r.quality_score is not None]),
            "p50_latency": np.percentile([r.latency_ms for r in results], 50),
            "p95_latency": np.percentile([r.latency_ms for r in results], 95),
            "p99_latency": np.percentile([r.latency_ms for r in results], 99),
        }

    def compare_variants(
        self,
        variant_a_id: str,
        variant_b_id: str,
        metric: str = "quality_score"
    ) -> Dict[str, Any]:
        """
        Compare two variants with statistical significance testing.

        Args:
            variant_a_id: First variant ID
            variant_b_id: Second variant ID
            metric: Metric to compare ('quality_score', 'latency_ms', 'cost')

        Returns:
            Comparison results with p-value and effect size
        """
        results_a = self.get_variant_results(variant_a_id)
        results_b = self.get_variant_results(variant_b_id)

        if not results_a or not results_b:
            return {"error": "Insufficient data for comparison"}

        # Extract metric values
        values_a = [getattr(r, metric) for r in results_a if getattr(r, metric) is not None]
        values_b = [getattr(r, metric) for r in results_b if getattr(r, metric) is not None]

        if not values_a or not values_b:
            return {"error": f"No valid {metric} data"}

        # Perform t-test
        t_stat, p_value = stats.ttest_ind(values_a, values_b)

        # Calculate effect size (Cohen's d)
        pooled_std = np.sqrt((np.std(values_a)**2 + np.std(values_b)**2) / 2)
        cohens_d = (np.mean(values_a) - np.mean(values_b)) / pooled_std if pooled_std > 0 else 0

        # Determine winner
        if p_value < 0.05:
            if metric in ["latency_ms", "cost"]:
                # Lower is better
                winner = variant_a_id if np.mean(values_a) < np.mean(values_b) else variant_b_id
            else:
                # Higher is better
                winner = variant_a_id if np.mean(values_a) > np.mean(values_b) else variant_b_id
            significant = True
        else:
            winner = "No significant difference"
            significant = False

        return {
            "variant_a": {
                "id": variant_a_id,
                "mean": float(np.mean(values_a)),
                "std": float(np.std(values_a)),
                "sample_size": len(values_a)
            },
            "variant_b": {
                "id": variant_b_id,
                "mean": float(np.mean(values_b)),
                "std": float(np.std(values_b)),
                "sample_size": len(values_b)
            },
            "t_statistic": float(t_stat),
            "p_value": float(p_value),
            "cohens_d": float(cohens_d),
            "significant": significant,
            "winner": winner,
            "metric": metric,
            "improvement": float((np.mean(values_b) - np.mean(values_a)) / np.mean(values_a) * 100)
        }

    def generate_report(self) -> str:
        """Generate a comprehensive A/B test report."""
        lines = [
            f"šŸ“Š A/B Test Report: {self.name}",
            "=" * 80,
            f"Total Results: {len(self.results)}",
            f"Variants Tested: {len(self.variants)}",
            "\n"
        ]

        # Metrics for each variant
        lines.append("šŸ“ˆ Variant Performance:")
        for variant_id, variant in self.variants.items():
            metrics = self.calculate_metrics(variant_id)
            if metrics:
                lines.append(f"\n  {variant.name} (ID: {variant_id})")
                lines.append(f"    Sample Size: {metrics['sample_size']}")
                lines.append(f"    Avg Quality Score: {metrics['avg_quality_score']:.2f}")
                lines.append(f"    Avg Latency: {metrics['avg_latency_ms']:.0f}ms (p95: {metrics['p95_latency']:.0f}ms)")
                lines.append(f"    Avg Cost: ${metrics['avg_cost']:.4f}")
                lines.append(f"    Avg Tokens: {metrics['avg_tokens']:.0f}")

        # Statistical comparisons
        if len(self.variants) == 2:
            variant_ids = list(self.variants.keys())
            comparison = self.compare_variants(variant_ids[0], variant_ids[1])

            lines.append("\n\nšŸ”¬ Statistical Comparison:")
            lines.append(f"  Metric: {comparison.get('metric', 'N/A')}")
            lines.append(f"  P-Value: {comparison.get('p_value', 0):.4f}")
            lines.append(f"  Cohen's d: {comparison.get('cohens_d', 0):.3f}")
            lines.append(f"  Significant: {'Yes' if comparison.get('significant') else 'No'}")
            lines.append(f"  Winner: {comparison.get('winner', 'N/A')}")
            lines.append(f"  Improvement: {comparison.get('improvement', 0):.1f}%")

        return "\n".join(lines)

    def _default_evaluation(self, response: str) -> float:
        """Default quality evaluation (length-based)."""
        # Simple heuristic: penalize very short or very long responses
        length = len(response)
        if length < 50:
            return 50.0
        elif length > 2000:
            return 70.0
        else:
            return 85.0

    def export_results(self, filepath: str) -> None:
        """Export results to JSON file."""
        data = {
            "name": self.name,
            "variants": {k: asdict(v) for k, v in self.variants.items()},
            "results": [asdict(r) for r in self.results],
            "summary": {
                variant_id: self.calculate_metrics(variant_id)
                for variant_id in self.variants.keys()
            }
        }

        with open(filepath, 'w') as f:
            json.dump(data, f, indent=2)

        print(f"āœ… Results exported to {filepath}")


class ABTestRunner:
    """Runner for executing A/B tests with actual LLM calls."""

    def __init__(self, llm_client: Optional[Any] = None):
        self.llm_client = llm_client

    async def run_test(
        self,
        test: ABTest,
        test_cases: List[Dict[str, Any]],
        samples_per_variant: int = 30
    ) -> ABTest:
        """
        Run an A/B test with multiple test cases.

        Args:
            test: The ABTest instance
            test_cases: List of test case dictionaries with template variables
            samples_per_variant: Number of samples to collect per variant

        Returns:
            The test instance with results
        """
        for variant_id, variant in test.variants.items():
            print(f"Testing variant: {variant.name} ({samples_per_variant} samples)")

            for i, test_case in enumerate(test_cases[:samples_per_variant]):
                # Render prompt with test case
                # In production, use actual template rendering
                prompt = variant.template.format(**test_case)

                # Simulate LLM call (replace with actual API call)
                result = await self._simulate_llm_call(prompt, variant_id)

                # Evaluate quality
                quality_score = test.evaluation_fn(result.response)
                result.quality_score = quality_score

                test.add_result(result)

                if (i + 1) % 10 == 0:
                    print(f"  Progress: {i + 1}/{samples_per_variant}")

        return test

    async def _simulate_llm_call(
        self,
        prompt: str,
        variant_id: str
    ) -> TestResult:
        """Simulate an LLM API call (replace with actual implementation)."""
        # In production, call actual LLM API here
        await asyncio.sleep(0.1)  # Simulate API latency

        # Generate mock response
        response = f"This is a simulated response for variant {variant_id}. " + "Sample content. " * 20

        return TestResult(
            variant_id=variant_id,
            response=response,
            latency_ms=np.random.uniform(200, 800),
            tokens_used=np.random.randint(100, 500),
            cost=np.random.uniform(0.001, 0.01)
        )


# Example usage
if __name__ == "__main__":
    # Define variants
    variant_a = PromptVariant(
        id="v1_concise",
        name="Concise Prompt",
        template="""Analyze this marketing campaign: {campaign_details}

Provide brief recommendations.""",
        metadata={"author": "Team A", "hypothesis": "Shorter prompts = faster responses"}
    )

    variant_b = PromptVariant(
        id="v2_detailed",
        name="Detailed Prompt",
        template="""You are a marketing analytics expert. Analyze the following campaign in detail.

Campaign Details: {campaign_details}

Please provide:
1. Performance assessment
2. Key insights
3. Specific recommendations
4. Action items

Be thorough and data-driven.""",
        metadata={"author": "Team B", "hypothesis": "Detailed prompts = better quality"}
    )

    # Create test
    test = ABTest(
        name="Marketing Prompt Optimization Q1 2025",
        variants=[variant_a, variant_b]
    )

    # Define test cases
    test_cases = [
        {
            "campaign_details": f"Campaign {i}: Budget $10K, CTR 2.5%, Conv 1.2%"
        }
        for i in range(50)
    ]

    # Run test
    runner = ABTestRunner()

    async def run():
        test_with_results = await runner.run_test(
            test=test,
            test_cases=test_cases,
            samples_per_variant=25
        )

        # Generate report
        print("\n" + test_with_results.generate_report())

        # Compare variants
        print("\n" + "=" * 80)
        comparison = test_with_results.compare_variants("v1_concise", "v2_detailed", "quality_score")
        print(f"\nšŸ† Winner: {comparison['winner']}")
        print(f"šŸ“Š Statistical Significance: {'Yes āœ“' if comparison['significant'] else 'No āœ—'}")
        print(f"šŸ“ˆ Improvement: {comparison['improvement']:.1f}%")

        # Export results
        test_with_results.export_results("ab_test_results.json")

    # Run async test
    asyncio.run(run())
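
Note on the statistics in compare_variants: the published code runs scipy's ttest_ind with its default equal-variance (Student's) assumption and derives Cohen's d from population standard deviations (np.std, ddof=0). A minimal sketch, not part of the package, of the more conservative Welch's t-test paired with a sample-standard-deviation effect size:

import numpy as np
from scipy import stats

def welch_compare(values_a, values_b, alpha=0.05):
    """Compare two metric samples without assuming equal variances."""
    # Welch's t-test: equal_var=False drops the pooled-variance assumption
    t_stat, p_value = stats.ttest_ind(values_a, values_b, equal_var=False)

    # Cohen's d using the unbiased sample variance (ddof=1)
    pooled_std = np.sqrt((np.var(values_a, ddof=1) + np.var(values_b, ddof=1)) / 2)
    cohens_d = (np.mean(values_a) - np.mean(values_b)) / pooled_std if pooled_std > 0 else 0.0

    return {
        "t_statistic": float(t_stat),
        "p_value": float(p_value),
        "cohens_d": float(cohens_d),
        "significant": bool(p_value < alpha),
    }

# Example with two mock quality-score samples:
print(welch_compare([82, 85, 88, 90, 84], [78, 80, 83, 79, 81]))

With only a few dozen samples per variant this mostly moves borderline p-values; the winner logic in compare_variants would work unchanged on top of it.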
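
run_test renders prompts with variant.template.format(**test_case), and its comment points to "actual template rendering" for production. A hedged sketch of that swap, assuming Jinja2 as the engine (the package itself names none); Jinja2 templates would use {{ campaign_details }} rather than {campaign_details} placeholders:

from jinja2 import Environment, StrictUndefined

# StrictUndefined makes a missing template variable raise an error
# instead of silently rendering as an empty string.
_env = Environment(undefined=StrictUndefined)

def render_prompt(template: str, variables: dict) -> str:
    """Render a prompt template, failing loudly on missing variables."""
    return _env.from_string(template).render(**variables)

# Inside ABTestRunner.run_test this would replace:
#   prompt = variant.template.format(**test_case)
# with:
#   prompt = render_prompt(variant.template, test_case)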
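
ABTestRunner accepts an llm_client but the published file never calls it; _simulate_llm_call is explicitly marked "replace with actual implementation". One way to wire in a real backend is to override that method in a subclass. This is a sketch only: the client interface (an async complete() returning an object with .text and .total_tokens) and the flat per-token price are hypothetical, to be adapted to whatever SDK and model you actually use:

import time

class LiveABTestRunner(ABTestRunner):
    """ABTestRunner that hits a real LLM client instead of the simulator."""

    async def _simulate_llm_call(self, prompt: str, variant_id: str) -> TestResult:
        start = time.perf_counter()
        # Hypothetical client call; substitute your SDK's completion method
        reply = await self.llm_client.complete(prompt)
        latency_ms = (time.perf_counter() - start) * 1000

        return TestResult(
            variant_id=variant_id,
            response=reply.text,
            latency_ms=latency_ms,
            tokens_used=reply.total_tokens,
            cost=reply.total_tokens * 0.000002,  # hypothetical flat per-token rate
        )

Constructed as LiveABTestRunner(llm_client=my_client), it drives run_test() and the reporting above unchanged.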