@musashishao/agent-kit 1.8.1 → 1.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agent/agents/ai-architect.md +39 -0
- package/.agent/agents/cloud-engineer.md +39 -0
- package/.agent/agents/game-asset-curator.md +317 -0
- package/.agent/agents/game-developer.md +190 -89
- package/.agent/agents/game-narrative-designer.md +310 -0
- package/.agent/agents/game-qa-agent.md +441 -0
- package/.agent/agents/marketing-specialist.md +41 -0
- package/.agent/agents/penetration-tester.md +15 -1
- package/.agent/rules/CODEX.md +26 -2
- package/.agent/rules/GEMINI.md +7 -5
- package/.agent/rules/REFERENCE.md +92 -2
- package/.agent/scripts/ak_cli.py +1 -1
- package/.agent/scripts/localize_workflows.py +54 -0
- package/.agent/scripts/memory_manager.py +24 -1
- package/.agent/skills/3d-web-experience/SKILL.md +386 -0
- package/.agent/skills/DEPENDENCIES.md +54 -0
- package/.agent/skills/ab-test-setup/SKILL.md +77 -0
- package/.agent/skills/active-directory-attacks/SKILL.md +59 -0
- package/.agent/skills/agent-evaluation/SKILL.md +430 -0
- package/.agent/skills/agent-memory-systems/SKILL.md +426 -0
- package/.agent/skills/agent-tool-builder/SKILL.md +139 -0
- package/.agent/skills/ai-agents-architect/SKILL.md +115 -0
- package/.agent/skills/ai-product/SKILL.md +86 -0
- package/.agent/skills/ai-wrapper-product/SKILL.md +90 -0
- package/.agent/skills/analytics-tracking/SKILL.md +88 -0
- package/.agent/skills/api-fuzzing-bug-bounty/SKILL.md +66 -0
- package/.agent/skills/app-store-optimization/SKILL.md +66 -0
- package/.agent/skills/autonomous-agent-patterns/SKILL.md +414 -0
- package/.agent/skills/aws-penetration-testing/SKILL.md +50 -0
- package/.agent/skills/aws-serverless/SKILL.md +327 -0
- package/.agent/skills/azure-functions/SKILL.md +340 -0
- package/.agent/skills/broken-authentication/SKILL.md +53 -0
- package/.agent/skills/browser-automation/SKILL.md +408 -0
- package/.agent/skills/browser-extension-builder/SKILL.md +422 -0
- package/.agent/skills/bullmq-specialist/SKILL.md +424 -0
- package/.agent/skills/bun-development/SKILL.md +386 -0
- package/.agent/skills/burp-suite-testing/SKILL.md +60 -0
- package/.agent/skills/clerk-auth/SKILL.md +432 -0
- package/.agent/skills/cloud-penetration-testing/SKILL.md +51 -0
- package/.agent/skills/copywriting/SKILL.md +66 -0
- package/.agent/skills/crewai/SKILL.md +470 -0
- package/.agent/skills/discord-bot-architect/SKILL.md +447 -0
- package/.agent/skills/email-sequence/SKILL.md +73 -0
- package/.agent/skills/ethical-hacking-methodology/SKILL.md +67 -0
- package/.agent/skills/firebase/SKILL.md +377 -0
- package/.agent/skills/game-development/godot-expert/SKILL.md +462 -0
- package/.agent/skills/game-development/npc-ai-integration/SKILL.md +110 -0
- package/.agent/skills/game-development/procedural-generation/SKILL.md +168 -0
- package/.agent/skills/game-development/unity-integration/SKILL.md +358 -0
- package/.agent/skills/game-development/webgpu-shading/SKILL.md +209 -0
- package/.agent/skills/gcp-cloud-run/SKILL.md +358 -0
- package/.agent/skills/graphql/SKILL.md +492 -0
- package/.agent/skills/idor-testing/SKILL.md +64 -0
- package/.agent/skills/inngest/SKILL.md +128 -0
- package/.agent/skills/langfuse/SKILL.md +415 -0
- package/.agent/skills/langgraph/SKILL.md +360 -0
- package/.agent/skills/launch-strategy/SKILL.md +68 -0
- package/.agent/skills/linux-privilege-escalation/SKILL.md +62 -0
- package/.agent/skills/llm-app-patterns/SKILL.md +367 -0
- package/.agent/skills/marketing-ideas/SKILL.md +66 -0
- package/.agent/skills/metasploit-framework/SKILL.md +60 -0
- package/.agent/skills/micro-saas-launcher/SKILL.md +93 -0
- package/.agent/skills/neon-postgres/SKILL.md +339 -0
- package/.agent/skills/paid-ads/SKILL.md +64 -0
- package/.agent/skills/supabase-integration/SKILL.md +411 -0
- package/.agent/workflows/ai-agent.md +36 -0
- package/.agent/workflows/autofix.md +1 -0
- package/.agent/workflows/brainstorm.md +1 -0
- package/.agent/workflows/context.md +1 -0
- package/.agent/workflows/create.md +1 -0
- package/.agent/workflows/dashboard.md +1 -0
- package/.agent/workflows/debug.md +1 -0
- package/.agent/workflows/deploy.md +1 -0
- package/.agent/workflows/enhance.md +1 -0
- package/.agent/workflows/game-prototype.md +154 -0
- package/.agent/workflows/marketing.md +37 -0
- package/.agent/workflows/next.md +1 -0
- package/.agent/workflows/orchestrate.md +1 -0
- package/.agent/workflows/pentest.md +37 -0
- package/.agent/workflows/plan.md +1 -0
- package/.agent/workflows/preview.md +2 -1
- package/.agent/workflows/quality.md +1 -0
- package/.agent/workflows/saas.md +36 -0
- package/.agent/workflows/spec.md +1 -0
- package/.agent/workflows/status.md +1 -0
- package/.agent/workflows/test.md +1 -0
- package/.agent/workflows/ui-ux-pro-max.md +1 -0
- package/README.md +52 -24
- package/bin/cli.js +68 -3
- package/docs/CHANGELOG_AI_INFRA.md +30 -0
- package/docs/MIGRATION_GUIDE_V1.9.md +55 -0
- package/package.json +1 -1
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: ab-test-setup
|
|
3
|
+
description: "Expertise in designing and implementing A/B tests. Covers hypothesis generation, statistical significance, and split-testing infrastructure."
|
|
4
|
+
version: "1.0.0"
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# 🧪 A/B Test Setup
|
|
8
|
+
|
|
9
|
+
You are a Conversion Rate Optimization (CRO) expert. You use controlled experiments to prove what works. You design tests with clear hypotheses and understand the math behind statistical significance.
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## When to Use This Skill
|
|
14
|
+
|
|
15
|
+
- Testing landing page headlines and CTAs
|
|
16
|
+
- Comparing UI layouts or color schemes
|
|
17
|
+
- Optimizing onboarding flows
|
|
18
|
+
- Testing different pricing models
|
|
19
|
+
- Experiments with AI prompt variations
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## Capabilities
|
|
24
|
+
|
|
25
|
+
- `experiment-design`
|
|
26
|
+
- `statistical-significance`
|
|
27
|
+
- `split-testing`
|
|
28
|
+
- `feature-flagging`
|
|
29
|
+
- `variant-management`
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
## 1. Hypothesis Framework
|
|
34
|
+
|
|
35
|
+
Every test must start with a structured hypothesis:
|
|
36
|
+
> "If we **[change X]**, then **[metric Y]** will **[increase/decrease]** because **[reason Z]**."
|
|
37
|
+
|
|
38
|
+
**Example:**
|
|
39
|
+
"If we **add a 'No Credit Card Required' badge** to the sign-up button, then **Sign-up Rate** will **increase** because it **reduces user friction and perceived risk**."
|
|
40
|
+
|
|
41
|
+
---
|
|
42
|
+
|
|
43
|
+
## 2. Technical Implementation (Feature Flags)
|
|
44
|
+
|
|
45
|
+
```typescript
|
|
46
|
+
// PricingComponent.tsx
|
|
47
|
+
import { useFeatureFlagVariantKey } from 'posthog-js/react'
|
|
48
|
+
|
|
49
|
+
/**
 * Pricing section gated by the `pricing-test` feature flag.
 *
 * Renders <NewPricingDesign /> only when the flag resolves to the
 * `test-variant` key; every other value falls back to
 * <OriginalPricingDesign />.
 */
export function PricingComponent() {
  const assignedVariant = useFeatureFlagVariantKey('pricing-test')
  const showNewDesign = assignedVariant === 'test-variant'

  return showNewDesign ? <NewPricingDesign /> : <OriginalPricingDesign />
}
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
## 3. Statistical Checklist
|
|
63
|
+
|
|
64
|
+
| Concept | Requirement |
|
|
65
|
+
|---------|-------------|
|
|
66
|
+
| **Sample Size** | Ensure enough users to reach significance using a Power Calculator. |
|
|
67
|
+
| **Duration** | Run for at least one full business cycle (usually 7-14 days). |
|
|
68
|
+
| **Significance** | Aim for 95% confidence level before calling a winner. |
|
|
69
|
+
| **Primary Metric** | Only one main success metric per test to avoid p-hacking. |
|
|
70
|
+
|
|
71
|
+
---
|
|
72
|
+
|
|
73
|
+
## Related Skills
|
|
74
|
+
|
|
75
|
+
- `analytics-tracking` - Essential for measuring test results
|
|
76
|
+
- `ai-product` - Testing AI features
|
|
77
|
+
- `copywriting` - Creating the test variants
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: active-directory-attacks
|
|
3
|
+
description: "Techniques for attacking Windows Active Directory environments: LLMNR/NBT-NS Poisoning, Kerberoasting, AS-REP Roasting, and BloodHound analysis."
|
|
4
|
+
version: "1.0.0"
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# 🏢 Active Directory Attacks
|
|
8
|
+
|
|
9
|
+
You are an internal penetration tester. You know that in a Windows environment, Active Directory (AD) is the "Crown Jewels". Your goal is to move from a domain user to a Domain Admin.
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## Common Attack Vectors
|
|
14
|
+
|
|
15
|
+
### 1. LLMNR/NBT-NS Poisoning (Responder)
|
|
16
|
+
Wait for a user to misspell a network share path.
|
|
17
|
+
- **Tool**: `sudo responder -I eth0 -rdw`
|
|
18
|
+
- **Result**: Capture hashes of users trying to authenticate.
|
|
19
|
+
|
|
20
|
+
### 2. Kerberoasting
|
|
21
|
+
Requesting service tickets (TGS) for accounts with a Service Principal Name (SPN).
|
|
22
|
+
- **Goal**: Crack the offline hashes to get cleartext passwords of service accounts.
|
|
23
|
+
- **Tool**: `GetUserSPNs.py` (Impacket).
|
|
24
|
+
|
|
25
|
+
### 3. AS-REP Roasting
|
|
26
|
+
Targeting accounts that have Kerberos pre-authentication disabled.
|
|
27
|
+
- **Tool**: `GetNPUsers.py`.
|
|
28
|
+
|
|
29
|
+
### 4. BloodHound Analysis
|
|
30
|
+
Using graph theory to find hidden relationships in AD.
|
|
31
|
+
- **Process**: Run Sharphound collector -> Import to BloodHound -> Search for "Shortest Path to Domain Admins".
|
|
32
|
+
|
|
33
|
+
---
|
|
34
|
+
|
|
35
|
+
## Post-Exploitation
|
|
36
|
+
|
|
37
|
+
| Technique | Tool | Description |
|
|
38
|
+
|-----------|------|-------------|
|
|
39
|
+
| **Mimikatz** | `mimikatz.exe` | Dumping passwords from memory (LSASS). |
|
|
40
|
+
| **Pass-the-Hash** | `psexec.py` | Using a hash to authenticate instead of a password. |
|
|
41
|
+
| **Golden Ticket** | Mimikatz | Creating a forged TGT that grants Domain Admin access until the krbtgt password is rotated twice. |
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## Defense Matrix
|
|
46
|
+
|
|
47
|
+
| Attack | Defense |
|
|
48
|
+
|--------|---------|
|
|
49
|
+
| Poisoning | Disable LLMNR and NBT-NS globally. |
|
|
50
|
+
| Kerberoasting | Use strong, 25+ character passwords for service accounts. |
|
|
51
|
+
| Mimikatz | Enable LSA Protection and Credential Guard. |
|
|
52
|
+
|
|
53
|
+
---
|
|
54
|
+
|
|
55
|
+
## Related Skills
|
|
56
|
+
|
|
57
|
+
- `ethical-hacking-methodology` - Internal pentest phase
|
|
58
|
+
- `metasploit-framework` - For pivoting and lateral movement
|
|
59
|
+
- `linux-privilege-escalation` - Cross-platform lateral movement
|
|
@@ -0,0 +1,430 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: agent-evaluation
|
|
3
|
+
description: "Evaluate and benchmark AI agents systematically. Covers task completion metrics, quality assessment, regression testing, and A/B testing for agent improvements."
|
|
4
|
+
version: "1.0.0"
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# 📊 Agent Evaluation
|
|
8
|
+
|
|
9
|
+
You are an expert in evaluating AI agents. You understand that agents are non-deterministic and require specialized testing approaches. You design reproducible benchmarks, catch regressions, and measure what matters.
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## When to Use This Skill
|
|
14
|
+
|
|
15
|
+
- Measuring agent task completion rates
|
|
16
|
+
- Comparing agent versions (A/B testing)
|
|
17
|
+
- Catching regressions before deployment
|
|
18
|
+
- Benchmarking against baselines
|
|
19
|
+
- Evaluating agent cost vs quality tradeoffs
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## Capabilities
|
|
24
|
+
|
|
25
|
+
- `agent-benchmarking`
|
|
26
|
+
- `task-evaluation`
|
|
27
|
+
- `quality-metrics`
|
|
28
|
+
- `regression-testing`
|
|
29
|
+
- `ab-testing`
|
|
30
|
+
- `cost-analysis`
|
|
31
|
+
|
|
32
|
+
---
|
|
33
|
+
|
|
34
|
+
## 1. Evaluation Framework
|
|
35
|
+
|
|
36
|
+
### Core Metrics
|
|
37
|
+
|
|
38
|
+
| Metric | Description | Target |
|
|
39
|
+
|--------|-------------|--------|
|
|
40
|
+
| **Task Completion** | % tasks fully completed | > 90% |
|
|
41
|
+
| **Accuracy** | Correctness of outputs | > 95% |
|
|
42
|
+
| **Latency** | Time to complete task | < 30s |
|
|
43
|
+
| **Cost** | $ per task | Minimize |
|
|
44
|
+
| **Error Rate** | % tasks with errors | < 5% |
|
|
45
|
+
| **User Satisfaction** | Feedback score | > 4/5 |
|
|
46
|
+
|
|
47
|
+
### Evaluation Pipeline
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
# evaluation.py
|
|
51
|
+
import json
import time
from dataclasses import dataclass
from typing import List, Dict, Any
|
|
54
|
+
|
|
55
|
+
@dataclass
|
|
56
|
+
class EvalTask:
|
|
57
|
+
id: str
|
|
58
|
+
input: str
|
|
59
|
+
expected_output: str
|
|
60
|
+
category: str
|
|
61
|
+
difficulty: str # easy, medium, hard
|
|
62
|
+
|
|
63
|
+
@dataclass
|
|
64
|
+
class EvalResult:
|
|
65
|
+
task_id: str
|
|
66
|
+
passed: bool
|
|
67
|
+
actual_output: str
|
|
68
|
+
score: float
|
|
69
|
+
latency_ms: float
|
|
70
|
+
tokens_used: int
|
|
71
|
+
cost_usd: float
|
|
72
|
+
error: str | None = None
|
|
73
|
+
|
|
74
|
+
class AgentEvaluator:
|
|
75
|
+
def __init__(self, agent, tasks: List[EvalTask]):
|
|
76
|
+
self.agent = agent
|
|
77
|
+
self.tasks = tasks
|
|
78
|
+
self.results: List[EvalResult] = []
|
|
79
|
+
|
|
80
|
+
async def run_evaluation(self) -> Dict[str, Any]:
|
|
81
|
+
for task in self.tasks:
|
|
82
|
+
result = await self.evaluate_task(task)
|
|
83
|
+
self.results.append(result)
|
|
84
|
+
|
|
85
|
+
return self.generate_report()
|
|
86
|
+
|
|
87
|
+
async def evaluate_task(self, task: EvalTask) -> EvalResult:
|
|
88
|
+
start_time = time.time()
|
|
89
|
+
|
|
90
|
+
try:
|
|
91
|
+
response = await self.agent.run(task.input)
|
|
92
|
+
latency = (time.time() - start_time) * 1000
|
|
93
|
+
|
|
94
|
+
# Score the response
|
|
95
|
+
score = self.score_response(response, task.expected_output)
|
|
96
|
+
passed = score >= 0.8 # 80% threshold
|
|
97
|
+
|
|
98
|
+
return EvalResult(
|
|
99
|
+
task_id=task.id,
|
|
100
|
+
passed=passed,
|
|
101
|
+
actual_output=response.output,
|
|
102
|
+
score=score,
|
|
103
|
+
latency_ms=latency,
|
|
104
|
+
tokens_used=response.usage.total_tokens,
|
|
105
|
+
cost_usd=self.calculate_cost(response.usage)
|
|
106
|
+
)
|
|
107
|
+
except Exception as e:
|
|
108
|
+
return EvalResult(
|
|
109
|
+
task_id=task.id,
|
|
110
|
+
passed=False,
|
|
111
|
+
actual_output="",
|
|
112
|
+
score=0.0,
|
|
113
|
+
latency_ms=0,
|
|
114
|
+
tokens_used=0,
|
|
115
|
+
cost_usd=0,
|
|
116
|
+
error=str(e)
|
|
117
|
+
)
|
|
118
|
+
|
|
119
|
+
def score_response(self, actual: str, expected: str) -> float:
|
|
120
|
+
"""Score response using multiple methods"""
|
|
121
|
+
scores = []
|
|
122
|
+
|
|
123
|
+
# Exact match
|
|
124
|
+
if actual.strip() == expected.strip():
|
|
125
|
+
return 1.0
|
|
126
|
+
|
|
127
|
+
# Semantic similarity
|
|
128
|
+
scores.append(self.semantic_similarity(actual, expected))
|
|
129
|
+
|
|
130
|
+
# Key phrase matching
|
|
131
|
+
scores.append(self.key_phrase_score(actual, expected))
|
|
132
|
+
|
|
133
|
+
# LLM-as-judge
|
|
134
|
+
scores.append(self.llm_judge_score(actual, expected))
|
|
135
|
+
|
|
136
|
+
return sum(scores) / len(scores)
|
|
137
|
+
|
|
138
|
+
def generate_report(self) -> Dict[str, Any]:
|
|
139
|
+
passed = sum(1 for r in self.results if r.passed)
|
|
140
|
+
total = len(self.results)
|
|
141
|
+
|
|
142
|
+
return {
|
|
143
|
+
"summary": {
|
|
144
|
+
"total_tasks": total,
|
|
145
|
+
"passed": passed,
|
|
146
|
+
"failed": total - passed,
|
|
147
|
+
"pass_rate": passed / total if total > 0 else 0,
|
|
148
|
+
"avg_score": sum(r.score for r in self.results) / total,
|
|
149
|
+
"avg_latency_ms": sum(r.latency_ms for r in self.results) / total,
|
|
150
|
+
"total_cost_usd": sum(r.cost_usd for r in self.results),
|
|
151
|
+
},
|
|
152
|
+
"by_category": self.group_by_category(),
|
|
153
|
+
"by_difficulty": self.group_by_difficulty(),
|
|
154
|
+
"failures": [r for r in self.results if not r.passed],
|
|
155
|
+
}
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
---
|
|
159
|
+
|
|
160
|
+
## 2. Test Dataset Design
|
|
161
|
+
|
|
162
|
+
### Task Categories
|
|
163
|
+
|
|
164
|
+
```yaml
|
|
165
|
+
# eval_tasks.yaml
|
|
166
|
+
tasks:
|
|
167
|
+
# Code generation
|
|
168
|
+
- id: code_001
|
|
169
|
+
category: code_generation
|
|
170
|
+
difficulty: easy
|
|
171
|
+
input: "Write a Python function to reverse a string"
|
|
172
|
+
expected_output: |
|
|
173
|
+
def reverse_string(s: str) -> str:
|
|
174
|
+
return s[::-1]
|
|
175
|
+
eval_method: functional_test
|
|
176
|
+
test_cases:
|
|
177
|
+
- input: "hello"
|
|
178
|
+
output: "olleh"
|
|
179
|
+
|
|
180
|
+
# Information retrieval
|
|
181
|
+
- id: rag_001
|
|
182
|
+
category: rag
|
|
183
|
+
difficulty: medium
|
|
184
|
+
input: "What are the benefits of RAG for LLMs?"
|
|
185
|
+
expected_keywords:
|
|
186
|
+
- "grounding"
|
|
187
|
+
- "factual"
|
|
188
|
+
- "retrieval"
|
|
189
|
+
- "context"
|
|
190
|
+
eval_method: keyword_coverage
|
|
191
|
+
|
|
192
|
+
# Multi-step reasoning
|
|
193
|
+
- id: reasoning_001
|
|
194
|
+
category: reasoning
|
|
195
|
+
difficulty: hard
|
|
196
|
+
input: "If A implies B, and B implies C, and A is true, what can we conclude about C?"
|
|
197
|
+
expected_output: "C is true"
|
|
198
|
+
eval_method: exact_match
|
|
199
|
+
|
|
200
|
+
# Tool use
|
|
201
|
+
- id: tool_001
|
|
202
|
+
category: tool_use
|
|
203
|
+
difficulty: medium
|
|
204
|
+
input: "Search for the weather in Tokyo and tell me if I need an umbrella"
|
|
205
|
+
expected_tool_calls:
|
|
206
|
+
- name: search_weather
|
|
207
|
+
args: { location: "Tokyo" }
|
|
208
|
+
eval_method: tool_call_match
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
---
|
|
212
|
+
|
|
213
|
+
## 3. LLM-as-Judge
|
|
214
|
+
|
|
215
|
+
```python
|
|
216
|
+
# Prompt template for the judge model. It is rendered with str.format(), so the
# literal braces of the JSON example are doubled ({{ and }}); only {task},
# {expected} and {actual} are substitution fields.
JUDGE_PROMPT = """
You are evaluating an AI agent's response.

Task: {task}
Expected Output: {expected}
Actual Output: {actual}

Rate the response on these criteria (1-5):
1. Correctness: Is the answer factually correct?
2. Completeness: Does it fully address the task?
3. Clarity: Is it clear and well-structured?
4. Relevance: Is it focused on the task?

Provide your ratings in JSON format:
{{
"correctness": <1-5>,
"completeness": <1-5>,
"clarity": <1-5>,
"relevance": <1-5>,
"overall": <1-5>,
"reasoning": "<brief explanation>"
}}
"""


async def llm_judge_score(task: str, expected: str, actual: str) -> float:
    """Ask the judge model to rate ``actual`` and return a score in [0, 1].

    Args:
        task: The task text shown to the judge.
        expected: Reference answer.
        actual: The agent's answer being rated.

    Returns:
        The judge's ``overall`` rating divided by 5, clamped to [0.0, 1.0].

    Raises:
        json.JSONDecodeError: If the judge reply is not valid JSON.
        KeyError: If the reply has no ``overall`` field.
    """
    prompt = JUDGE_PROMPT.format(task=task, expected=expected, actual=actual)

    # `llm` is a client object expected to be in module scope; it is not
    # defined in this snippet.
    response = await llm.generate(
        prompt,
        response_format={"type": "json_object"},
    )

    verdict = json.loads(response.content)

    # Normalize the 1-5 rating to 0-1, and clamp in case the judge answers
    # outside the requested range.
    normalized = float(verdict["overall"]) / 5.0
    return min(1.0, max(0.0, normalized))
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
---
|
|
257
|
+
|
|
258
|
+
## 4. Regression Testing
|
|
259
|
+
|
|
260
|
+
```python
|
|
261
|
+
# regression.py
|
|
262
|
+
class RegressionSuite:
    """Compare a new evaluation run against stored baseline results.

    ``load_baseline`` is called by ``__init__`` but is not defined in this
    snippet; it must return a mapping of task id to a result object that has
    a ``score`` attribute.
    """

    def __init__(
        self,
        baseline_results: str,
        score_tolerance: float = 0.05,
        max_regression_rate: float = 0.05,
    ):
        """
        Args:
            baseline_results: Passed unchanged to ``self.load_baseline``.
            score_tolerance: A score must move by more than this amount to
                count as improved or regressed. Defaults to 0.05.
            max_regression_rate: Fraction of regressed tasks above which
                ``should_block_deployment`` returns True. Defaults to 0.05.
        """
        self.score_tolerance = score_tolerance
        self.max_regression_rate = max_regression_rate
        self.baseline = self.load_baseline(baseline_results)

    def compare(self, new_results: Dict) -> Dict:
        """Compare new results against baseline.

        Args:
            new_results: Mapping of task id to a result object with ``score``.

        Returns:
            Dict with ``improved``, ``regressed`` and ``unchanged`` task-id
            lists plus a ``summary`` dict holding their counts and a
            ``regression_detected`` flag.
        """
        improved, regressed, unchanged = [], [], []

        for task_id, new_result in new_results.items():
            baseline_result = self.baseline.get(task_id)

            # Tasks with no baseline entry are reported as improved.
            if baseline_result is None:
                improved.append(task_id)
                continue

            if new_result.score > baseline_result.score + self.score_tolerance:
                improved.append(task_id)
            elif new_result.score < baseline_result.score - self.score_tolerance:
                regressed.append(task_id)
            else:
                unchanged.append(task_id)

        return {
            "improved": improved,
            "regressed": regressed,
            "unchanged": unchanged,
            "summary": {
                "improved_count": len(improved),
                "regressed_count": len(regressed),
                "unchanged_count": len(unchanged),
                "regression_detected": len(regressed) > 0,
            },
        }

    def should_block_deployment(self, comparison: Dict) -> bool:
        """Determine if regressions should block deployment.

        Args:
            comparison: Either the dict returned by ``compare()`` or just its
                ``summary`` sub-dict.

        Returns:
            True when the share of regressed tasks exceeds
            ``max_regression_rate``; False otherwise, including when there
            are no tasks.
        """
        # compare() nests the counts under "summary"; fall back to the dict
        # itself so a bare summary is accepted too.
        summary = comparison.get("summary", comparison)

        regressed = summary["regressed_count"]
        total = (
            summary["improved_count"]
            + summary["regressed_count"]
            + summary["unchanged_count"]
        )

        regression_rate = regressed / total if total > 0 else 0

        return regression_rate > self.max_regression_rate
|
|
311
|
+
```
|
|
312
|
+
|
|
313
|
+
---
|
|
314
|
+
|
|
315
|
+
## 5. A/B Testing Agents
|
|
316
|
+
|
|
317
|
+
```python
|
|
318
|
+
# ab_testing.py
|
|
319
|
+
import random
|
|
320
|
+
from typing import Literal
|
|
321
|
+
|
|
322
|
+
class AgentABTest:
    """Route users between two agent variants and collect their results.

    ``avg_score``, ``avg_latency``, ``avg_cost``, ``determine_winner`` and
    ``calculate_significance`` are called by ``get_statistics`` but are not
    defined in this snippet; supply them in a subclass or mixin.
    """

    def __init__(self, agent_a, agent_b, split: float = 0.5):
        """
        Args:
            agent_a: Variant "A"; must expose an awaitable ``run(task)``.
            agent_b: Variant "B"; must expose an awaitable ``run(task)``.
            split: Fraction of users (0.0-1.0) routed to variant "A".

        Raises:
            ValueError: If ``split`` is outside [0, 1].
        """
        if not 0.0 <= split <= 1.0:
            raise ValueError(f"split must be between 0 and 1, got {split!r}")

        self.agent_a = agent_a
        self.agent_b = agent_b
        self.split = split
        self.results_a = []
        self.results_b = []

    def get_agent(self, user_id: str) -> Literal["A", "B"]:
        """Deterministic assignment based on user_id.

        The bucket comes from a SHA-256 digest, not the builtin ``hash()``:
        ``hash()`` on a ``str`` is salted per interpreter process, so it would
        move users between variants on every restart and across workers.
        """
        import hashlib

        digest = hashlib.sha256(user_id.encode("utf-8")).digest()
        bucket = int.from_bytes(digest[:8], "big") % 100
        return "A" if bucket < (self.split * 100) else "B"

    async def run(self, user_id: str, task: str):
        """Run ``task`` on the variant assigned to ``user_id``.

        Returns:
            Tuple ``(result, variant)`` where ``variant`` is "A" or "B". The
            result is also appended to the matching ``results_*`` list.
        """
        variant = self.get_agent(user_id)

        if variant == "A":
            agent, bucket_results = self.agent_a, self.results_a
        else:
            agent, bucket_results = self.agent_b, self.results_b

        result = await agent.run(task)
        bucket_results.append(result)

        return result, variant

    def get_statistics(self) -> Dict:
        """Return per-variant aggregates plus winner and significance."""
        return {
            "variant_a": {
                "count": len(self.results_a),
                "avg_score": self.avg_score(self.results_a),
                "avg_latency": self.avg_latency(self.results_a),
                "avg_cost": self.avg_cost(self.results_a),
            },
            "variant_b": {
                "count": len(self.results_b),
                "avg_score": self.avg_score(self.results_b),
                "avg_latency": self.avg_latency(self.results_b),
                "avg_cost": self.avg_cost(self.results_b),
            },
            "winner": self.determine_winner(),
            "statistical_significance": self.calculate_significance(),
        }
|
|
365
|
+
```
|
|
366
|
+
|
|
367
|
+
---
|
|
368
|
+
|
|
369
|
+
## 6. CI/CD Integration
|
|
370
|
+
|
|
371
|
+
```yaml
|
|
372
|
+
# .github/workflows/agent-eval.yml
|
|
373
|
+
name: Agent Evaluation
|
|
374
|
+
|
|
375
|
+
on:
|
|
376
|
+
pull_request:
|
|
377
|
+
paths:
|
|
378
|
+
- 'agents/**'
|
|
379
|
+
- 'prompts/**'
|
|
380
|
+
|
|
381
|
+
jobs:
|
|
382
|
+
evaluate:
|
|
383
|
+
runs-on: ubuntu-latest
|
|
384
|
+
steps:
|
|
385
|
+
- uses: actions/checkout@v4
|
|
386
|
+
|
|
387
|
+
- name: Run Agent Evaluation
|
|
388
|
+
run: python evaluation.py --tasks eval_tasks.yaml
|
|
389
|
+
env:
|
|
390
|
+
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
|
391
|
+
|
|
392
|
+
- name: Compare with Baseline
|
|
393
|
+
run: python regression.py --baseline baseline_results.json
|
|
394
|
+
|
|
395
|
+
- name: Upload Results
|
|
396
|
+
uses: actions/upload-artifact@v4
|
|
397
|
+
with:
|
|
398
|
+
name: eval-results
|
|
399
|
+
path: results/
|
|
400
|
+
|
|
401
|
+
- name: Check Regression
|
|
402
|
+
run: |
|
|
403
|
+
if python check_regression.py; then
|
|
404
|
+
echo "✅ No regressions detected"
|
|
405
|
+
else
|
|
406
|
+
echo "❌ Regressions detected"
|
|
407
|
+
exit 1
|
|
408
|
+
fi
|
|
409
|
+
```
|
|
410
|
+
|
|
411
|
+
---
|
|
412
|
+
|
|
413
|
+
## 7. Checklist
|
|
414
|
+
|
|
415
|
+
| Check | Description |
|
|
416
|
+
|-------|-------------|
|
|
417
|
+
| ✅ Diverse test set | Multiple categories and difficulties |
|
|
418
|
+
| ✅ Reproducible | Same inputs give consistent baseline |
|
|
419
|
+
| ✅ Multiple metrics | Not just pass/fail |
|
|
420
|
+
| ✅ Regression tracking | Compare against baseline |
|
|
421
|
+
| ✅ Cost tracking | Monitor $ per task |
|
|
422
|
+
| ✅ CI integration | Automated on PRs |
|
|
423
|
+
|
|
424
|
+
---
|
|
425
|
+
|
|
426
|
+
## Related Skills
|
|
427
|
+
|
|
428
|
+
- `llm-app-patterns` - LLM architecture
|
|
429
|
+
- `testing-patterns` - General testing
|
|
430
|
+
- `langfuse` - LLM observability
|