@musashishao/agent-kit 1.8.1 → 1.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. package/.agent/agents/ai-architect.md +39 -0
  2. package/.agent/agents/cloud-engineer.md +39 -0
  3. package/.agent/agents/game-asset-curator.md +317 -0
  4. package/.agent/agents/game-developer.md +190 -89
  5. package/.agent/agents/game-narrative-designer.md +310 -0
  6. package/.agent/agents/game-qa-agent.md +441 -0
  7. package/.agent/agents/marketing-specialist.md +41 -0
  8. package/.agent/agents/penetration-tester.md +15 -1
  9. package/.agent/rules/CODEX.md +26 -2
  10. package/.agent/rules/GEMINI.md +7 -5
  11. package/.agent/rules/REFERENCE.md +92 -2
  12. package/.agent/scripts/ak_cli.py +1 -1
  13. package/.agent/scripts/localize_workflows.py +54 -0
  14. package/.agent/scripts/memory_manager.py +24 -1
  15. package/.agent/skills/3d-web-experience/SKILL.md +386 -0
  16. package/.agent/skills/DEPENDENCIES.md +54 -0
  17. package/.agent/skills/ab-test-setup/SKILL.md +77 -0
  18. package/.agent/skills/active-directory-attacks/SKILL.md +59 -0
  19. package/.agent/skills/agent-evaluation/SKILL.md +430 -0
  20. package/.agent/skills/agent-memory-systems/SKILL.md +426 -0
  21. package/.agent/skills/agent-tool-builder/SKILL.md +139 -0
  22. package/.agent/skills/ai-agents-architect/SKILL.md +115 -0
  23. package/.agent/skills/ai-product/SKILL.md +86 -0
  24. package/.agent/skills/ai-wrapper-product/SKILL.md +90 -0
  25. package/.agent/skills/analytics-tracking/SKILL.md +88 -0
  26. package/.agent/skills/api-fuzzing-bug-bounty/SKILL.md +66 -0
  27. package/.agent/skills/app-store-optimization/SKILL.md +66 -0
  28. package/.agent/skills/autonomous-agent-patterns/SKILL.md +414 -0
  29. package/.agent/skills/aws-penetration-testing/SKILL.md +50 -0
  30. package/.agent/skills/aws-serverless/SKILL.md +327 -0
  31. package/.agent/skills/azure-functions/SKILL.md +340 -0
  32. package/.agent/skills/broken-authentication/SKILL.md +53 -0
  33. package/.agent/skills/browser-automation/SKILL.md +408 -0
  34. package/.agent/skills/browser-extension-builder/SKILL.md +422 -0
  35. package/.agent/skills/bullmq-specialist/SKILL.md +424 -0
  36. package/.agent/skills/bun-development/SKILL.md +386 -0
  37. package/.agent/skills/burp-suite-testing/SKILL.md +60 -0
  38. package/.agent/skills/clerk-auth/SKILL.md +432 -0
  39. package/.agent/skills/cloud-penetration-testing/SKILL.md +51 -0
  40. package/.agent/skills/copywriting/SKILL.md +66 -0
  41. package/.agent/skills/crewai/SKILL.md +470 -0
  42. package/.agent/skills/discord-bot-architect/SKILL.md +447 -0
  43. package/.agent/skills/email-sequence/SKILL.md +73 -0
  44. package/.agent/skills/ethical-hacking-methodology/SKILL.md +67 -0
  45. package/.agent/skills/firebase/SKILL.md +377 -0
  46. package/.agent/skills/game-development/godot-expert/SKILL.md +462 -0
  47. package/.agent/skills/game-development/npc-ai-integration/SKILL.md +110 -0
  48. package/.agent/skills/game-development/procedural-generation/SKILL.md +168 -0
  49. package/.agent/skills/game-development/unity-integration/SKILL.md +358 -0
  50. package/.agent/skills/game-development/webgpu-shading/SKILL.md +209 -0
  51. package/.agent/skills/gcp-cloud-run/SKILL.md +358 -0
  52. package/.agent/skills/graphql/SKILL.md +492 -0
  53. package/.agent/skills/idor-testing/SKILL.md +64 -0
  54. package/.agent/skills/inngest/SKILL.md +128 -0
  55. package/.agent/skills/langfuse/SKILL.md +415 -0
  56. package/.agent/skills/langgraph/SKILL.md +360 -0
  57. package/.agent/skills/launch-strategy/SKILL.md +68 -0
  58. package/.agent/skills/linux-privilege-escalation/SKILL.md +62 -0
  59. package/.agent/skills/llm-app-patterns/SKILL.md +367 -0
  60. package/.agent/skills/marketing-ideas/SKILL.md +66 -0
  61. package/.agent/skills/metasploit-framework/SKILL.md +60 -0
  62. package/.agent/skills/micro-saas-launcher/SKILL.md +93 -0
  63. package/.agent/skills/neon-postgres/SKILL.md +339 -0
  64. package/.agent/skills/paid-ads/SKILL.md +64 -0
  65. package/.agent/skills/supabase-integration/SKILL.md +411 -0
  66. package/.agent/workflows/ai-agent.md +36 -0
  67. package/.agent/workflows/autofix.md +1 -0
  68. package/.agent/workflows/brainstorm.md +1 -0
  69. package/.agent/workflows/context.md +1 -0
  70. package/.agent/workflows/create.md +1 -0
  71. package/.agent/workflows/dashboard.md +1 -0
  72. package/.agent/workflows/debug.md +1 -0
  73. package/.agent/workflows/deploy.md +1 -0
  74. package/.agent/workflows/enhance.md +1 -0
  75. package/.agent/workflows/game-prototype.md +154 -0
  76. package/.agent/workflows/marketing.md +37 -0
  77. package/.agent/workflows/next.md +1 -0
  78. package/.agent/workflows/orchestrate.md +1 -0
  79. package/.agent/workflows/pentest.md +37 -0
  80. package/.agent/workflows/plan.md +1 -0
  81. package/.agent/workflows/preview.md +2 -1
  82. package/.agent/workflows/quality.md +1 -0
  83. package/.agent/workflows/saas.md +36 -0
  84. package/.agent/workflows/spec.md +1 -0
  85. package/.agent/workflows/status.md +1 -0
  86. package/.agent/workflows/test.md +1 -0
  87. package/.agent/workflows/ui-ux-pro-max.md +1 -0
  88. package/README.md +52 -24
  89. package/bin/cli.js +68 -3
  90. package/docs/CHANGELOG_AI_INFRA.md +30 -0
  91. package/docs/MIGRATION_GUIDE_V1.9.md +55 -0
  92. package/package.json +1 -1
@@ -0,0 +1,77 @@
1
+ ---
2
+ name: ab-test-setup
3
+ description: "Expertise in designing and implementing A/B tests. Covers hypothesis generation, statistical significance, and split-testing infrastructure."
4
+ version: "1.0.0"
5
+ ---
6
+
7
+ # 🧪 A/B Test Setup
8
+
9
+ You are a Conversion Rate Optimization (CRO) expert. You use controlled experiments to prove what works. You design tests with clear hypotheses and understand the math behind statistical significance.
10
+
11
+ ---
12
+
13
+ ## When to Use This Skill
14
+
15
+ - Testing landing page headlines and CTAs
16
+ - Comparing UI layouts or color schemes
17
+ - Optimizing onboarding flows
18
+ - Testing different pricing models
19
+ - Experiments with AI prompt variations
20
+
21
+ ---
22
+
23
+ ## Capabilities
24
+
25
+ - `experiment-design`
26
+ - `statistical-significance`
27
+ - `split-testing`
28
+ - `feature-flagging`
29
+ - `variant-management`
30
+
31
+ ---
32
+
33
+ ## 1. Hypothesis Framework
34
+
35
+ Every test must start with a structured hypothesis:
36
+ > "If we **[change X]**, then **[metric Y]** will **[increase/decrease]** because **[reason Z]**."
37
+
38
+ **Example:**
39
+ "If we **add a 'No Credit Card Required' badge** to the sign-up button, then **Sign-up Rate** will **increase** because it **reduces user friction and perceived risk**."
40
+
41
+ ---
42
+
43
+ ## 2. Technical Implementation (Feature Flags)
44
+
45
+ ```typescript
46
+ // useExperiment.ts
47
+ import { useFeatureFlagVariantKey } from 'posthog-js/react'
48
+
49
+ export function PricingComponent() {
50
+ const variant = useFeatureFlagVariantKey('pricing-test')
51
+
52
+ if (variant === 'test-variant') {
53
+ return <NewPricingDesign />
54
+ }
55
+
56
+ return <OriginalPricingDesign />
57
+ }
58
+ ```
59
+
60
+ ---
61
+
62
+ ## 3. Statistical Checklist
63
+
64
+ | Concept | Requirement |
65
+ |---------|-------------|
66
+ | **Sample Size** | Ensure enough users to reach significance using a Power Calculator. |
67
+ | **Duration** | Run for at least one full business cycle (usually 7-14 days). |
68
+ | **Significance** | Aim for 95% confidence level before calling a winner. |
69
+ | **Primary Metric** | Only one main success metric per test to avoid p-hacking. |
70
+
71
+ ---
72
+
73
+ ## Related Skills
74
+
75
+ - `analytics-tracking` - Essential for measuring test results
76
+ - `ai-product` - Testing AI features
77
+ - `copywriting` - Creating the test variants
@@ -0,0 +1,59 @@
1
+ ---
2
+ name: active-directory-attacks
3
+ description: "Techniques for attacking Windows Active Directory environments: LLMNR/NBT-NS Poisoning, Kerberoasting, AS-REP Roasting, and BloodHound analysis."
4
+ version: "1.0.0"
5
+ ---
6
+
7
+ # 🏢 Active Directory Attacks
8
+
9
+ You are an internal penetration tester. You know that in a Windows environment, Active Directory (AD) is the "Crown Jewels". Your goal is to move from a domain user to a Domain Admin.
10
+
11
+ ---
12
+
13
+ ## Common Attack Vectors
14
+
15
+ ### 1. LLMNR/NBT-NS Poisoning (Responder)
16
+ Exploit Windows' fallback name resolution: when a DNS lookup fails (e.g. a user mistypes a network share path), the host broadcasts LLMNR/NBT-NS queries that an attacker can answer.
17
+ - **Tool**: `sudo responder -I eth0 -rdw`
18
+ - **Result**: Capture hashes of users trying to authenticate.
19
+
20
+ ### 2. Kerberoasting
21
+ Requesting service tickets (TGS) for accounts with a Service Principal Name (SPN).
22
+ - **Goal**: Crack the offline hashes to get cleartext passwords of service accounts.
23
+ - **Tool**: `GetUserSPNs.py` (Impacket).
24
+
25
+ ### 3. AS-REP Roasting
26
+ Targeting accounts with Kerberos pre-authentication disabled ("Do not require Kerberos preauthentication"), whose AS-REP responses can be requested and cracked offline.
27
+ - **Tool**: `GetNPUsers.py`.
28
+
29
+ ### 4. BloodHound Analysis
30
+ Using graph theory to find hidden relationships in AD.
31
+ - **Process**: Run the SharpHound collector -> Import the data into BloodHound -> Query "Shortest Path to Domain Admins".
32
+
33
+ ---
34
+
35
+ ## Post-Exploitation
36
+
37
+ | Technique | Tool | Description |
38
+ |-----------|------|-------------|
39
+ | **Mimikatz** | `mimikatz.exe` | Dumping passwords from memory (LSASS). |
40
+ | **Pass-the-Hash** | `psexec.py` | Using a hash to authenticate instead of a password. |
41
+ | **Golden Ticket** | Mimikatz | Creating a fake TGT that grants Domain Admin access forever. |
42
+
43
+ ---
44
+
45
+ ## Defense Matrix
46
+
47
+ | Attack | Defense |
48
+ |--------|---------|
49
+ | Poisoning | Disable LLMNR and NBT-NS globally. |
50
+ | Kerberoasting | Use strong, 25+ character passwords for service accounts. |
51
+ | Mimikatz | Enable LSA Protection and Credential Guard. |
52
+
53
+ ---
54
+
55
+ ## Related Skills
56
+
57
+ - `ethical-hacking-methodology` - Internal pentest phase
58
+ - `metasploit-framework` - For pivoting and lateral movement
59
+ - `linux-privilege-escalation` - Cross-platform lateral movement
@@ -0,0 +1,430 @@
1
+ ---
2
+ name: agent-evaluation
3
+ description: "Evaluate and benchmark AI agents systematically. Covers task completion metrics, quality assessment, regression testing, and A/B testing for agent improvements."
4
+ version: "1.0.0"
5
+ ---
6
+
7
+ # 📊 Agent Evaluation
8
+
9
+ You are an expert in evaluating AI agents. You understand that agents are non-deterministic and require specialized testing approaches. You design reproducible benchmarks, catch regressions, and measure what matters.
10
+
11
+ ---
12
+
13
+ ## When to Use This Skill
14
+
15
+ - Measuring agent task completion rates
16
+ - Comparing agent versions (A/B testing)
17
+ - Catching regressions before deployment
18
+ - Benchmarking against baselines
19
+ - Evaluating agent cost vs quality tradeoffs
20
+
21
+ ---
22
+
23
+ ## Capabilities
24
+
25
+ - `agent-benchmarking`
26
+ - `task-evaluation`
27
+ - `quality-metrics`
28
+ - `regression-testing`
29
+ - `ab-testing`
30
+ - `cost-analysis`
31
+
32
+ ---
33
+
34
+ ## 1. Evaluation Framework
35
+
36
+ ### Core Metrics
37
+
38
+ | Metric | Description | Target |
39
+ |--------|-------------|--------|
40
+ | **Task Completion** | % tasks fully completed | > 90% |
41
+ | **Accuracy** | Correctness of outputs | > 95% |
42
+ | **Latency** | Time to complete task | < 30s |
43
+ | **Cost** | $ per task | Minimize |
44
+ | **Error Rate** | % tasks with errors | < 5% |
45
+ | **User Satisfaction** | Feedback score | > 4/5 |
46
+
47
+ ### Evaluation Pipeline
48
+
49
+ ```python
50
+ # evaluation.py
51
+ from dataclasses import dataclass
52
+ from typing import List, Dict, Any
53
+ import json
54
+
55
+ @dataclass
56
+ class EvalTask:
57
+ id: str
58
+ input: str
59
+ expected_output: str
60
+ category: str
61
+ difficulty: str # easy, medium, hard
62
+
63
+ @dataclass
64
+ class EvalResult:
65
+ task_id: str
66
+ passed: bool
67
+ actual_output: str
68
+ score: float
69
+ latency_ms: float
70
+ tokens_used: int
71
+ cost_usd: float
72
+ error: str | None = None
73
+
74
+ class AgentEvaluator:
75
+ def __init__(self, agent, tasks: List[EvalTask]):
76
+ self.agent = agent
77
+ self.tasks = tasks
78
+ self.results: List[EvalResult] = []
79
+
80
+ async def run_evaluation(self) -> Dict[str, Any]:
81
+ for task in self.tasks:
82
+ result = await self.evaluate_task(task)
83
+ self.results.append(result)
84
+
85
+ return self.generate_report()
86
+
87
+ async def evaluate_task(self, task: EvalTask) -> EvalResult:
88
+ start_time = time.time()
89
+
90
+ try:
91
+ response = await self.agent.run(task.input)
92
+ latency = (time.time() - start_time) * 1000
93
+
94
+ # Score the response
95
+ score = self.score_response(response, task.expected_output)
96
+ passed = score >= 0.8 # 80% threshold
97
+
98
+ return EvalResult(
99
+ task_id=task.id,
100
+ passed=passed,
101
+ actual_output=response.output,
102
+ score=score,
103
+ latency_ms=latency,
104
+ tokens_used=response.usage.total_tokens,
105
+ cost_usd=self.calculate_cost(response.usage)
106
+ )
107
+ except Exception as e:
108
+ return EvalResult(
109
+ task_id=task.id,
110
+ passed=False,
111
+ actual_output="",
112
+ score=0.0,
113
+ latency_ms=0,
114
+ tokens_used=0,
115
+ cost_usd=0,
116
+ error=str(e)
117
+ )
118
+
119
+ def score_response(self, actual: str, expected: str) -> float:
120
+ """Score response using multiple methods"""
121
+ scores = []
122
+
123
+ # Exact match
124
+ if actual.strip() == expected.strip():
125
+ return 1.0
126
+
127
+ # Semantic similarity
128
+ scores.append(self.semantic_similarity(actual, expected))
129
+
130
+ # Key phrase matching
131
+ scores.append(self.key_phrase_score(actual, expected))
132
+
133
+ # LLM-as-judge
134
+ scores.append(self.llm_judge_score(actual, expected))
135
+
136
+ return sum(scores) / len(scores)
137
+
138
+ def generate_report(self) -> Dict[str, Any]:
139
+ passed = sum(1 for r in self.results if r.passed)
140
+ total = len(self.results)
141
+
142
+ return {
143
+ "summary": {
144
+ "total_tasks": total,
145
+ "passed": passed,
146
+ "failed": total - passed,
147
+ "pass_rate": passed / total if total > 0 else 0,
148
+ "avg_score": sum(r.score for r in self.results) / total,
149
+ "avg_latency_ms": sum(r.latency_ms for r in self.results) / total,
150
+ "total_cost_usd": sum(r.cost_usd for r in self.results),
151
+ },
152
+ "by_category": self.group_by_category(),
153
+ "by_difficulty": self.group_by_difficulty(),
154
+ "failures": [r for r in self.results if not r.passed],
155
+ }
156
+ ```
157
+
158
+ ---
159
+
160
+ ## 2. Test Dataset Design
161
+
162
+ ### Task Categories
163
+
164
+ ```yaml
165
+ # eval_tasks.yaml
166
+ tasks:
167
+ # Code generation
168
+ - id: code_001
169
+ category: code_generation
170
+ difficulty: easy
171
+ input: "Write a Python function to reverse a string"
172
+ expected_output: |
173
+ def reverse_string(s: str) -> str:
174
+ return s[::-1]
175
+ eval_method: functional_test
176
+ test_cases:
177
+ - input: "hello"
178
+ output: "olleh"
179
+
180
+ # Information retrieval
181
+ - id: rag_001
182
+ category: rag
183
+ difficulty: medium
184
+ input: "What are the benefits of RAG for LLMs?"
185
+ expected_keywords:
186
+ - "grounding"
187
+ - "factual"
188
+ - "retrieval"
189
+ - "context"
190
+ eval_method: keyword_coverage
191
+
192
+ # Multi-step reasoning
193
+ - id: reasoning_001
194
+ category: reasoning
195
+ difficulty: hard
196
+ input: "If A implies B, and B implies C, and A is true, what can we conclude about C?"
197
+ expected_output: "C is true"
198
+ eval_method: exact_match
199
+
200
+ # Tool use
201
+ - id: tool_001
202
+ category: tool_use
203
+ difficulty: medium
204
+ input: "Search for the weather in Tokyo and tell me if I need an umbrella"
205
+ expected_tool_calls:
206
+ - name: search_weather
207
+ args: { location: "Tokyo" }
208
+ eval_method: tool_call_match
209
+ ```
210
+
211
+ ---
212
+
213
+ ## 3. LLM-as-Judge
214
+
215
+ ```python
216
+ JUDGE_PROMPT = """
217
+ You are evaluating an AI agent's response.
218
+
219
+ Task: {task}
220
+ Expected Output: {expected}
221
+ Actual Output: {actual}
222
+
223
+ Rate the response on these criteria (1-5):
224
+ 1. Correctness: Is the answer factually correct?
225
+ 2. Completeness: Does it fully address the task?
226
+ 3. Clarity: Is it clear and well-structured?
227
+ 4. Relevance: Is it focused on the task?
228
+
229
+ Provide your ratings in JSON format:
230
+ {
231
+ "correctness": <1-5>,
232
+ "completeness": <1-5>,
233
+ "clarity": <1-5>,
234
+ "relevance": <1-5>,
235
+ "overall": <1-5>,
236
+ "reasoning": "<brief explanation>"
237
+ }
238
+ """
239
+
240
+ async def llm_judge_score(task: str, expected: str, actual: str) -> float:
241
+ prompt = JUDGE_PROMPT.format(
242
+ task=task,
243
+ expected=expected,
244
+ actual=actual
245
+ )
246
+
247
+ response = await llm.generate(
248
+ prompt,
249
+ response_format={"type": "json_object"}
250
+ )
251
+
252
+ result = json.loads(response.content)
253
+ return result["overall"] / 5.0 # Normalize to 0-1
254
+ ```
255
+
256
+ ---
257
+
258
+ ## 4. Regression Testing
259
+
260
+ ```python
261
+ # regression.py
262
+ class RegressionSuite:
263
+ def __init__(self, baseline_results: str):
264
+ self.baseline = self.load_baseline(baseline_results)
265
+
266
+ def compare(self, new_results: Dict) -> Dict:
267
+ """Compare new results against baseline"""
268
+ comparison = {
269
+ "improved": [],
270
+ "regressed": [],
271
+ "unchanged": [],
272
+ "summary": {}
273
+ }
274
+
275
+ for task_id, new_result in new_results.items():
276
+ baseline_result = self.baseline.get(task_id)
277
+
278
+ if not baseline_result:
279
+ comparison["improved"].append(task_id)
280
+ continue
281
+
282
+ if new_result.score > baseline_result.score + 0.05:
283
+ comparison["improved"].append(task_id)
284
+ elif new_result.score < baseline_result.score - 0.05:
285
+ comparison["regressed"].append(task_id)
286
+ else:
287
+ comparison["unchanged"].append(task_id)
288
+
289
+ comparison["summary"] = {
290
+ "improved_count": len(comparison["improved"]),
291
+ "regressed_count": len(comparison["regressed"]),
292
+ "unchanged_count": len(comparison["unchanged"]),
293
+ "regression_detected": len(comparison["regressed"]) > 0
294
+ }
295
+
296
+ return comparison
297
+
298
+ def should_block_deployment(self, comparison: Dict) -> bool:
299
+ """Determine if regressions should block deployment"""
300
+ regressed = comparison["regressed_count"]
301
+ total = sum([
302
+ comparison["improved_count"],
303
+ comparison["regressed_count"],
304
+ comparison["unchanged_count"]
305
+ ])
306
+
307
+ regression_rate = regressed / total if total > 0 else 0
308
+
309
+ # Block if > 5% regression rate
310
+ return regression_rate > 0.05
311
+ ```
312
+
313
+ ---
314
+
315
+ ## 5. A/B Testing Agents
316
+
317
+ ```python
318
+ # ab_testing.py
319
+ import random
320
+ from typing import Literal
321
+
322
+ class AgentABTest:
323
+ def __init__(self, agent_a, agent_b, split: float = 0.5):
324
+ self.agent_a = agent_a
325
+ self.agent_b = agent_b
326
+ self.split = split
327
+ self.results_a = []
328
+ self.results_b = []
329
+
330
+ def get_agent(self, user_id: str) -> Literal["A", "B"]:
331
+ """Deterministic assignment based on user_id"""
332
+ hash_val = hash(user_id) % 100
333
+ return "A" if hash_val < (self.split * 100) else "B"
334
+
335
+ async def run(self, user_id: str, task: str):
336
+ variant = self.get_agent(user_id)
337
+ agent = self.agent_a if variant == "A" else self.agent_b
338
+
339
+ result = await agent.run(task)
340
+
341
+ if variant == "A":
342
+ self.results_a.append(result)
343
+ else:
344
+ self.results_b.append(result)
345
+
346
+ return result, variant
347
+
348
+ def get_statistics(self) -> Dict:
349
+ return {
350
+ "variant_a": {
351
+ "count": len(self.results_a),
352
+ "avg_score": self.avg_score(self.results_a),
353
+ "avg_latency": self.avg_latency(self.results_a),
354
+ "avg_cost": self.avg_cost(self.results_a),
355
+ },
356
+ "variant_b": {
357
+ "count": len(self.results_b),
358
+ "avg_score": self.avg_score(self.results_b),
359
+ "avg_latency": self.avg_latency(self.results_b),
360
+ "avg_cost": self.avg_cost(self.results_b),
361
+ },
362
+ "winner": self.determine_winner(),
363
+ "statistical_significance": self.calculate_significance(),
364
+ }
365
+ ```
366
+
367
+ ---
368
+
369
+ ## 6. CI/CD Integration
370
+
371
+ ```yaml
372
+ # .github/workflows/agent-eval.yml
373
+ name: Agent Evaluation
374
+
375
+ on:
376
+ pull_request:
377
+ paths:
378
+ - 'agents/**'
379
+ - 'prompts/**'
380
+
381
+ jobs:
382
+ evaluate:
383
+ runs-on: ubuntu-latest
384
+ steps:
385
+ - uses: actions/checkout@v4
386
+
387
+ - name: Run Agent Evaluation
388
+ run: python evaluate.py --tasks eval_tasks.yaml
389
+ env:
390
+ OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
391
+
392
+ - name: Compare with Baseline
393
+ run: python regression.py --baseline baseline_results.json
394
+
395
+ - name: Upload Results
396
+ uses: actions/upload-artifact@v4
397
+ with:
398
+ name: eval-results
399
+ path: results/
400
+
401
+ - name: Check Regression
402
+ run: |
403
+ if python check_regression.py; then
404
+ echo "✅ No regressions detected"
405
+ else
406
+ echo "❌ Regressions detected"
407
+ exit 1
408
+ fi
409
+ ```
410
+
411
+ ---
412
+
413
+ ## 7. Checklist
414
+
415
+ | Check | Description |
416
+ |-------|-------------|
417
+ | ✅ Diverse test set | Multiple categories and difficulties |
418
+ | ✅ Reproducible | Same inputs give consistent baseline |
419
+ | ✅ Multiple metrics | Not just pass/fail |
420
+ | ✅ Regression tracking | Compare against baseline |
421
+ | ✅ Cost tracking | Monitor $ per task |
422
+ | ✅ CI integration | Automated on PRs |
423
+
424
+ ---
425
+
426
+ ## Related Skills
427
+
428
+ - `llm-app-patterns` - LLM architecture
429
+ - `testing-patterns` - General testing
430
+ - `langfuse` - LLM observability