@fenixforce/edition-pro 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/api/approval-routes.d.ts +15 -0
- package/dist/api/fleet-routes.d.ts +23 -0
- package/dist/api/integration-routes.d.ts +17 -0
- package/dist/api/middleware.d.ts +37 -0
- package/dist/boot.d.ts +37 -0
- package/dist/business/approval-queue.d.ts +94 -0
- package/dist/business/arena.d.ts +71 -0
- package/dist/business/best-of-n.d.ts +68 -0
- package/dist/business/brainstorm.d.ts +42 -0
- package/dist/business/compile-checker.d.ts +50 -0
- package/dist/business/debate.d.ts +38 -0
- package/dist/business/fleet-budget.d.ts +69 -0
- package/dist/business/fleet-config.d.ts +125 -0
- package/dist/business/fleet.d.ts +85 -0
- package/dist/business/handoff.d.ts +56 -0
- package/dist/business/hat-system.d.ts +57 -0
- package/dist/business/index.d.ts +44 -0
- package/dist/business/integration-registry.d.ts +66 -0
- package/dist/business/node-pipeline.d.ts +62 -0
- package/dist/business/oracle.d.ts +64 -0
- package/dist/business/roles/index.d.ts +7 -0
- package/dist/business/roles/judge.d.ts +24 -0
- package/dist/business/roles/planner.d.ts +30 -0
- package/dist/business/roles/types.d.ts +37 -0
- package/dist/business/roles/worker.d.ts +25 -0
- package/dist/business/router.d.ts +75 -0
- package/dist/business/shared-memory.d.ts +85 -0
- package/dist/business/status-detector.d.ts +52 -0
- package/dist/business/swarm-registry.d.ts +63 -0
- package/dist/business/templates.d.ts +55 -0
- package/dist/business/workspace-manager.d.ts +105 -0
- package/dist/index.d.ts +21 -0
- package/dist/index.js +638 -0
- package/dist/infrastructure/pty-agent.d.ts +74 -0
- package/dist/migrations/migrate.d.ts +7 -0
- package/dist/migrations/runner.d.ts +49 -0
- package/dist/workspace/worktree.d.ts +69 -0
- package/package.json +37 -0
- package/skills/builtin/academic-researcher/SKILL.md +51 -0
- package/skills/builtin/advanced-recon/SKILL.md +75 -0
- package/skills/builtin/agent-governance/SKILL.md +122 -0
- package/skills/builtin/algorithmic-art/SKILL.md +55 -0
- package/skills/builtin/api-attack-surface-mapper/SKILL.md +88 -0
- package/skills/builtin/api-development/SKILL.md +147 -0
- package/skills/builtin/api-exploit-prover/SKILL.md +74 -0
- package/skills/builtin/api-integration/SKILL.md +73 -0
- package/skills/builtin/api-security-tester/SKILL.md +82 -0
- package/skills/builtin/api-test-executor/SKILL.md +62 -0
- package/skills/builtin/app-store-optimization/SKILL.md +46 -0
- package/skills/builtin/audio-tour-guide/SKILL.md +18 -0
- package/skills/builtin/auth-flow-operator/SKILL.md +70 -0
- package/skills/builtin/autonomous-rag/SKILL.md +21 -0
- package/skills/builtin/backend-development/SKILL.md +265 -0
- package/skills/builtin/binary-analysis-analyst/SKILL.md +61 -0
- package/skills/builtin/binary-analysis-core/SKILL.md +65 -0
- package/skills/builtin/binary-recon/SKILL.md +64 -0
- package/skills/builtin/blackboard-coordination/SKILL.md +56 -0
- package/skills/builtin/blog-to-podcast/SKILL.md +18 -0
- package/skills/builtin/blog-writing/SKILL.md +36 -0
- package/skills/builtin/brainstorming/SKILL.md +69 -0
- package/skills/builtin/brand-design/SKILL.md +42 -0
- package/skills/builtin/ci-cd-pipelines/SKILL.md +210 -0
- package/skills/builtin/cloud-infrastructure/SKILL.md +140 -0
- package/skills/builtin/code-review/SKILL.md +88 -0
- package/skills/builtin/code-review-analyst/SKILL.md +96 -0
- package/skills/builtin/code-review-recon/SKILL.md +64 -0
- package/skills/builtin/code-review-verifier/SKILL.md +55 -0
- package/skills/builtin/coding-agent-team/SKILL.md +13 -0
- package/skills/builtin/competitor-intelligence/SKILL.md +39 -0
- package/skills/builtin/content-engine/SKILL.md +82 -0
- package/skills/builtin/context7-docs/SKILL.md +145 -0
- package/skills/builtin/copywriting/SKILL.md +38 -0
- package/skills/builtin/corrective-rag/SKILL.md +19 -0
- package/skills/builtin/cost-optimization/SKILL.md +131 -0
- package/skills/builtin/crypto-vulnerability-analyst/SKILL.md +64 -0
- package/skills/builtin/customer-support/SKILL.md +48 -0
- package/skills/builtin/customer-voice-support/SKILL.md +43 -0
- package/skills/builtin/data-analysis/SKILL.md +57 -0
- package/skills/builtin/data-visualization/SKILL.md +33 -0
- package/skills/builtin/database-design/SKILL.md +119 -0
- package/skills/builtin/decision-helper/SKILL.md +84 -0
- package/skills/builtin/deep-research/SKILL.md +68 -0
- package/skills/builtin/deepwiki-research/SKILL.md +115 -0
- package/skills/builtin/dependency-audit/SKILL.md +46 -0
- package/skills/builtin/doc-coauthoring/SKILL.md +48 -0
- package/skills/builtin/docker-deployment/SKILL.md +243 -0
- package/skills/builtin/docx-generation/SKILL.md +135 -0
- package/skills/builtin/dry-run-harness/SKILL.md +61 -0
- package/skills/builtin/editor/SKILL.md +44 -0
- package/skills/builtin/email-drafter/SKILL.md +42 -0
- package/skills/builtin/error-handling/SKILL.md +82 -0
- package/skills/builtin/eval-harness/SKILL.md +197 -0
- package/skills/builtin/evaluation-framework/SKILL.md +51 -0
- package/skills/builtin/exploit-writer/SKILL.md +63 -0
- package/skills/builtin/fact-checker/SKILL.md +51 -0
- package/skills/builtin/filesystem-context/SKILL.md +47 -0
- package/skills/builtin/financial-coach/SKILL.md +18 -0
- package/skills/builtin/finding-chain-correlator/SKILL.md +70 -0
- package/skills/builtin/finding-verifier/SKILL.md +65 -0
- package/skills/builtin/frontend-design/SKILL.md +104 -0
- package/skills/builtin/frontend-development/SKILL.md +227 -0
- package/skills/builtin/frontend-slides/SKILL.md +155 -0
- package/skills/builtin/fullstack-project/SKILL.md +286 -0
- package/skills/builtin/game-development/SKILL.md +60 -0
- package/skills/builtin/git-workflow/SKILL.md +44 -0
- package/skills/builtin/i18n-localization/SKILL.md +38 -0
- package/skills/builtin/image-prompt-engineering/SKILL.md +37 -0
- package/skills/builtin/investment-research/SKILL.md +33 -0
- package/skills/builtin/investor-materials/SKILL.md +90 -0
- package/skills/builtin/javascript-surface-analyzer/SKILL.md +66 -0
- package/skills/builtin/markdown-reports/SKILL.md +68 -0
- package/skills/builtin/market-research/SKILL.md +69 -0
- package/skills/builtin/mcp-builder/SKILL.md +86 -0
- package/skills/builtin/meeting-notes/SKILL.md +47 -0
- package/skills/builtin/memory-safety-analyst/SKILL.md +61 -0
- package/skills/builtin/meta-controller/SKILL.md +44 -0
- package/skills/builtin/mixture-of-agents/SKILL.md +53 -0
- package/skills/builtin/monitoring-observability/SKILL.md +169 -0
- package/skills/builtin/negotiation-simulator/SKILL.md +24 -0
- package/skills/builtin/nestjs-development/SKILL.md +56 -0
- package/skills/builtin/nextjs-development/SKILL.md +55 -0
- package/skills/builtin/parallel-dispatch/SKILL.md +83 -0
- package/skills/builtin/pdf-generation/SKILL.md +169 -0
- package/skills/builtin/personal-finance/SKILL.md +17 -0
- package/skills/builtin/pev-workflow/SKILL.md +62 -0
- package/skills/builtin/planning-with-files/SKILL.md +59 -0
- package/skills/builtin/pptx-generation/SKILL.md +117 -0
- package/skills/builtin/prisma-orm/SKILL.md +48 -0
- package/skills/builtin/rag-database-routing/SKILL.md +38 -0
- package/skills/builtin/rapid-prototyping/SKILL.md +152 -0
- package/skills/builtin/react-development/SKILL.md +244 -0
- package/skills/builtin/react-native-mobile/SKILL.md +113 -0
- package/skills/builtin/refactoring/SKILL.md +39 -0
- package/skills/builtin/reflexive-metacognition/SKILL.md +29 -0
- package/skills/builtin/riper-workflow/SKILL.md +214 -0
- package/skills/builtin/security-audit/SKILL.md +113 -0
- package/skills/builtin/security-self-audit/SKILL.md +311 -0
- package/skills/builtin/self-evolving-agent/SKILL.md +28 -0
- package/skills/builtin/self-improvement-loop/SKILL.md +58 -0
- package/skills/builtin/semantic-search/SKILL.md +93 -0
- package/skills/builtin/seo-audit-team/SKILL.md +27 -0
- package/skills/builtin/seo-optimization/SKILL.md +49 -0
- package/skills/builtin/server-management/SKILL.md +190 -0
- package/skills/builtin/social-media-content/SKILL.md +50 -0
- package/skills/builtin/sprint-planner/SKILL.md +49 -0
- package/skills/builtin/strategic-compact/SKILL.md +61 -0
- package/skills/builtin/strategy-advisor/SKILL.md +51 -0
- package/skills/builtin/structured-thinking/SKILL.md +70 -0
- package/skills/builtin/subagent-development/SKILL.md +105 -0
- package/skills/builtin/system-design/SKILL.md +66 -0
- package/skills/builtin/systematic-debugging/SKILL.md +87 -0
- package/skills/builtin/tailwind-css/SKILL.md +55 -0
- package/skills/builtin/taint-flow-tracer/SKILL.md +89 -0
- package/skills/builtin/teaching-agent-team/SKILL.md +32 -0
- package/skills/builtin/tech-debt-manager/SKILL.md +67 -0
- package/skills/builtin/technical-documentation/SKILL.md +47 -0
- package/skills/builtin/test-driven-development/SKILL.md +70 -0
- package/skills/builtin/theme-factory/SKILL.md +244 -0
- package/skills/builtin/threat-model-generator/SKILL.md +105 -0
- package/skills/builtin/trust-layer/SKILL.md +43 -0
- package/skills/builtin/typescript-patterns/SKILL.md +61 -0
- package/skills/builtin/ui-ux-design/SKILL.md +75 -0
- package/skills/builtin/verification-before-completion/SKILL.md +41 -0
- package/skills/builtin/verification-loop/SKILL.md +120 -0
- package/skills/builtin/waf-bypass-agent/SKILL.md +97 -0
- package/skills/builtin/web-artifacts-builder/SKILL.md +117 -0
- package/skills/builtin/web-assessment-executor/SKILL.md +66 -0
- package/skills/builtin/web-exploit-prover/SKILL.md +58 -0
- package/skills/builtin/web-scraping/SKILL.md +63 -0
- package/skills/builtin/webapp-testing/SKILL.md +86 -0
- package/skills/builtin/webhook-development/SKILL.md +62 -0
- package/skills/builtin/writing-skills/SKILL.md +67 -0
- package/skills/builtin/xlsx-generation/SKILL.md +116 -0
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
# Eval Harness
|
|
2
|
+
|
|
3
|
+
A formal evaluation framework for Claude Code sessions, implementing eval-driven development (EDD) principles.
|
|
4
|
+
|
|
5
|
+
## When to Activate
|
|
6
|
+
|
|
7
|
+
- Setting up eval-driven development (EDD) for AI-assisted workflows
|
|
8
|
+
- Defining pass/fail criteria for Claude Code task completion
|
|
9
|
+
- Measuring agent reliability with pass@k metrics
|
|
10
|
+
- Creating regression test suites for prompt or agent changes
|
|
11
|
+
- Benchmarking agent performance across model versions
|
|
12
|
+
|
|
13
|
+
## Philosophy
|
|
14
|
+
|
|
15
|
+
Eval-Driven Development treats evals as the "unit tests of AI development":
|
|
16
|
+
- Define expected behavior BEFORE implementation
|
|
17
|
+
- Run evals continuously during development
|
|
18
|
+
- Track regressions with each change
|
|
19
|
+
- Use pass@k metrics for reliability measurement
|
|
20
|
+
|
|
21
|
+
## Eval Types
|
|
22
|
+
|
|
23
|
+
### Capability Evals
|
|
24
|
+
Test if Claude can do something it couldn't before:
|
|
25
|
+
```markdown
|
|
26
|
+
[CAPABILITY EVAL: feature-name]
|
|
27
|
+
Task: Description of what Claude should accomplish
|
|
28
|
+
Success Criteria:
|
|
29
|
+
- [ ] Criterion 1
|
|
30
|
+
- [ ] Criterion 2
|
|
31
|
+
- [ ] Criterion 3
|
|
32
|
+
Expected Output: Description of expected result
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
### Regression Evals
|
|
36
|
+
Ensure changes don't break existing functionality:
|
|
37
|
+
```markdown
|
|
38
|
+
[REGRESSION EVAL: feature-name]
|
|
39
|
+
Baseline: SHA or checkpoint name
|
|
40
|
+
Tests:
|
|
41
|
+
- existing-test-1: PASS/FAIL
|
|
42
|
+
- existing-test-2: PASS/FAIL
|
|
43
|
+
- existing-test-3: PASS/FAIL
|
|
44
|
+
Result: X/Y passed (previously Y/Y)
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Grader Types
|
|
48
|
+
|
|
49
|
+
### 1. Code-Based Grader
|
|
50
|
+
Deterministic checks using code:
|
|
51
|
+
```bash
|
|
52
|
+
# Check if file contains expected pattern
|
|
53
|
+
grep -q "export function handleAuth" src/auth.ts && echo "PASS" || echo "FAIL"
|
|
54
|
+
|
|
55
|
+
# Check if tests pass
|
|
56
|
+
npm test -- --testPathPattern="auth" && echo "PASS" || echo "FAIL"
|
|
57
|
+
|
|
58
|
+
# Check if build succeeds
|
|
59
|
+
npm run build && echo "PASS" || echo "FAIL"
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
### 2. Model-Based Grader
|
|
63
|
+
Use Claude to evaluate open-ended outputs:
|
|
64
|
+
```markdown
|
|
65
|
+
[MODEL GRADER PROMPT]
|
|
66
|
+
Evaluate the following code change:
|
|
67
|
+
1. Does it solve the stated problem?
|
|
68
|
+
2. Is it well-structured?
|
|
69
|
+
3. Are edge cases handled?
|
|
70
|
+
4. Is error handling appropriate?
|
|
71
|
+
|
|
72
|
+
Score: 1-5 (1=poor, 5=excellent)
|
|
73
|
+
Reasoning: [explanation]
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### 3. Human Grader
|
|
77
|
+
Flag for manual review:
|
|
78
|
+
```markdown
|
|
79
|
+
[HUMAN REVIEW REQUIRED]
|
|
80
|
+
Change: Description of what changed
|
|
81
|
+
Reason: Why human review is needed
|
|
82
|
+
Risk Level: LOW/MEDIUM/HIGH
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## Metrics
|
|
86
|
+
|
|
87
|
+
### pass@k
|
|
88
|
+
"At least one success in k attempts"
|
|
89
|
+
- pass@1: First attempt success rate
|
|
90
|
+
- pass@3: Success within 3 attempts
|
|
91
|
+
- Typical target: pass@3 > 90%
|
|
92
|
+
|
|
93
|
+
### pass^k
|
|
94
|
+
"All k trials succeed"
|
|
95
|
+
- Higher bar for reliability
|
|
96
|
+
- pass^3: 3 consecutive successes
|
|
97
|
+
- Use for critical paths
|
|
98
|
+
|
|
99
|
+
## Eval Workflow
|
|
100
|
+
|
|
101
|
+
### 1. Define (Before Coding)
|
|
102
|
+
```markdown
|
|
103
|
+
## EVAL DEFINITION: feature-xyz
|
|
104
|
+
|
|
105
|
+
### Capability Evals
|
|
106
|
+
1. Can create new user account
|
|
107
|
+
2. Can validate email format
|
|
108
|
+
3. Can hash password securely
|
|
109
|
+
|
|
110
|
+
### Regression Evals
|
|
111
|
+
1. Existing login still works
|
|
112
|
+
2. Session management unchanged
|
|
113
|
+
3. Logout flow intact
|
|
114
|
+
|
|
115
|
+
### Success Metrics
|
|
116
|
+
- pass@3 > 90% for capability evals
|
|
117
|
+
- pass^3 = 100% for regression evals
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
### 2. Implement
|
|
121
|
+
Write code to pass the defined evals.
|
|
122
|
+
|
|
123
|
+
### 3. Evaluate
|
|
124
|
+
```bash
|
|
125
|
+
# Run capability evals
|
|
126
|
+
# (Run each capability eval and record PASS/FAIL for each)
|
|
127
|
+
|
|
128
|
+
# Run regression evals
|
|
129
|
+
npm test -- --testPathPattern="existing"
|
|
130
|
+
|
|
131
|
+
# Generate report
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
### 4. Report
|
|
135
|
+
```markdown
|
|
136
|
+
EVAL REPORT: feature-xyz
|
|
137
|
+
========================
|
|
138
|
+
|
|
139
|
+
Capability Evals:
|
|
140
|
+
create-user: PASS (pass@1)
|
|
141
|
+
validate-email: PASS (pass@2)
|
|
142
|
+
hash-password: PASS (pass@1)
|
|
143
|
+
Overall: 3/3 passed
|
|
144
|
+
|
|
145
|
+
Regression Evals:
|
|
146
|
+
login-flow: PASS
|
|
147
|
+
session-mgmt: PASS
|
|
148
|
+
logout-flow: PASS
|
|
149
|
+
Overall: 3/3 passed
|
|
150
|
+
|
|
151
|
+
Metrics:
|
|
152
|
+
pass@1: 67% (2/3)
|
|
153
|
+
pass@3: 100% (3/3)
|
|
154
|
+
|
|
155
|
+
Status: READY FOR REVIEW
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
## Integration Patterns
|
|
159
|
+
|
|
160
|
+
### Pre-Implementation
|
|
161
|
+
```
|
|
162
|
+
/eval define feature-name
|
|
163
|
+
```
|
|
164
|
+
Creates eval definition file at `.claude/evals/feature-name.md`
|
|
165
|
+
|
|
166
|
+
### During Implementation
|
|
167
|
+
```
|
|
168
|
+
/eval check feature-name
|
|
169
|
+
```
|
|
170
|
+
Runs current evals and reports status
|
|
171
|
+
|
|
172
|
+
### Post-Implementation
|
|
173
|
+
```
|
|
174
|
+
/eval report feature-name
|
|
175
|
+
```
|
|
176
|
+
Generates full eval report
|
|
177
|
+
|
|
178
|
+
## Eval Storage
|
|
179
|
+
|
|
180
|
+
Store evals in project:
|
|
181
|
+
```
|
|
182
|
+
.claude/
|
|
183
|
+
evals/
|
|
184
|
+
feature-xyz.md # Eval definition
|
|
185
|
+
feature-xyz.log # Eval run history
|
|
186
|
+
baseline.json # Regression baselines
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
## Best Practices
|
|
190
|
+
|
|
191
|
+
1. **Define evals BEFORE coding** — Forces clear thinking about success criteria
|
|
192
|
+
2. **Run evals frequently** — Catch regressions early
|
|
193
|
+
3. **Track pass@k over time** — Monitor reliability trends
|
|
194
|
+
4. **Use code graders when possible** — Deterministic > probabilistic
|
|
195
|
+
5. **Human review for security** — Never fully automate security checks
|
|
196
|
+
6. **Keep evals fast** — Slow evals don't get run
|
|
197
|
+
7. **Version evals with code** — Evals are first-class artifacts
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: evaluation-framework
|
|
3
|
+
description: "Use this skill when evaluating output quality, comparing approaches, scoring responses, or setting up quality benchmarks. Triggers: 'evaluate', 'assess', 'score', 'compare quality', 'judge', 'benchmark', 'quality check', or requests to measure how good an output is."
|
|
4
|
+
license: MIT
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# Evaluation Framework
|
|
8
|
+
|
|
9
|
+
## What This Skill Does
|
|
10
|
+
|
|
11
|
+
Evaluate LLM outputs and agent work products using structured rubrics. Covers LLM-as-judge scoring, pairwise comparison, and bias mitigation.
|
|
12
|
+
|
|
13
|
+
## Rubric Dimensions
|
|
14
|
+
|
|
15
|
+
| Dimension | 1 (Poor) | 3 (Adequate) | 5 (Excellent) |
|
|
16
|
+
|-----------|----------|--------------|---------------|
|
|
17
|
+
| Accuracy | Factual errors | Mostly correct | Completely correct |
|
|
18
|
+
| Completeness | Missing major elements | Covers basics | Comprehensive |
|
|
19
|
+
| Clarity | Confusing, unclear | Understandable | Clear and well-organized |
|
|
20
|
+
| Relevance | Off-topic | Generally relevant | Precisely targeted |
|
|
21
|
+
| Actionability | No clear next steps | Some guidance | Specific, implementable |
|
|
22
|
+
|
|
23
|
+
## LLM-as-Judge Pattern
|
|
24
|
+
|
|
25
|
+
```
|
|
26
|
+
System: You are an expert evaluator. Score the following output on each dimension from 1-5.
|
|
27
|
+
Provide your reasoning BEFORE your score to avoid anchoring.
|
|
28
|
+
|
|
29
|
+
[Output to evaluate]
|
|
30
|
+
|
|
31
|
+
Score each dimension:
|
|
32
|
+
1. Accuracy: [reasoning] → [score]
|
|
33
|
+
2. Completeness: [reasoning] → [score]
|
|
34
|
+
3. Clarity: [reasoning] → [score]
|
|
35
|
+
4. Relevance: [reasoning] → [score]
|
|
36
|
+
5. Actionability: [reasoning] → [score]
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Bias Mitigation
|
|
40
|
+
|
|
41
|
+
- **Position bias:** Randomize order when comparing options
|
|
42
|
+
- **Length bias:** Longer responses aren't automatically better
|
|
43
|
+
- **Self-enhancement:** Don't let the same model evaluate its own output
|
|
44
|
+
- **Anchoring:** Generate reasoning before scores, not after
|
|
45
|
+
|
|
46
|
+
## Rules
|
|
47
|
+
|
|
48
|
+
- Define evaluation criteria BEFORE looking at outputs
|
|
49
|
+
- Use blind comparison when comparing two approaches
|
|
50
|
+
- Multiple evaluators are better than one (even if all are LLMs)
|
|
51
|
+
- Document the evaluation methodology so that it can be reproduced
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# Exploit Writer
|
|
2
|
+
|
|
3
|
+
## Purpose
|
|
4
|
+
Transform confirmed primitives into reproducible proof-of-exploit artifacts and stepwise execution plans.
|
|
5
|
+
|
|
6
|
+
## Inputs
|
|
7
|
+
- `validated_primitive`
|
|
8
|
+
- `target_context`
|
|
9
|
+
- `environment_constraints`
|
|
10
|
+
- `success_criteria`
|
|
11
|
+
|
|
12
|
+
## Workflow
|
|
13
|
+
### Phase 1: Objective and Boundaries
|
|
14
|
+
1. Define exploit goal (data read, privilege gain, state change).
|
|
15
|
+
2. Define explicit stop condition.
|
|
16
|
+
3. Define prohibited actions and safety constraints.
|
|
17
|
+
|
|
18
|
+
### Phase 2: Chain Design
|
|
19
|
+
1. Break exploit into stages: setup, trigger, control gain, impact verification.
|
|
20
|
+
2. Include fallback branches for unstable stages.
|
|
21
|
+
|
|
22
|
+
### Phase 3: Procedure Authoring
|
|
23
|
+
1. Write deterministic steps with required inputs.
|
|
24
|
+
2. Include expected output per step.
|
|
25
|
+
3. Include failure diagnostics per step.
|
|
26
|
+
|
|
27
|
+
### Phase 4: Robustness Checks
|
|
28
|
+
1. Re-run in fresh session/environment.
|
|
29
|
+
2. Validate whether exploit is deterministic or probabilistic.
|
|
30
|
+
3. Capture conditions that break reliability.
|
|
31
|
+
|
|
32
|
+
### Phase 5: Reporting Package
|
|
33
|
+
1. Provide concise replay instructions.
|
|
34
|
+
2. Provide artifact index.
|
|
35
|
+
3. Provide impact statement tied to observed behavior.
|
|
36
|
+
|
|
37
|
+
## Exploit Procedure Template
|
|
38
|
+
- Preconditions
|
|
39
|
+
- Setup commands/actions
|
|
40
|
+
- Trigger sequence
|
|
41
|
+
- Verification checks
|
|
42
|
+
- Cleanup and rollback
|
|
43
|
+
- Failure troubleshooting
|
|
44
|
+
|
|
45
|
+
## Output Contract
|
|
46
|
+
```json
|
|
47
|
+
{
|
|
48
|
+
"exploit_plan": [],
|
|
49
|
+
"stepwise_procedure": [],
|
|
50
|
+
"success_signals": [],
|
|
51
|
+
"failure_diagnostics": [],
|
|
52
|
+
"safety_notes": []
|
|
53
|
+
}
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Constraints
|
|
57
|
+
- Build only from validated primitives.
|
|
58
|
+
- Do not fabricate impact or reliability.
|
|
59
|
+
|
|
60
|
+
## Quality Checklist
|
|
61
|
+
- [ ] Another tester can replay from instructions.
|
|
62
|
+
- [ ] Preconditions are explicit.
|
|
63
|
+
- [ ] Impact claim matches observed result.
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# Fact Checker
|
|
2
|
+
|
|
3
|
+
## Rating Scale
|
|
4
|
+
|
|
5
|
+
- **TRUE**: accurate, supported by reliable evidence
|
|
6
|
+
- **MOSTLY TRUE**: accurate but missing important context
|
|
7
|
+
- **MIXED**: contains both true and false elements
|
|
8
|
+
- **MOSTLY FALSE**: misleading or largely inaccurate
|
|
9
|
+
- **FALSE**: demonstrably wrong
|
|
10
|
+
- **UNVERIFIABLE**: cannot be confirmed or denied
|
|
11
|
+
|
|
12
|
+
## Verification Process
|
|
13
|
+
|
|
14
|
+
1. **Extract the claim**: isolate the specific factual assertion, separate fact from opinion
|
|
15
|
+
2. **Determine evidence needed**: what would prove/disprove this?
|
|
16
|
+
3. **Evaluate evidence**: check authoritative sources, primary data, publication dates
|
|
17
|
+
4. **Rate the claim**: assess accuracy, note confidence, explain reasoning
|
|
18
|
+
5. **Provide context**: why it matters, common misconceptions, proper interpretation
|
|
19
|
+
|
|
20
|
+
## Manipulation Patterns to Watch
|
|
21
|
+
|
|
22
|
+
- **Cherry-picking**: selective data that supports a predetermined conclusion
|
|
23
|
+
- **Context removal**: quotes taken out of context, missing qualifiers
|
|
24
|
+
- **False equivalences**: treating unequal sources as equally valid
|
|
25
|
+
- **Correlation as causation**: two things happen together therefore one caused the other
|
|
26
|
+
- **Misleading scales**: graphs with truncated axes, inconsistent intervals
|
|
27
|
+
|
|
28
|
+
## Output Format
|
|
29
|
+
|
|
30
|
+
```markdown
|
|
31
|
+
## Claim
|
|
32
|
+
[Exact statement being verified]
|
|
33
|
+
|
|
34
|
+
## Verdict: [RATING]
|
|
35
|
+
|
|
36
|
+
## Analysis
|
|
37
|
+
[Why this rating. Evidence for and against.]
|
|
38
|
+
|
|
39
|
+
## Correct Information
|
|
40
|
+
[If claim is false, what's actually true]
|
|
41
|
+
|
|
42
|
+
## Sources
|
|
43
|
+
[Numbered with credibility notes]
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Rules
|
|
47
|
+
|
|
48
|
+
- Always search for counter-evidence, not just confirming evidence
|
|
49
|
+
- Rate the claim, not the person making it
|
|
50
|
+
- "UNVERIFIABLE" is a valid and honest answer
|
|
51
|
+
- Never present absence of evidence as evidence of absence
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: filesystem-context
|
|
3
|
+
description: "Use this skill for long-running tasks, multi-session work, or any task where context might be lost. Triggers: 'long task', 'context limit', 'multi-session', 'persist', 'remember across sessions', 'pick up where I left off', or any task spanning multiple context windows."
|
|
4
|
+
license: MIT
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# Filesystem Context
|
|
8
|
+
|
|
9
|
+
## What This Skill Does
|
|
10
|
+
|
|
11
|
+
Use the filesystem as extended agent memory. Persist state, decisions, and progress across context windows and sessions.
|
|
12
|
+
|
|
13
|
+
## Directory Convention
|
|
14
|
+
|
|
15
|
+
```
|
|
16
|
+
workspace/
|
|
17
|
+
├── session-state.md # Current task, progress, what to do next
|
|
18
|
+
├── decisions.md # Decisions made and their rationale
|
|
19
|
+
├── open-questions.md # Unresolved questions needing answers
|
|
20
|
+
└── scratchpad.md # Rough notes, temporary data
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Session Start (Orient)
|
|
24
|
+
|
|
25
|
+
```
|
|
26
|
+
1. Read session-state.md → understand current task and progress
|
|
27
|
+
2. Read decisions.md → recall prior decisions (don't re-decide)
|
|
28
|
+
3. Read open-questions.md → identify what needs resolution
|
|
29
|
+
4. Resume work from where the previous session left off
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Session End (Persist)
|
|
33
|
+
|
|
34
|
+
```
|
|
35
|
+
1. Update session-state.md with current progress
|
|
36
|
+
2. Add any new decisions to decisions.md
|
|
37
|
+
3. Update open-questions.md (add new, mark resolved)
|
|
38
|
+
4. Clear scratchpad of obsolete notes
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Rules
|
|
42
|
+
|
|
43
|
+
- Always read state files at session start before doing anything
|
|
44
|
+
- Always update state files before session ends
|
|
45
|
+
- Decisions are durable. Once recorded, don't re-debate them unless new information emerges.
|
|
46
|
+
- Keep session-state.md under 50 lines (summary, not transcript)
|
|
47
|
+
- Use scratchpad.md for temporary reasoning (can be deleted).
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# Financial Coach
|
|
2
|
+
## Multi-Agent Pipeline
|
|
3
|
+
1. **Data Analyzer**: parse financial documents (CSV, statements), compute metrics
|
|
4
|
+
2. **Risk Assessor**: evaluate risk tolerance, current exposure, concentration
|
|
5
|
+
3. **Recommendation Generator**: actionable advice grounded in analyzed data
|
|
6
|
+
|
|
7
|
+
## Visualization Output
|
|
8
|
+
Generate Plotly chart specifications for:
|
|
9
|
+
- Income vs expenses over time (line chart)
|
|
10
|
+
- Expense breakdown (donut chart)
|
|
11
|
+
- Net worth trajectory (area chart)
|
|
12
|
+
- Debt payoff projections (stacked bar)
|
|
13
|
+
|
|
14
|
+
## Rules
|
|
15
|
+
- Every recommendation must reference specific data from the analysis
|
|
16
|
+
- Include both optimistic and conservative projections
|
|
17
|
+
- Note: informational guidance, not professional financial advice
|
|
18
|
+
- Never recommend specific securities or funds.
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# Finding Chain Correlator
|
|
2
|
+
|
|
3
|
+
## Purpose
|
|
4
|
+
Reveal compounding risk that isolated findings understate.
|
|
5
|
+
|
|
6
|
+
## Inputs
|
|
7
|
+
- `finding_set`
|
|
8
|
+
- `application_context`
|
|
9
|
+
- `auth_model`
|
|
10
|
+
- `data_sensitivity_map`
|
|
11
|
+
|
|
12
|
+
## Workflow
|
|
13
|
+
### Phase 1: Normalization
|
|
14
|
+
1. Standardize findings into capability statements.
|
|
15
|
+
2. Extract prerequisites, required role, and affected assets.
|
|
16
|
+
|
|
17
|
+
### Phase 2: Link Construction
|
|
18
|
+
1. Connect findings where the output of one enables the next.
|
|
19
|
+
2. Identify dependency order and branching options.
|
|
20
|
+
3. Reject links lacking technical preconditions.
|
|
21
|
+
|
|
22
|
+
### Phase 3: Chain Validation
|
|
23
|
+
1. Validate each step is feasible in target context.
|
|
24
|
+
2. Validate session and state transitions between steps.
|
|
25
|
+
3. Validate operational reliability of full chain.
|
|
26
|
+
|
|
27
|
+
### Phase 4: Impact Aggregation
|
|
28
|
+
1. Evaluate confidentiality, integrity, and availability impact.
|
|
29
|
+
2. Estimate blast radius and tenant crossover risk.
|
|
30
|
+
3. Rank by attacker effort vs outcome.
|
|
31
|
+
|
|
32
|
+
### Phase 5: Defensive Breakpoints
|
|
33
|
+
1. Identify minimal controls that break the chain.
|
|
34
|
+
2. Prioritize controls by implementation cost and risk reduction.
|
|
35
|
+
|
|
36
|
+
## Chain Scoring Factors
|
|
37
|
+
- prerequisite complexity
|
|
38
|
+
- execution reliability
|
|
39
|
+
- privilege needed
|
|
40
|
+
- detectability
|
|
41
|
+
- business impact
|
|
42
|
+
|
|
43
|
+
## Output Contract
|
|
44
|
+
```json
|
|
45
|
+
{
|
|
46
|
+
"attack_chains": [],
|
|
47
|
+
"prerequisite_graph": [],
|
|
48
|
+
"aggregate_impact": [],
|
|
49
|
+
"defensive_breakpoints": [],
|
|
50
|
+
"priority_order": []
|
|
51
|
+
}
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Constraints
|
|
55
|
+
- No speculative chain links.
|
|
56
|
+
- No additive severity without chain feasibility.
|
|
57
|
+
|
|
58
|
+
## Quality Checklist
|
|
59
|
+
- [ ] Every link has evidence.
|
|
60
|
+
- [ ] Chain order is technically valid.
|
|
61
|
+
- [ ] Defensive breakpoints are practical.
|
|
62
|
+
|
|
63
|
+
## Conditional Decision Matrix
|
|
64
|
+
| Condition | Action | Evidence Requirement |
|
|
65
|
+
|---|---|---|
|
|
66
|
+
| Finding signal unstable | downgrade confidence and add retest plan | repeated run variance log |
|
|
67
|
+
| Chain link missing prerequisite | split chain and mark dependency blocker | prerequisite graph |
|
|
68
|
+
| Impact appears low in isolation | evaluate chain amplification paths | chain-level impact narrative |
|
|
69
|
+
| Mitigation claim is partial | verify alternate path and state variants | mitigation bypass check |
|
|
70
|
+
| Environment blocker dominates | classify inconclusive with unblock requests | blocker evidence |
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
# Finding Verifier
|
|
2
|
+
|
|
3
|
+
## Purpose
|
|
4
|
+
Ensure reported findings are accurate, reproducible, and correctly classified.
|
|
5
|
+
|
|
6
|
+
## Inputs
|
|
7
|
+
- `finding_report`
|
|
8
|
+
- `evidence_bundle`
|
|
9
|
+
- `environment_notes`
|
|
10
|
+
|
|
11
|
+
## Verification Workflow
|
|
12
|
+
### Phase 1: Evidence Integrity
|
|
13
|
+
1. Verify artifact completeness and timestamps.
|
|
14
|
+
2. Verify request-response pairing and context consistency.
|
|
15
|
+
|
|
16
|
+
### Phase 2: Independent Replay
|
|
17
|
+
1. Reproduce with original method.
|
|
18
|
+
2. Reproduce with alternate method when possible.
|
|
19
|
+
3. Compare behavior consistency.
|
|
20
|
+
|
|
21
|
+
### Phase 3: Confounder Analysis
|
|
22
|
+
1. Caching and stale session effects.
|
|
23
|
+
2. Timing and infrastructure noise.
|
|
24
|
+
3. Seed-data drift and race artifacts.
|
|
25
|
+
|
|
26
|
+
### Phase 4: Final Status
|
|
27
|
+
1. `confirmed` if replayable with clear impact.
|
|
28
|
+
2. `disputed` if strong counter-evidence exists.
|
|
29
|
+
3. `inconclusive` if unresolved blockers remain.
|
|
30
|
+
|
|
31
|
+
## Acceptance Criteria by Class
|
|
32
|
+
| Class | Confirmed Requires |
|
|
33
|
+
|---|---|
|
|
34
|
+
| Injection | parser/engine effect + attacker control |
|
|
35
|
+
| XSS | controlled script execution in target context |
|
|
36
|
+
| Authz | unauthorized action/object access proven |
|
|
37
|
+
| SSRF | outbound request influence or protected target reach |
|
|
38
|
+
|
|
39
|
+
## Output Contract
|
|
40
|
+
```json
|
|
41
|
+
{
|
|
42
|
+
"verification_status": [],
|
|
43
|
+
"replay_results": [],
|
|
44
|
+
"confounder_notes": [],
|
|
45
|
+
"required_follow_up": []
|
|
46
|
+
}
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Constraints
|
|
50
|
+
- Do not confirm from a single unstable run.
|
|
51
|
+
- Do not dispute on intuition alone.
|
|
52
|
+
|
|
53
|
+
## Quality Checklist
|
|
54
|
+
- [ ] Independent replay attempted.
|
|
55
|
+
- [ ] Confounders addressed.
|
|
56
|
+
- [ ] Status rationale is explicit.
|
|
57
|
+
|
|
58
|
+
## Conditional Decision Matrix
|
|
59
|
+
| Condition | Action | Evidence Requirement |
|
|
60
|
+
|---|---|---|
|
|
61
|
+
| Finding signal unstable | downgrade confidence and add retest plan | repeated run variance log |
|
|
62
|
+
| Chain link missing prerequisite | split chain and mark dependency blocker | prerequisite graph |
|
|
63
|
+
| Impact appears low in isolation | evaluate chain amplification paths | chain-level impact narrative |
|
|
64
|
+
| Mitigation claim is partial | verify alternate path and state variants | mitigation bypass check |
|
|
65
|
+
| Environment blocker dominates | classify inconclusive with unblock requests | blocker evidence |
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: frontend-design
|
|
3
|
+
description: "Use this skill when the user asks to build a website, web page, landing page, web UI, HTML/CSS/JS project, or any browser-based interface. Triggers: 'build me a website', 'create a landing page', 'make a web app', 'HTML page', 'frontend', 'web UI', 'responsive layout', 'CSS', styling tasks, accessibility fixes, or any request to create something that runs in a browser. Covers HTML, CSS, JavaScript, responsive design, accessibility, and modern web standards."
|
|
4
|
+
license: MIT
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# Frontend Design
|
|
8
|
+
|
|
9
|
+
## What This Skill Does
|
|
10
|
+
|
|
11
|
+
Build complete, production-quality web interfaces. HTML, CSS, JavaScript. Responsive layouts, accessibility, modern patterns. From single landing pages to multi-page sites.
|
|
12
|
+
|
|
13
|
+
## Before You Start
|
|
14
|
+
|
|
15
|
+
1. **Fetch current docs** via Context7 if using any CSS framework (Tailwind, Bootstrap) or JS library
|
|
16
|
+
2. **Check DeepWiki** if integrating with an unfamiliar frontend framework or build tool
|
|
17
|
+
3. **Ask the user** about target browsers, mobile requirements, and any existing design system
|
|
18
|
+
|
|
19
|
+
## HTML Standards
|
|
20
|
+
|
|
21
|
+
Write semantic HTML5. Every page needs:
|
|
22
|
+
|
|
23
|
+
```html
|
|
24
|
+
<!DOCTYPE html>
|
|
25
|
+
<html lang="en">
|
|
26
|
+
<head>
|
|
27
|
+
<meta charset="UTF-8">
|
|
28
|
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
29
|
+
<meta name="description" content="Page description for SEO">
|
|
30
|
+
<title>Page Title</title>
|
|
31
|
+
</head>
|
|
32
|
+
<body>
|
|
33
|
+
<header>...</header>
|
|
34
|
+
<nav>...</nav>
|
|
35
|
+
<main>...</main>
|
|
36
|
+
<footer>...</footer>
|
|
37
|
+
</body>
|
|
38
|
+
</html>
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
Semantic elements over divs: `<header>`, `<nav>`, `<main>`, `<section>`, `<article>`, `<aside>`, `<footer>`. Use `<div>` only when no semantic element fits.
|
|
42
|
+
|
|
43
|
+
## CSS Patterns
|
|
44
|
+
|
|
45
|
+
### Layout (use modern CSS)
|
|
46
|
+
- **Flexbox** for one-dimensional layouts (navbars, card rows, centering)
|
|
47
|
+
- **CSS Grid** for two-dimensional layouts (page layouts, dashboards, galleries)
|
|
48
|
+
- **Container queries** for component-level responsiveness
|
|
49
|
+
- Never use floats for layout
|
|
50
|
+
|
|
51
|
+
### Responsive Design
|
|
52
|
+
```css
|
|
53
|
+
/* Mobile-first breakpoints */
|
|
54
|
+
/* Base styles = mobile */
|
|
55
|
+
@media (min-width: 768px) { /* Tablet */ }
|
|
56
|
+
@media (min-width: 1024px) { /* Desktop */ }
|
|
57
|
+
@media (min-width: 1440px) { /* Large desktop */ }
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
Use `clamp()` for fluid typography: `font-size: clamp(1rem, 2.5vw, 2rem);`
|
|
61
|
+
|
|
62
|
+
### CSS Variables for Theming
|
|
63
|
+
```css
|
|
64
|
+
:root {
|
|
65
|
+
--color-primary: #2563eb;
|
|
66
|
+
--color-surface: #ffffff;
|
|
67
|
+
--color-text: #1a1a2e;
|
|
68
|
+
--radius: 8px;
|
|
69
|
+
--shadow: 0 2px 8px rgba(0,0,0,0.08);
|
|
70
|
+
--transition: 200ms ease;
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
@media (prefers-color-scheme: dark) {
|
|
74
|
+
:root {
|
|
75
|
+
--color-surface: #0f0f1a;
|
|
76
|
+
--color-text: #e2e2e8;
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## Accessibility Checklist
|
|
82
|
+
|
|
83
|
+
- All images have `alt` text (decorative images get `alt=""`)
|
|
84
|
+
- All interactive elements are keyboard-accessible (Tab, Enter, Escape)
|
|
85
|
+
- Color contrast ratio meets WCAG AA (4.5:1 for normal text, 3:1 for large text)
|
|
86
|
+
- Form inputs have associated `<label>` elements
|
|
87
|
+
- Page has a single `<h1>` and heading hierarchy doesn't skip levels
|
|
88
|
+
- Focus states are visible (never `outline: none` without a replacement)
|
|
89
|
+
- ARIA attributes used correctly: `aria-label`, `aria-expanded`, `aria-hidden`
|
|
90
|
+
|
|
91
|
+
## Performance
|
|
92
|
+
|
|
93
|
+
- Images: use `loading="lazy"` for below-the-fold images (not the hero/LCP image), provide `width` and `height`, use modern formats (WebP, AVIF)
|
|
94
|
+
- Fonts: `font-display: swap`, preload critical fonts, limit to 2 families max
|
|
95
|
+
- CSS: critical styles inline in `<head>`, rest loaded async
|
|
96
|
+
- JS: `defer` attribute on scripts, avoid render-blocking
|
|
97
|
+
|
|
98
|
+
## Rules
|
|
99
|
+
|
|
100
|
+
- Design for the most common use case, accommodate edge cases
|
|
101
|
+
- Every interactive element needs: default, hover, active, focus, disabled states
|
|
102
|
+
- Error messages must tell the user what went wrong AND how to fix it
|
|
103
|
+
- Test with real content, not lorem ipsum
|
|
104
|
+
- Mobile-first: write base styles for mobile, enhance for larger screens
|