helixevo 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dashboard/app/api/skills/route.ts +298 -0
- package/dashboard/app/evolution/page.tsx +128 -0
- package/dashboard/app/frontier/page.tsx +128 -0
- package/dashboard/app/globals.css +1211 -0
- package/dashboard/app/guide/page.tsx +984 -0
- package/dashboard/app/layout.tsx +79 -0
- package/dashboard/app/network/client.tsx +1090 -0
- package/dashboard/app/network/page.tsx +68 -0
- package/dashboard/app/page.tsx +147 -0
- package/dashboard/app/research/page.tsx +124 -0
- package/dashboard/components/SkillFlowNode.tsx +192 -0
- package/dashboard/lib/data.ts +146 -0
- package/dashboard/next-env.d.ts +6 -0
- package/dashboard/package-lock.json +1182 -0
- package/dashboard/package.json +21 -0
- package/dashboard/tsconfig.json +40 -0
- package/dist/cli.js +44 -9
- package/package.json +8 -1
|
@@ -0,0 +1,984 @@
|
|
|
1
|
+
'use client'
|
|
2
|
+
|
|
3
|
+
import { useState } from 'react'
|
|
4
|
+
|
|
5
|
+
// ─── Table of Contents ──────────────────────────────────────────
|
|
6
|
+
/**
 * Sidebar table of contents, one entry per guide section.
 *
 * Each entry carries the section `id` (used as the `#id` anchor target),
 * the `label` shown as the link text, and the `icon` glyph rendered before it.
 * Entries are declared as compact `[id, label, icon]` rows and expanded into objects.
 */
const TOC = (
  [
    ['overview', 'Overview', '◆'],
    ['quickstart', 'Quick Start', '▸'],
    ['commands', 'Commands', '⌘'],
    ['architecture', 'Architecture', '◎'],
    ['watch', 'Always-On Learning', '⚡'],
    ['evolution', 'Evolution Pipeline', '⟳'],
    ['judges', 'Multi-Judge System', '⚖'],
    ['networkhealth', 'Network Health', '♺'],
    ['autogen', 'Auto-Generalization', '↑'],
    ['metrics', 'Closed-Loop Metrics', '📊'],
    ['frontier', 'Pareto Frontier', '▲'],
    ['regression', 'Regression Testing', '✓'],
    ['research', 'Proactive Research', '◎'],
    ['network', 'Skill Network', '⬡'],
    ['config', 'Configuration', '⚙'],
    ['data', 'Data & Storage', '◫'],
    ['manage', 'Skill Management', '✎'],
    ['craft', 'Craft Agent', '⚡'],
    ['faq', 'FAQ', '?'],
  ] as Array<[string, string, string]>
).map(([id, label, icon]) => ({ id, label, icon }))
|
|
27
|
+
|
|
28
|
+
// ─── Code Block ─────────────────────────────────────────────────
|
|
29
|
+
/**
 * Renders a code snippet inside a `guide-code-block` container.
 *
 * @param children - Snippet text; leading and trailing whitespace is trimmed before display.
 * @param title - Optional caption; the title element is omitted when it is missing or empty.
 */
function Code(props: { children: string; title?: string }) {
  const { children: source, title } = props
  const caption = title && <div className="guide-code-title">{title}</div>

  return (
    <div className="guide-code-block">
      {caption}
      <pre><code>{source.trim()}</code></pre>
    </div>
  )
}
|
|
37
|
+
|
|
38
|
+
// ─── Callout ────────────────────────────────────────────────────
|
|
39
|
+
/**
 * Highlighted note made of a leading icon and arbitrary content.
 *
 * @param type - Selects the visual preset (background, border colour, icon glyph, icon colour).
 * @param children - Content rendered next to the icon.
 */
function Callout(props: { type: 'info' | 'tip' | 'warning'; children: React.ReactNode }) {
  // One preset per callout type; all colours are CSS custom property references.
  const presets = {
    info: { bg: 'var(--blue-light)', border: 'var(--blue-border)', icon: 'ℹ', color: 'var(--blue)' },
    tip: { bg: 'var(--green-light)', border: 'var(--green-border)', icon: '✦', color: 'var(--green)' },
    warning: { bg: 'var(--yellow-light)', border: 'var(--yellow-border)', icon: '▲', color: 'var(--yellow)' },
  }
  const { bg, border, icon, color } = presets[props.type]

  return (
    <div className="guide-callout" style={{ background: bg, borderColor: border }}>
      <span className="guide-callout-icon" style={{ color }}>{icon}</span>
      <div>{props.children}</div>
    </div>
  )
}
|
|
53
|
+
|
|
54
|
+
// ─── Step ───────────────────────────────────────────────────────
|
|
55
|
+
/**
 * A numbered step: the number badge on one side, the title and body on the other.
 *
 * @param n - Step number shown in the badge.
 * @param title - Heading of the step.
 * @param children - Body rendered below the heading.
 */
function Step(props: { n: number; title: string; children: React.ReactNode }) {
  const { n: stepNumber, title: heading, children: body } = props

  return (
    <div className="guide-step">
      <div className="guide-step-number">{stepNumber}</div>
      <div className="guide-step-content">
        <div className="guide-step-title">{heading}</div>
        {body}
      </div>
    </div>
  )
}
|
|
66
|
+
|
|
67
|
+
// ─── Feature Card ───────────────────────────────────────────────
|
|
68
|
+
/**
 * Small card with an icon, a title and a one-line description.
 *
 * @param icon - Glyph rendered in the icon slot.
 * @param title - Card heading.
 * @param desc - Supporting description text.
 */
function FeatureCard(props: { icon: string; title: string; desc: string }) {
  const { icon: glyph, title: heading, desc: blurb } = props

  return (
    <div className="guide-feature-card">
      <div className="guide-feature-icon">{glyph}</div>
      <div className="guide-feature-title">{heading}</div>
      <div className="guide-feature-desc">{blurb}</div>
    </div>
  )
}
|
|
77
|
+
|
|
78
|
+
// ─── Param Row ──────────────────────────────────────────────────
|
|
79
|
+
/**
 * One row of a parameter reference: name and type on the first line,
 * description (plus the default value, when given) on the second.
 *
 * @param name - Parameter name, rendered as inline code.
 * @param type - Type label shown next to the name.
 * @param desc - Description text.
 * @param def - Optional default value; the "Default:" suffix is omitted when it is missing or empty.
 */
function Param(props: { name: string; type: string; desc: string; def?: string }) {
  const { name, type, desc, def } = props
  const headerLayout = { display: 'flex', alignItems: 'center', gap: 8 }
  const defaultNote = def && (
    <span style={{ color: 'var(--text-muted)' }}> Default: <code>{def}</code></span>
  )

  return (
    <div className="guide-param-row">
      <div style={headerLayout}>
        <code className="guide-param-name">{name}</code>
        <span className="guide-param-type">{type}</span>
      </div>
      <div className="guide-param-desc">{desc}{defaultNote}</div>
    </div>
  )
}
|
|
90
|
+
|
|
91
|
+
// ─── Pipeline Step ──────────────────────────────────────────────
|
|
92
|
+
/**
 * One row of a pipeline list: a tinted icon badge followed by a title and a description.
 *
 * @param icon - Text shown inside the badge.
 * @param title - Step heading.
 * @param desc - Step description.
 * @param color - Any CSS colour value (hex, named, or a `var(--token)` reference). Used as the
 *   badge text colour and, mixed towards transparent, as the badge background.
 */
function PipelineStep({ icon, title, desc, color }: { icon: string; title: string; desc: string; color: string }) {
  // The badge tint used to be built by appending a hex alpha suffix (`${color}15`). That only
  // produces a colour for 6-digit hex input; the call sites in this file pass `var(--…)`
  // references, for which the concatenation is not a valid colour and the tint is lost.
  // color-mix() accepts any colour value; 8% approximates the intended 0x15/0xff alpha.
  const badgeBackground = `color-mix(in srgb, ${color} 8%, transparent)`

  return (
    <div className="guide-pipeline-step">
      <div className="guide-pipeline-icon" style={{ background: badgeBackground, color }}>{icon}</div>
      <div>
        <div style={{ fontSize: 13, fontWeight: 600 }}>{title}</div>
        <div style={{ fontSize: 12, color: 'var(--text-dim)', lineHeight: 1.5, marginTop: 2 }}>{desc}</div>
      </div>
    </div>
  )
}
|
|
103
|
+
|
|
104
|
+
// ─── Section ────────────────────────────────────────────────────
|
|
105
|
+
/**
 * Wrapper for one guide section.
 *
 * @param id - Element id of the `<section>`, so it can be targeted by `#id` links.
 * @param title - Section heading, rendered as an `<h2>`.
 * @param subtitle - Optional lead paragraph; omitted when missing or empty.
 * @param children - Section body.
 */
function Section(props: { id: string; title: string; subtitle?: string; children: React.ReactNode }) {
  const { id, title, subtitle, children } = props
  const lead = subtitle && <p className="guide-section-subtitle">{subtitle}</p>

  return (
    <section id={id} className="guide-section">
      <h2 className="guide-section-title">{title}</h2>
      {lead}
      {children}
    </section>
  )
}
|
|
114
|
+
|
|
115
|
+
// ─── Architecture Diagram ───────────────────────────────────────
|
|
116
|
+
/**
 * Static diagram of the evolution pipeline: a first row of three stage boxes joined by "→",
 * a "↓" connector aligned to the end of the row, and a second row of three stage boxes joined by "←".
 *
 * NOTE(review): the second row sets `direction: 'rtl'` and lists Deploy first in the DOM, so Deploy
 * presumably ends up at the right-hand side, directly under the "↓" — confirm against the
 * `guide-diagram-row` CSS that the stages read Evaluate → Validate → Deploy as intended.
 */
function ArchitectureDiagram() {
  // Restores left-to-right text inside children of the right-to-left second row.
  const ltr = { direction: 'ltr' } as const

  // Renders one stage box; `variant` picks the `guide-diagram-<variant>` modifier class.
  const stage = (variant: string, label: string, title: string, desc: string, style?: typeof ltr) => (
    <div className={`guide-diagram-box guide-diagram-${variant}`} style={style}>
      <div className="guide-diagram-box-label">{label}</div>
      <div className="guide-diagram-box-title">{title}</div>
      <div className="guide-diagram-box-desc">{desc}</div>
    </div>
  )

  return (
    <div className="guide-diagram">
      <div className="guide-diagram-row">
        {stage('input', 'Input', 'Failure Capture', 'Sessions, corrections, manual edits')}
        <div className="guide-diagram-arrow">→</div>
        {stage('process', 'Cluster', 'Pattern Detection', 'Group failures by root cause')}
        <div className="guide-diagram-arrow">→</div>
        {stage('process', 'Propose', 'Skill Mutation', 'Create, edit, merge, split')}
      </div>
      <div style={{ display: 'flex', justifyContent: 'flex-end', padding: '0 20px' }}>
        <div className="guide-diagram-arrow-down">↓</div>
      </div>
      <div className="guide-diagram-row" style={{ direction: 'rtl' }}>
        {stage('output', 'Deploy', 'Canary + Frontier', '3-day monitoring, auto-rollback', ltr)}
        <div className="guide-diagram-arrow" style={ltr}>←</div>
        {stage('check', 'Validate', 'Regression Tests', 'Golden cases + cross-skill', ltr)}
        <div className="guide-diagram-arrow" style={ltr}>←</div>
        {stage('judge', 'Evaluate', '3 LLM Judges', 'Task, Alignment, Side-effects', ltr)}
      </div>
    </div>
  )
}
|
|
163
|
+
|
|
164
|
+
// ─── Hierarchy Diagram ──────────────────────────────────────────
|
|
165
|
+
/**
 * Static diagram of the three skill layers (System, Domain, Project), listed top to bottom
 * with a "Generalize ↑ / Specialize ↓" connector between consecutive layers.
 */
function HierarchyDiagram() {
  // Renders one layer; `tier` picks the `guide-hierarchy-<tier>` modifier class.
  const layer = (tier: string, label: string, desc: string, examples: string) => (
    <div className={`guide-hierarchy-level guide-hierarchy-${tier}`}>
      <div className="guide-hierarchy-label">{label}</div>
      <div className="guide-hierarchy-desc">{desc}</div>
      <div className="guide-hierarchy-examples">{examples}</div>
    </div>
  )

  // Renders the connector placed between two adjacent layers.
  const connector = () => (
    <div className="guide-hierarchy-connector">
      <span>Generalize ↑</span>
      <span>Specialize ↓</span>
    </div>
  )

  return (
    <div className="guide-hierarchy">
      {layer(
        'system',
        'System Layer',
        'Global agent behaviors — applies everywhere',
        'error-recovery, output-formatting, safety-checks',
      )}
      {connector()}
      {layer(
        'domain',
        'Domain Layer',
        'Cross-project patterns — shared techniques',
        'react-best-practices, api-design, testing-patterns',
      )}
      {connector()}
      {layer(
        'project',
        'Project Layer',
        'Project-specific skills — local conventions',
        'myapp-auth-flow, myapp-db-schema, myapp-deploy',
      )}
    </div>
  )
}
|
|
194
|
+
|
|
195
|
+
// ─── Main Guide Page ────────────────────────────────────────────
|
|
196
|
+
export default function GuidePage() {
|
|
197
|
+
const [activeSection, setActiveSection] = useState('overview')
|
|
198
|
+
|
|
199
|
+
return (
|
|
200
|
+
<div className="guide-layout">
|
|
201
|
+
{/* Sidebar TOC */}
|
|
202
|
+
<nav className="guide-toc">
|
|
203
|
+
<div className="guide-toc-header">
|
|
204
|
+
<div className="guide-toc-title">Documentation</div>
|
|
205
|
+
<div className="guide-toc-version">v0.2.0</div>
|
|
206
|
+
</div>
|
|
207
|
+
{TOC.map(item => (
|
|
208
|
+
<a
|
|
209
|
+
key={item.id}
|
|
210
|
+
href={`#${item.id}`}
|
|
211
|
+
className={`guide-toc-item ${activeSection === item.id ? 'active' : ''}`}
|
|
212
|
+
onClick={() => setActiveSection(item.id)}
|
|
213
|
+
>
|
|
214
|
+
<span className="guide-toc-icon">{item.icon}</span>
|
|
215
|
+
{item.label}
|
|
216
|
+
</a>
|
|
217
|
+
))}
|
|
218
|
+
</nav>
|
|
219
|
+
|
|
220
|
+
{/* Content */}
|
|
221
|
+
<div className="guide-content">
|
|
222
|
+
{/* Hero */}
|
|
223
|
+
<div className="guide-hero">
|
|
224
|
+
<div className="guide-hero-badge">Documentation</div>
|
|
225
|
+
<h1 className="guide-hero-title">Helix Guide</h1>
|
|
226
|
+
<p className="guide-hero-desc">
|
|
227
|
+
A comprehensive guide to the self-evolving skill ecosystem for AI agents.
|
|
228
|
+
Capture failures, evolve skills through multi-judge evaluation, and maintain
|
|
229
|
+
a Pareto frontier of optimal configurations.
|
|
230
|
+
</p>
|
|
231
|
+
<div className="guide-hero-features">
|
|
232
|
+
<FeatureCard icon="⚡" title="Always-On" desc="Auto-captures corrections and triggers evolution in real-time" />
|
|
233
|
+
<FeatureCard icon="♺" title="Network Evolution" desc="Skills and projects co-evolve as an interconnected organism" />
|
|
234
|
+
<FeatureCard icon="📊" title="Closed-Loop Proof" desc="Tracks whether evolution actually reduces correction rates" />
|
|
235
|
+
<FeatureCard icon="↑" title="Auto-Generalize" desc="Cross-project patterns automatically become abstract skills" />
|
|
236
|
+
</div>
|
|
237
|
+
</div>
|
|
238
|
+
|
|
239
|
+
{/* ─── Overview ─── */}
|
|
240
|
+
<Section id="overview" title="Overview" subtitle="What Helix does and why it exists.">
|
|
241
|
+
<p className="guide-text">
|
|
242
|
+
Helix is a self-improving system that manages SKILL.md files for AI agents. When an agent makes a mistake
|
|
243
|
+
and gets corrected, Helix captures that failure, clusters similar failures together, and proposes skill
|
|
244
|
+
improvements. Every proposed change goes through rigorous multi-judge evaluation and regression testing before
|
|
245
|
+
being deployed with a 3-day canary period.
|
|
246
|
+
</p>
|
|
247
|
+
<p className="guide-text">
|
|
248
|
+
Built on ideas from <strong>EvoSkill</strong> and <strong>AutoResearch</strong>, Helix implements a
|
|
249
|
+
three-directional evolution model:
|
|
250
|
+
</p>
|
|
251
|
+
<div className="guide-directions">
|
|
252
|
+
<div className="guide-direction">
|
|
253
|
+
<div className="guide-direction-arrow" style={{ color: 'var(--purple)' }}>↑</div>
|
|
254
|
+
<div>
|
|
255
|
+
<strong>Generalize</strong>
|
|
256
|
+
<span>Detect cross-project patterns and promote them to abstract, reusable skills</span>
|
|
257
|
+
</div>
|
|
258
|
+
</div>
|
|
259
|
+
<div className="guide-direction">
|
|
260
|
+
<div className="guide-direction-arrow" style={{ color: 'var(--green)' }}>↓</div>
|
|
261
|
+
<div>
|
|
262
|
+
<strong>Specialize</strong>
|
|
263
|
+
<span>Create project-specific skills from domain skills combined with local failures</span>
|
|
264
|
+
</div>
|
|
265
|
+
</div>
|
|
266
|
+
<div className="guide-direction">
|
|
267
|
+
<div className="guide-direction-arrow" style={{ color: 'var(--blue)' }}>↔</div>
|
|
268
|
+
<div>
|
|
269
|
+
<strong>Lateral</strong>
|
|
270
|
+
<span>Merge, split, and resolve conflicts between skills at the same level</span>
|
|
271
|
+
</div>
|
|
272
|
+
</div>
|
|
273
|
+
</div>
|
|
274
|
+
</Section>
|
|
275
|
+
|
|
276
|
+
{/* ─── Quick Start ─── */}
|
|
277
|
+
<Section id="quickstart" title="Quick Start" subtitle="Get up and running in under 5 minutes.">
|
|
278
|
+
<Callout type="info">
|
|
279
|
+
<strong>Prerequisites:</strong> Node.js 18+, <a href="https://bun.sh">Bun</a> (for building),
|
|
280
|
+
and <a href="https://docs.anthropic.com/en/docs/claude-code">Claude CLI</a> with a Claude Max plan.
|
|
281
|
+
</Callout>
|
|
282
|
+
|
|
283
|
+
<Step n={1} title="Install Helix">
|
|
284
|
+
<Code title="Terminal">{`# From npm (recommended)
|
|
285
|
+
npm install -g helix
|
|
286
|
+
|
|
287
|
+
# Or from source
|
|
288
|
+
git clone https://github.com/danielchen26/helixevo.git
|
|
289
|
+
cd helixevo && npm install && npm run build && npm link`}</Code>
|
|
290
|
+
</Step>
|
|
291
|
+
|
|
292
|
+
<Step n={2} title="Initialize your skill ecosystem">
|
|
293
|
+
<Code title="Terminal">{`helixevo init`}</Code>
|
|
294
|
+
<p className="guide-text-sm">
|
|
295
|
+
This scans your existing SKILL.md files (from <code>~/.agents/skills/</code>), imports them into Helix,
|
|
296
|
+
and generates golden test cases for each skill. It also creates the data directory at <code>~/.helix/</code>.
|
|
297
|
+
</p>
|
|
298
|
+
</Step>
|
|
299
|
+
|
|
300
|
+
<Step n={3} title="Capture failures from a session">
|
|
301
|
+
<Code title="Terminal">{`helixevo capture path/to/session.json --project myapp`}</Code>
|
|
302
|
+
<p className="guide-text-sm">
|
|
303
|
+
Extracts corrections, mode switches, retries, and manual edits from Claude session files and records them
|
|
304
|
+
as structured failure records.
|
|
305
|
+
</p>
|
|
306
|
+
</Step>
|
|
307
|
+
|
|
308
|
+
<Step n={4} title="Evolve skills">
|
|
309
|
+
<Code title="Terminal">{`helixevo evolve --verbose`}</Code>
|
|
310
|
+
<p className="guide-text-sm">
|
|
311
|
+
Clusters failures, generates proposals, runs 3-judge evaluation and regression tests, then deploys
|
|
312
|
+
improvements as canaries. Use <code>--dry-run</code> to preview without applying.
|
|
313
|
+
</p>
|
|
314
|
+
</Step>
|
|
315
|
+
|
|
316
|
+
<Step n={5} title="Explore the results">
|
|
317
|
+
<Code title="Terminal">{`# View the skill network
|
|
318
|
+
helixevo graph
|
|
319
|
+
|
|
320
|
+
# Open this dashboard
|
|
321
|
+
helixevo dashboard
|
|
322
|
+
|
|
323
|
+
# Check system health
|
|
324
|
+
helixevo status`}</Code>
|
|
325
|
+
</Step>
|
|
326
|
+
</Section>
|
|
327
|
+
|
|
328
|
+
{/* ─── Commands ─── */}
|
|
329
|
+
<Section id="commands" title="Commands" subtitle="Complete CLI reference for every Helix command.">
|
|
330
|
+
<div className="guide-command-grid">
|
|
331
|
+
{[
|
|
332
|
+
{
|
|
333
|
+
cmd: 'helixevo watch',
|
|
334
|
+
desc: 'Always-on learning mode. Watches for corrections in real-time, auto-captures failures, and triggers evolution when thresholds are met.',
|
|
335
|
+
flags: ['--project <name>', '--events <path>', '--verbose', '--no-evolve'],
|
|
336
|
+
},
|
|
337
|
+
{
|
|
338
|
+
cmd: 'helixevo metrics',
|
|
339
|
+
desc: 'Show correction rates per skill, trends over time, and whether each evolution actually reduced corrections. The proof that Helix works.',
|
|
340
|
+
flags: ['--verbose'],
|
|
341
|
+
},
|
|
342
|
+
{
|
|
343
|
+
cmd: 'helixevo health',
|
|
344
|
+
desc: 'Assess network health: cohesion, coverage, balance, cross-project transfer rate. Detects gaps, orphans, and generalization candidates.',
|
|
345
|
+
flags: ['--verbose'],
|
|
346
|
+
},
|
|
347
|
+
{
|
|
348
|
+
cmd: 'helixevo init',
|
|
349
|
+
desc: 'Import existing skills and generate golden test cases. Scans ~/.agents/skills/ and creates the Helix data directory.',
|
|
350
|
+
flags: ['--verbose'],
|
|
351
|
+
},
|
|
352
|
+
{
|
|
353
|
+
cmd: 'helixevo capture <session>',
|
|
354
|
+
desc: 'Extract failure records from a Claude session JSON file. Detects verbal corrections, manual edits, mode switches, and retries.',
|
|
355
|
+
flags: ['--project <name>', '--verbose'],
|
|
356
|
+
},
|
|
357
|
+
{
|
|
358
|
+
cmd: 'helixevo evolve',
|
|
359
|
+
desc: 'Run the full evolution cycle: cluster failures, generate proposals, evaluate with 3 judges, regression test, and deploy canaries.',
|
|
360
|
+
flags: ['--dry-run', '--verbose', '--max-proposals <n>'],
|
|
361
|
+
},
|
|
362
|
+
{
|
|
363
|
+
cmd: 'helixevo generalize',
|
|
364
|
+
desc: 'Promote cross-project patterns upward. Detects skills that solve similar problems across projects and creates abstract parent skills.',
|
|
365
|
+
flags: ['--dry-run', '--verbose'],
|
|
366
|
+
},
|
|
367
|
+
{
|
|
368
|
+
cmd: 'helixevo specialize --project <name>',
|
|
369
|
+
desc: 'Create project-specific skills by combining domain skills with project-level failure patterns.',
|
|
370
|
+
flags: ['--dry-run', '--verbose'],
|
|
371
|
+
},
|
|
372
|
+
{
|
|
373
|
+
cmd: 'helixevo research',
|
|
374
|
+
desc: 'Proactive web research to discover new skill opportunities. Searches for best practices, analyzes gaps, and generates experimental skills.',
|
|
375
|
+
flags: ['--project <path>', '--max-hypotheses <n>', '--dry-run', '--verbose'],
|
|
376
|
+
},
|
|
377
|
+
{
|
|
378
|
+
cmd: 'helixevo graph',
|
|
379
|
+
desc: 'Visualize the skill network. Shows nodes (skills), edges (relationships), and clusters.',
|
|
380
|
+
flags: ['--mermaid', '--obsidian <vault>', '--rebuild', '--optimize'],
|
|
381
|
+
},
|
|
382
|
+
{
|
|
383
|
+
cmd: 'helixevo dashboard',
|
|
384
|
+
desc: 'Open the interactive web dashboard at localhost:3847. Shows overview, skill network, evolution timeline, research, and frontier.',
|
|
385
|
+
flags: [],
|
|
386
|
+
},
|
|
387
|
+
{
|
|
388
|
+
cmd: 'helixevo status',
|
|
389
|
+
desc: 'Show system health: skill count, failure stats, frontier status, canary deployments, and recent evolution.',
|
|
390
|
+
flags: [],
|
|
391
|
+
},
|
|
392
|
+
{
|
|
393
|
+
cmd: 'helixevo report',
|
|
394
|
+
desc: 'Generate a detailed evolution report. Can output to file, Obsidian vault, or Craft Agent session.',
|
|
395
|
+
flags: ['--verbose'],
|
|
396
|
+
},
|
|
397
|
+
].map(c => (
|
|
398
|
+
<div key={c.cmd} className="guide-command-card">
|
|
399
|
+
<code className="guide-command-name">{c.cmd}</code>
|
|
400
|
+
<p className="guide-command-desc">{c.desc}</p>
|
|
401
|
+
{c.flags.length > 0 && (
|
|
402
|
+
<div className="guide-command-flags">
|
|
403
|
+
{c.flags.map(f => <code key={f} className="guide-command-flag">{f}</code>)}
|
|
404
|
+
</div>
|
|
405
|
+
)}
|
|
406
|
+
</div>
|
|
407
|
+
))}
|
|
408
|
+
</div>
|
|
409
|
+
|
|
410
|
+
<Callout type="tip">
|
|
411
|
+
Most commands support <code>--dry-run</code> to preview changes without applying them,
|
|
412
|
+
and <code>--verbose</code> to see detailed LLM interactions and scoring.
|
|
413
|
+
</Callout>
|
|
414
|
+
</Section>
|
|
415
|
+
|
|
416
|
+
{/* ─── Architecture ─── */}
|
|
417
|
+
<Section id="architecture" title="Architecture" subtitle="How the system is structured and how data flows through it.">
|
|
418
|
+
<h3 className="guide-h3">Pipeline</h3>
|
|
419
|
+
<p className="guide-text">
|
|
420
|
+
Every evolution cycle follows this pipeline. Each stage acts as a quality gate — proposals must
|
|
421
|
+
pass all stages to be deployed.
|
|
422
|
+
</p>
|
|
423
|
+
<ArchitectureDiagram />
|
|
424
|
+
|
|
425
|
+
<h3 className="guide-h3">Three-Layer Hierarchy</h3>
|
|
426
|
+
<p className="guide-text">
|
|
427
|
+
Skills are organized into three layers. The <code>generalize</code> command promotes patterns upward,
|
|
428
|
+
while <code>specialize</code> creates project-specific variants downward.
|
|
429
|
+
</p>
|
|
430
|
+
<HierarchyDiagram />
|
|
431
|
+
</Section>
|
|
432
|
+
|
|
433
|
+
{/* ─── Always-On Learning ─── */}
|
|
434
|
+
<Section id="watch" title="Always-On Learning" subtitle="Helix watches your work, captures corrections automatically, and evolves without manual intervention.">
|
|
435
|
+
<p className="guide-text">
|
|
436
|
+
Instead of manually running <code>helixevo capture</code> after each session, <code>helixevo watch</code> runs
|
|
437
|
+
continuously in the background. It monitors your conversation events in real-time, detects when you
|
|
438
|
+
correct the agent, and feeds those corrections into the evolution pipeline automatically.
|
|
439
|
+
</p>
|
|
440
|
+
<Code title="Terminal">{`# Start always-on mode
|
|
441
|
+
helixevo watch --project myapp
|
|
442
|
+
|
|
443
|
+
# Watch without auto-evolution (capture only)
|
|
444
|
+
helixevo watch --no-evolve
|
|
445
|
+
|
|
446
|
+
# Custom events file
|
|
447
|
+
helixevo watch --events path/to/events.jsonl --verbose`}</Code>
|
|
448
|
+
|
|
449
|
+
<h3 className="guide-h3">How It Works</h3>
|
|
450
|
+
<div className="guide-pipeline">
|
|
451
|
+
<PipelineStep icon="1" title="Event Monitoring" color="var(--blue)" desc="Polls events.jsonl every 3 seconds for new conversation messages." />
|
|
452
|
+
<div className="guide-pipeline-connector" />
|
|
453
|
+
<PipelineStep icon="2" title="Correction Detection" color="var(--purple)" desc="Fast regex pre-filter (English + Chinese) checks for correction signals like 'wrong', 'not like that', 'instead use...', '不对', '改成'." />
|
|
454
|
+
<div className="guide-pipeline-connector" />
|
|
455
|
+
<PipelineStep icon="3" title="LLM Extraction" color="var(--yellow)" desc="When a signal is detected, an LLM analyzes the recent conversation window to extract structured failure records with confidence scores." />
|
|
456
|
+
<div className="guide-pipeline-connector" />
|
|
457
|
+
<PipelineStep icon="4" title="Auto-Evolve Check" color="var(--green)" desc="Checks if evolution should trigger: burst mode (3+ failures/hour), cross-project pattern, or standard threshold (5+ failures)." />
|
|
458
|
+
<div className="guide-pipeline-connector" />
|
|
459
|
+
<PipelineStep icon="5" title="Metrics Update" color="var(--blue)" desc="Updates correction rates per skill, trend analysis, and evolution impact measurements." />
|
|
460
|
+
</div>
|
|
461
|
+
|
|
462
|
+
<h3 className="guide-h3">Auto-Evolve Triggers</h3>
|
|
463
|
+
<div className="guide-edge-grid">
|
|
464
|
+
{[
|
|
465
|
+
{ type: 'burst', color: 'var(--red)', desc: '3+ corrections in the last hour — suggests active work with recurring problems. Triggers immediately.' },
|
|
466
|
+
{ type: 'cross-project', color: 'var(--purple)', desc: 'Same failure pattern in 2+ projects — signals a missing domain-level skill. Prioritizes network-level evolution.' },
|
|
467
|
+
{ type: 'threshold', color: 'var(--blue)', desc: '5+ unresolved failures accumulated. Standard trigger with 2-hour cooldown between cycles.' },
|
|
468
|
+
].map(e => (
|
|
469
|
+
<div key={e.type} className="guide-edge-card">
|
|
470
|
+
<div className="guide-edge-type" style={{ color: e.color }}>{e.type}</div>
|
|
471
|
+
<div className="guide-edge-desc">{e.desc}</div>
|
|
472
|
+
</div>
|
|
473
|
+
))}
|
|
474
|
+
</div>
|
|
475
|
+
</Section>
|
|
476
|
+
|
|
477
|
+
{/* ─── Network Health ─── */}
|
|
478
|
+
<Section id="networkhealth" title="Network Health" subtitle="The skill network is a co-evolving organism — its health determines project success.">
|
|
479
|
+
<p className="guide-text">
|
|
480
|
+
Individual skill evolution is only part of the picture. Helix now treats the <strong>entire skill network</strong> as
|
|
481
|
+
a first-class entity that co-evolves with your projects. Network health is assessed across 4 dimensions after
|
|
482
|
+
every evolution cycle.
|
|
483
|
+
</p>
|
|
484
|
+
<Code title="Terminal">{`helixevo health --verbose`}</Code>
|
|
485
|
+
|
|
486
|
+
<h3 className="guide-h3">Health Dimensions</h3>
|
|
487
|
+
<div className="grid-2" style={{ gap: 12, marginBottom: 20 }}>
|
|
488
|
+
<div className="guide-dimension-card" style={{ borderLeftColor: 'var(--green)' }}>
|
|
489
|
+
<div style={{ fontSize: 11, fontWeight: 700, color: 'var(--green)', textTransform: 'uppercase', letterSpacing: 1 }}>Cohesion</div>
|
|
490
|
+
<div style={{ fontSize: 12, color: 'var(--text-dim)', marginTop: 4 }}>Are skills connected? Orphan skills (no edges) reduce network value.</div>
|
|
491
|
+
</div>
|
|
492
|
+
<div className="guide-dimension-card" style={{ borderLeftColor: 'var(--blue)' }}>
|
|
493
|
+
<div style={{ fontSize: 11, fontWeight: 700, color: 'var(--blue)', textTransform: 'uppercase', letterSpacing: 1 }}>Coverage</div>
|
|
494
|
+
<div style={{ fontSize: 12, color: 'var(--text-dim)', marginTop: 4 }}>What fraction of failures map to skills? Gaps indicate missing skills.</div>
|
|
495
|
+
</div>
|
|
496
|
+
<div className="guide-dimension-card" style={{ borderLeftColor: 'var(--purple)' }}>
|
|
497
|
+
<div style={{ fontSize: 11, fontWeight: 700, color: 'var(--purple)', textTransform: 'uppercase', letterSpacing: 1 }}>Balance</div>
|
|
498
|
+
<div style={{ fontSize: 12, color: 'var(--text-dim)', marginTop: 4 }}>Is the system/domain/project hierarchy well-structured? Domain should be the backbone.</div>
|
|
499
|
+
</div>
|
|
500
|
+
<div className="guide-dimension-card" style={{ borderLeftColor: 'var(--yellow)' }}>
|
|
501
|
+
<div style={{ fontSize: 11, fontWeight: 700, color: 'var(--yellow)', textTransform: 'uppercase', letterSpacing: 1 }}>Transfer Rate</div>
|
|
502
|
+
<div style={{ fontSize: 12, color: 'var(--text-dim)', marginTop: 4 }}>Do domain skills help across projects? Low transfer = too project-specific.</div>
|
|
503
|
+
</div>
|
|
504
|
+
</div>
|
|
505
|
+
<Callout type="info">
|
|
506
|
+
Network health is graded A-D (weighted average of all 4 dimensions) and shown in every evolution summary.
|
|
507
|
+
The <code>helixevo health</code> command also detects coverage gaps, orphan skills, and auto-generalization candidates.
|
|
508
|
+
</Callout>
|
|
509
|
+
</Section>
|
|
510
|
+
|
|
511
|
+
{/* ─── Auto-Generalization ─── */}
|
|
512
|
+
<Section id="autogen" title="Auto-Generalization" subtitle="When patterns recur across projects, Helix automatically creates abstract domain-level skills.">
|
|
513
|
+
<p className="guide-text">
|
|
514
|
+
Instead of waiting for you to run <code>helixevo generalize</code>, Helix now detects cross-project patterns
|
|
515
|
+
automatically during every evolution cycle. When the same type of correction appears in 2+ projects, it
|
|
516
|
+
creates an abstract domain-level skill and sets up parent/child inheritance.
|
|
517
|
+
</p>
|
|
518
|
+
<Code title="How auto-generalization works">{`Project A: "Use FlashList not FlatList" (React Native perf)
|
|
519
|
+
Project B: "Use FlashList not FlatList" (React Native perf)
|
|
520
|
+
→ Cross-project pattern detected
|
|
521
|
+
→ Abstract skill created: "react-native-performance" (domain layer)
|
|
522
|
+
→ Project A skill inherits from it
|
|
523
|
+
→ Project B skill inherits from it
|
|
524
|
+
→ Domain skill tested against all golden cases
|
|
525
|
+
→ Deployed if regression passes`}</Code>
|
|
526
|
+
<Callout type="tip">
|
|
527
|
+
Auto-generalization is the key to the <strong>double helix</strong> metaphor: as projects evolve, skills
|
|
528
|
+
generalize upward; as skills generalize, they improve all projects simultaneously.
|
|
529
|
+
</Callout>
|
|
530
|
+
</Section>
|
|
531
|
+
|
|
532
|
+
{/* ─── Closed-Loop Metrics ─── */}
|
|
533
|
+
<Section id="metrics" title="Closed-Loop Metrics" subtitle="Proving that Helix actually makes the agent better — with data, not just LLM scores.">
|
|
534
|
+
<p className="guide-text">
|
|
535
|
+
The <code>helixevo metrics</code> command answers the most important question: <strong>“Is Helix actually
|
|
536
|
+
reducing corrections?”</strong> It tracks correction rates per skill over time and measures the real
|
|
537
|
+
impact of each evolution.
|
|
538
|
+
</p>
|
|
539
|
+
<Code title="Terminal">{`helixevo metrics --verbose`}</Code>
|
|
540
|
+
|
|
541
|
+
<h3 className="guide-h3">What It Tracks</h3>
|
|
542
|
+
<ul className="guide-list">
|
|
543
|
+
<li><strong>Per-skill correction rates:</strong> 7-day rolling windows showing how often each skill leads to corrections</li>
|
|
544
|
+
<li><strong>Trend detection:</strong> Each skill is marked as improving (↓), stable (→), or degrading (↑)</li>
|
|
545
|
+
<li><strong>Evolution impact:</strong> Before/after comparison for each evolution — failures/day in the 7 days before vs. after</li>
|
|
546
|
+
<li><strong>Verdict:</strong> “X/Y evolutions reduced corrections” — the bottom line</li>
|
|
547
|
+
</ul>
|
|
548
|
+
|
|
549
|
+
<Callout type="warning">
|
|
550
|
+
Metrics need time to accumulate. The system needs at least 7 days of data after an evolution to produce
|
|
551
|
+
a reliable before/after comparison. Results shown as “Measuring” during the first 3 days.
|
|
552
|
+
</Callout>
|
|
553
|
+
</Section>
|
|
554
|
+
|
|
555
|
+
{/* ─── Evolution Pipeline ─── */}
|
|
556
|
+
<Section id="evolution" title="Evolution Pipeline" subtitle="The step-by-step process that turns failures into improved skills.">
|
|
557
|
+
<div className="guide-pipeline">
|
|
558
|
+
<PipelineStep
|
|
559
|
+
icon="1" title="Failure Clustering" color="var(--blue)"
|
|
560
|
+
desc="Unresolved failures are clustered by root cause using an LLM. Each cluster identifies a pattern (e.g., 'missing error handling in API calls') and the relevant skills."
|
|
561
|
+
/>
|
|
562
|
+
<div className="guide-pipeline-connector" />
|
|
563
|
+
<PipelineStep
|
|
564
|
+
icon="2" title="Proposal Generation" color="var(--purple)"
|
|
565
|
+
desc="For each cluster, the system generates a proposed skill mutation: create a new skill, edit an existing one, or merge/split skills. The proposer considers related proposals from history to avoid repeating failed approaches."
|
|
566
|
+
/>
|
|
567
|
+
<div className="guide-pipeline-connector" />
|
|
568
|
+
<PipelineStep
|
|
569
|
+
icon="3" title="Replay Testing" color="var(--yellow)"
|
|
570
|
+
desc="The proposed skill is tested by replaying the original failure scenario. The agent responds using the new skill, and this response is sent to the judges."
|
|
571
|
+
/>
|
|
572
|
+
<div className="guide-pipeline-connector" />
|
|
573
|
+
<PipelineStep
|
|
574
|
+
icon="4" title="Multi-Judge Evaluation" color="var(--green)"
|
|
575
|
+
desc="Three independent LLM judges score the replay (1-10): Task Completion, Correction Alignment, and Side-Effect Check. A proposal needs consensus (2/3 judges passing) to proceed."
|
|
576
|
+
/>
|
|
577
|
+
<div className="guide-pipeline-connector" />
|
|
578
|
+
<PipelineStep
|
|
579
|
+
icon="5" title="Regression Testing" color="var(--red)"
|
|
580
|
+
desc="The modified skill is tested against all golden cases for that skill AND co-evolved partner skills. Must maintain ≥95% pass rate."
|
|
581
|
+
/>
|
|
582
|
+
<div className="guide-pipeline-connector" />
|
|
583
|
+
<PipelineStep
|
|
584
|
+
icon="6" title="Canary Deployment" color="var(--blue)"
|
|
585
|
+
desc="Accepted proposals are deployed as canaries with a 3-day monitoring period. If failure rate increases above the auto-rollback threshold, the skill is automatically reverted."
|
|
586
|
+
/>
|
|
587
|
+
</div>
|
|
588
|
+
</Section>
|
|
589
|
+
|
|
590
|
+
{/* ─── Multi-Judge System ─── */}
|
|
591
|
+
<Section id="judges" title="Multi-Judge System" subtitle="Three independent judges ensure quality from different angles.">
|
|
592
|
+
<div className="guide-judges-grid">
|
|
593
|
+
<div className="guide-judge-card" style={{ borderTopColor: 'var(--green)' }}>
|
|
594
|
+
<div className="guide-judge-header">
|
|
595
|
+
<div className="guide-judge-icon" style={{ background: 'var(--green-light)', color: 'var(--green)' }}>T</div>
|
|
596
|
+
<div>
|
|
597
|
+
<div className="guide-judge-name">Task Completion</div>
|
|
598
|
+
<div className="guide-judge-role">Did the agent accomplish the user's goal?</div>
|
|
599
|
+
</div>
|
|
600
|
+
</div>
|
|
601
|
+
<p className="guide-text-sm">
|
|
602
|
+
Evaluates whether the replayed response with the new skill would actually complete the user's
|
|
603
|
+
original request. Scores 1-10 based on completeness, correctness, and usefulness.
|
|
604
|
+
</p>
|
|
605
|
+
</div>
|
|
606
|
+
<div className="guide-judge-card" style={{ borderTopColor: 'var(--blue)' }}>
|
|
607
|
+
<div className="guide-judge-header">
|
|
608
|
+
<div className="guide-judge-icon" style={{ background: 'var(--blue-light)', color: 'var(--blue)' }}>A</div>
|
|
609
|
+
<div>
|
|
610
|
+
<div className="guide-judge-name">Correction Alignment</div>
|
|
611
|
+
<div className="guide-judge-role">Does the fix match the user's correction?</div>
|
|
612
|
+
</div>
|
|
613
|
+
</div>
|
|
614
|
+
<p className="guide-text-sm">
|
|
615
|
+
Checks that the skill improvement actually addresses the specific correction the user gave.
|
|
616
|
+
Prevents generic improvements that don't fix the root cause.
|
|
617
|
+
</p>
|
|
618
|
+
</div>
|
|
619
|
+
<div className="guide-judge-card" style={{ borderTopColor: 'var(--purple)' }}>
|
|
620
|
+
<div className="guide-judge-header">
|
|
621
|
+
<div className="guide-judge-icon" style={{ background: 'var(--purple-light)', color: 'var(--purple)' }}>S</div>
|
|
622
|
+
<div>
|
|
623
|
+
<div className="guide-judge-name">Side-Effect Check</div>
|
|
624
|
+
<div className="guide-judge-role">Does the change break anything else?</div>
|
|
625
|
+
</div>
|
|
626
|
+
</div>
|
|
627
|
+
<p className="guide-text-sm">
|
|
628
|
+
Compares the original and modified skill content to detect unintended side effects.
|
|
629
|
+
Ensures the fix doesn't introduce new problems or remove useful behaviors.
|
|
630
|
+
</p>
|
|
631
|
+
</div>
|
|
632
|
+
</div>
|
|
633
|
+
|
|
634
|
+
<Callout type="info">
|
|
635
|
+
<strong>Consensus rule:</strong> At least {'{'}judgeConsensusMin{'}'} out of 3 judges must score ≥ {'{'}judgePassScore{'}'}/10
|
|
636
|
+
for a proposal to pass. During stagnation (no improvements for {'{'}stopAfterNoImprovement{'}'} rounds), the
|
|
637
|
+
threshold is lowered by 1 to explore more possibilities.
|
|
638
|
+
</Callout>
|
|
639
|
+
</Section>
|
|
640
|
+
|
|
641
|
+
{/* ─── Pareto Frontier ─── */}
|
|
642
|
+
<Section id="frontier" title="Pareto Frontier" subtitle="Tracking the best-performing skill configurations across multiple objectives.">
|
|
643
|
+
<p className="guide-text">
|
|
644
|
+
The Pareto frontier maintains the top-K skill configurations that are not dominated in any scoring dimension.
|
|
645
|
+
A configuration is “dominated” if another configuration scores higher in all three dimensions. This prevents
|
|
646
|
+
optimizing for one metric at the expense of others.
|
|
647
|
+
</p>
|
|
648
|
+
|
|
649
|
+
<h3 className="guide-h3">Scoring Dimensions</h3>
|
|
650
|
+
<div className="grid-3" style={{ marginBottom: 20 }}>
|
|
651
|
+
<div className="guide-dimension-card" style={{ borderLeftColor: 'var(--green)' }}>
|
|
652
|
+
<div style={{ fontSize: 11, fontWeight: 700, color: 'var(--green)', textTransform: 'uppercase', letterSpacing: 1 }}>Task Completion</div>
|
|
653
|
+
<div style={{ fontSize: 12, color: 'var(--text-dim)', marginTop: 4 }}>How well does the agent complete user requests?</div>
|
|
654
|
+
</div>
|
|
655
|
+
<div className="guide-dimension-card" style={{ borderLeftColor: 'var(--blue)' }}>
|
|
656
|
+
<div style={{ fontSize: 11, fontWeight: 700, color: 'var(--blue)', textTransform: 'uppercase', letterSpacing: 1 }}>Correction Alignment</div>
|
|
657
|
+
<div style={{ fontSize: 12, color: 'var(--text-dim)', marginTop: 4 }}>How accurately do skills reflect user corrections?</div>
|
|
658
|
+
</div>
|
|
659
|
+
<div className="guide-dimension-card" style={{ borderLeftColor: 'var(--purple)' }}>
|
|
660
|
+
<div style={{ fontSize: 11, fontWeight: 700, color: 'var(--purple)', textTransform: 'uppercase', letterSpacing: 1 }}>Side-Effect Free</div>
|
|
661
|
+
<div style={{ fontSize: 12, color: 'var(--text-dim)', marginTop: 4 }}>How clean are changes — no unintended regressions?</div>
|
|
662
|
+
</div>
|
|
663
|
+
</div>
|
|
664
|
+
|
|
665
|
+
<Callout type="tip">
|
|
666
|
+
The frontier capacity defaults to 5. When a new configuration is admitted and the frontier is full,
|
|
667
|
+
the dominated configuration with the lowest overall score is evicted.
|
|
668
|
+
</Callout>
|
|
669
|
+
</Section>
|
|
670
|
+
|
|
671
|
+
{/* ─── Regression Testing ─── */}
|
|
672
|
+
<Section id="regression" title="Regression Testing" subtitle="Golden cases and cross-skill validation ensure quality.">
|
|
673
|
+
<h3 className="guide-h3">Golden Cases</h3>
|
|
674
|
+
<p className="guide-text">
|
|
675
|
+
Golden cases are regression test scenarios tied to specific skills. They're created when:
|
|
676
|
+
</p>
|
|
677
|
+
<ul className="guide-list">
|
|
678
|
+
<li><strong>Init:</strong> Automatically generated from existing SKILL.md files during <code>helixevo init</code></li>
|
|
679
|
+
<li><strong>Evolution:</strong> When a failure is resolved, the scenario is promoted to a golden case</li>
|
|
680
|
+
</ul>
|
|
681
|
+
<p className="guide-text">
|
|
682
|
+
Each golden case stores the input, context, and expected behavior. During regression testing,
|
|
683
|
+
an LLM judge evaluates whether the modified skill would still handle each scenario correctly.
|
|
684
|
+
</p>
|
|
685
|
+
|
|
686
|
+
<h3 className="guide-h3">Cross-Skill Regression</h3>
|
|
687
|
+
<p className="guide-text">
|
|
688
|
+
When skill A is modified, Helix also tests golden cases from co-evolved, dependent, and enhancing
|
|
689
|
+
partner skills. This catches silent incompatibilities where changing one skill breaks a related skill's behavior.
|
|
690
|
+
</p>
|
|
691
|
+
<Code title="How it works">{`Skill A evolves
|
|
692
|
+
→ Load skill graph edges
|
|
693
|
+
→ Find partners (co-evolves, depends, enhances)
|
|
694
|
+
→ Test partner golden cases against Skill A's changes
|
|
695
|
+
→ Block if partner pass rate < 95%`}</Code>
|
|
696
|
+
</Section>
|
|
697
|
+
|
|
698
|
+
{/* ─── Proactive Research ─── */}
|
|
699
|
+
<Section id="research" title="Proactive Research" subtitle="Discover new skill opportunities through web research.">
|
|
700
|
+
<p className="guide-text">
|
|
701
|
+
The <code>research</code> command doesn't wait for failures — it proactively searches the web for best practices,
|
|
702
|
+
new techniques, and emerging patterns that could become valuable skills.
|
|
703
|
+
</p>
|
|
704
|
+
<div className="guide-pipeline">
|
|
705
|
+
<PipelineStep icon="1" title="Goal Understanding" color="var(--blue)" desc="Analyzes your project files (README, CLAUDE.md, package.json) and current skills to identify capability gaps." />
|
|
706
|
+
<div className="guide-pipeline-connector" />
|
|
707
|
+
<PipelineStep icon="2" title="Frontier Scanning" color="var(--purple)" desc="Runs parallel web searches for each identified gap, focusing on 2025-2026 best practices." />
|
|
708
|
+
<div className="guide-pipeline-connector" />
|
|
709
|
+
<PipelineStep icon="3" title="Hypothesis Generation" color="var(--yellow)" desc="Transforms discoveries into testable hypotheses with confidence scores. Only high-confidence (≥0.7) hypotheses proceed." />
|
|
710
|
+
<div className="guide-pipeline-connector" />
|
|
711
|
+
<PipelineStep icon="4" title="Experimentation" color="var(--green)" desc="For each hypothesis: generates a SKILL.md, creates test scenarios, replays them, and judges quality. Iterates on existing drafts." />
|
|
712
|
+
<div className="guide-pipeline-connector" />
|
|
713
|
+
<PipelineStep icon="5" title="Knowledge Buffer" color="var(--blue)" desc="All discoveries are saved. Failed experiments are stored as drafts for future iteration. Nothing is wasted." />
|
|
714
|
+
</div>
|
|
715
|
+
</Section>
|
|
716
|
+
|
|
717
|
+
{/* ─── Skill Network ─── */}
|
|
718
|
+
<Section id="network" title="Skill Network" subtitle="Understanding the relationships between skills.">
|
|
719
|
+
<h3 className="guide-h3">Edge Types</h3>
|
|
720
|
+
<div className="guide-edge-grid">
|
|
721
|
+
{[
|
|
722
|
+
{ type: 'inherits', color: 'var(--purple)', desc: 'Parent-child hierarchy. Project skills inherit from domain skills.' },
|
|
723
|
+
{ type: 'depends', color: 'var(--blue)', desc: 'Hard dependency. Skill B requires Skill A to function correctly.' },
|
|
724
|
+
{ type: 'enhances', color: 'var(--green)', desc: 'Soft relationship. Skill A makes Skill B more effective.' },
|
|
725
|
+
{ type: 'conflicts', color: 'var(--red)', desc: 'Rule contradiction. Skill A\'s rules contradict Skill B\'s.' },
|
|
726
|
+
{ type: 'co-evolves', color: 'var(--yellow)', desc: 'Statistical. Skills that frequently change together in evolution.' },
|
|
727
|
+
].map(e => (
|
|
728
|
+
<div key={e.type} className="guide-edge-card">
|
|
729
|
+
<div className="guide-edge-type" style={{ color: e.color }}>{e.type}</div>
|
|
730
|
+
<div className="guide-edge-desc">{e.desc}</div>
|
|
731
|
+
</div>
|
|
732
|
+
))}
|
|
733
|
+
</div>
|
|
734
|
+
|
|
735
|
+
<h3 className="guide-h3">Detection Methods</h3>
|
|
736
|
+
<ul className="guide-list">
|
|
737
|
+
<li><strong>Explicit:</strong> Declared in SKILL.md frontmatter (<code>parent</code>, <code>dependencies</code>, <code>enhances</code>, <code>conflicts</code>)</li>
|
|
738
|
+
<li><strong>LLM-inferred:</strong> An LLM analyzes skill descriptions to find implicit relationships</li>
|
|
739
|
+
<li><strong>Statistical:</strong> Co-evolution detected from evolution history (skills that change together)</li>
|
|
740
|
+
</ul>
|
|
741
|
+
|
|
742
|
+
<Code title="SKILL.md frontmatter example">{`---
|
|
743
|
+
name: React Testing Patterns
|
|
744
|
+
description: Best practices for testing React components
|
|
745
|
+
layer: domain
|
|
746
|
+
tags: [react, testing]
|
|
747
|
+
dependencies: [react-best-practices]
|
|
748
|
+
enhances: [code-review-guidelines]
|
|
749
|
+
score: 0.85
|
|
750
|
+
generation: 3
|
|
751
|
+
---
|
|
752
|
+
|
|
753
|
+
## Rules
|
|
754
|
+
...`}</Code>
|
|
755
|
+
</Section>
|
|
756
|
+
|
|
757
|
+
{/* ─── Configuration ─── */}
|
|
758
|
+
<Section id="config" title="Configuration" subtitle="All configurable parameters and their defaults.">
|
|
759
|
+
<h3 className="guide-h3">Evolution</h3>
|
|
760
|
+
<div className="guide-params">
|
|
761
|
+
<Param name="model" type="string" desc="LLM model for proposals and clustering." def='"sonnet"' />
|
|
762
|
+
<Param name="judgeModel" type="string" desc="LLM model for judge evaluations." def='"sonnet"' />
|
|
763
|
+
<Param name="evolution.schedule" type="cron" desc="When to run automatic evolution." def='"0 2 * * *"' />
|
|
764
|
+
<Param name="evolution.minFailuresForEvolution" type="number" desc="Minimum unresolved failures before evolve runs." def="5" />
|
|
765
|
+
<Param name="evolution.maxFailuresPerRun" type="number" desc="Max failures to process per run." def="20" />
|
|
766
|
+
<Param name="evolution.maxProposalsPerRun" type="number" desc="Max proposals to generate per run." def="5" />
|
|
767
|
+
<Param name="evolution.stopAfterNoImprovement" type="number" desc="Rounds without improvement before stagnation mode." def="3" />
|
|
768
|
+
<Param name="evolution.frontierCapacity" type="number" desc="Maximum configurations in the Pareto frontier." def="5" />
|
|
769
|
+
</div>
|
|
770
|
+
|
|
771
|
+
<h3 className="guide-h3">Quality Gates</h3>
|
|
772
|
+
<div className="guide-params">
|
|
773
|
+
<Param name="quality.judgePassScore" type="number" desc="Minimum judge score to pass (1-10)." def="7" />
|
|
774
|
+
<Param name="quality.judgeConsensusMin" type="number" desc="Minimum judges that must pass." def="2" />
|
|
775
|
+
<Param name="quality.regressionPassRate" type="number" desc="Minimum golden case pass rate (0-1)." def="0.95" />
|
|
776
|
+
<Param name="quality.canaryDurationDays" type="number" desc="Days to monitor canary deployments." def="3" />
|
|
777
|
+
<Param name="quality.autoRollbackThreshold" type="number" desc="Failure rate multiplier triggering rollback." def="1.5" />
|
|
778
|
+
<Param name="quality.maxGoldenCases" type="number" desc="Maximum golden cases per skill." def="50" />
|
|
779
|
+
</div>
|
|
780
|
+
|
|
781
|
+
<Code title="~/.helix/config.json">{`{
|
|
782
|
+
"model": "sonnet",
|
|
783
|
+
"judgeModel": "sonnet",
|
|
784
|
+
"evolution": {
|
|
785
|
+
"schedule": "0 2 * * *",
|
|
786
|
+
"minFailuresForEvolution": 5,
|
|
787
|
+
"maxFailuresPerRun": 20,
|
|
788
|
+
"maxProposalsPerRun": 5,
|
|
789
|
+
"stopAfterNoImprovement": 3,
|
|
790
|
+
"frontierCapacity": 5
|
|
791
|
+
},
|
|
792
|
+
"quality": {
|
|
793
|
+
"judgePassScore": 7,
|
|
794
|
+
"judgeConsensusMin": 2,
|
|
795
|
+
"regressionPassRate": 0.95,
|
|
796
|
+
"canaryDurationDays": 3,
|
|
797
|
+
"autoRollbackThreshold": 1.5,
|
|
798
|
+
"maxGoldenCases": 50
|
|
799
|
+
}
|
|
800
|
+
}`}</Code>
|
|
801
|
+
</Section>
|
|
802
|
+
|
|
803
|
+
{/* ─── Data & Storage ─── */}
|
|
804
|
+
<Section id="data" title="Data & Storage" subtitle="Where everything lives and how it's structured.">
|
|
805
|
+
<Code title="~/.helix/ directory structure">{`~/.helix/
|
|
806
|
+
├── config.json # Configuration
|
|
807
|
+
├── failures.jsonl # Captured failure records (append-only)
|
|
808
|
+
├── frontier.json # Pareto frontier (top-K programs)
|
|
809
|
+
├── evolution-history.json # All evolution iterations + proposals
|
|
810
|
+
├── golden-cases.jsonl # Regression test cases (append-only)
|
|
811
|
+
├── skill-graph.json # Cached network (nodes + edges)
|
|
812
|
+
├── canary-registry.json # Active canary deployments
|
|
813
|
+
├── knowledge-buffer.json # Research discoveries + drafts
|
|
814
|
+
├── general/ # Skills (SKILL.md files)
|
|
815
|
+
│ ├── my-skill/SKILL.md
|
|
816
|
+
│ └── ...
|
|
817
|
+
├── backups/ # Pre-canary skill backups
|
|
818
|
+
└── reports/ # Generated evolution reports`}</Code>
|
|
819
|
+
|
|
820
|
+
<h3 className="guide-h3">Key Data Formats</h3>
|
|
821
|
+
<div className="grid-2" style={{ gap: 12 }}>
|
|
822
|
+
<div className="guide-data-card">
|
|
823
|
+
<div className="guide-data-title">Failure Record</div>
|
|
824
|
+
<Code>{`{
|
|
825
|
+
"id": "f_abc123",
|
|
826
|
+
"sessionId": "session-001",
|
|
827
|
+
"project": "myapp",
|
|
828
|
+
"userRequest": "Add a login page",
|
|
829
|
+
"agentAction": "Created login.tsx",
|
|
830
|
+
"correction": "Use the existing auth hook",
|
|
831
|
+
"correctionType": "verbal",
|
|
832
|
+
"skillsActive": ["react-patterns"],
|
|
833
|
+
"resolved": false
|
|
834
|
+
}`}</Code>
|
|
835
|
+
</div>
|
|
836
|
+
<div className="guide-data-card">
|
|
837
|
+
<div className="guide-data-title">Golden Case</div>
|
|
838
|
+
<Code>{`{
|
|
839
|
+
"id": "gc_react_42",
|
|
840
|
+
"skill": "react-patterns",
|
|
841
|
+
"input": "Create a form component",
|
|
842
|
+
"context": "React 19, TypeScript",
|
|
843
|
+
"expectedBehavior": "Uses useActionState for form submission...",
|
|
844
|
+
"lastResult": "pass",
|
|
845
|
+
"consecutivePasses": 5
|
|
846
|
+
}`}</Code>
|
|
847
|
+
</div>
|
|
848
|
+
</div>
|
|
849
|
+
</Section>
|
|
850
|
+
|
|
851
|
+
{/* ─── Skill Management ─── */}
|
|
852
|
+
<Section id="manage" title="Skill Management" subtitle="Edit, create, promote, and delete skills directly from the dashboard.">
|
|
853
|
+
<p className="guide-text">
|
|
854
|
+
The <strong>Skill Network</strong> page in the dashboard is fully interactive. Click any skill to open its
|
|
855
|
+
detail panel, then use the action buttons to modify it. The network automatically adapts to your changes
|
|
856
|
+
and tells you when it can't.
|
|
857
|
+
</p>
|
|
858
|
+
|
|
859
|
+
<h3 className="guide-h3">Actions</h3>
|
|
860
|
+
<div className="guide-edge-grid">
|
|
861
|
+
{[
|
|
862
|
+
{ type: '✎ Edit', color: 'var(--purple)', desc: 'Modify skill content inline. Dark-themed code editor with live save. Co-evolved partner skills are flagged for review.' },
|
|
863
|
+
{ type: '↑ Promote', color: 'var(--blue)', desc: 'Move skill up one layer (project → domain → system). Warns about skipping layers and checks for overlap.' },
|
|
864
|
+
{ type: '↓ Demote', color: 'var(--yellow)', desc: 'Move skill down one layer. Shows what depends on it and what might break.' },
|
|
865
|
+
{ type: '✕ Delete', color: 'var(--red)', desc: 'Remove skill with confirmation. Shows orphaned children, broken dependencies, and coverage impact before deletion.' },
|
|
866
|
+
{ type: '+ Create', color: 'var(--green)', desc: 'Create a new skill from scratch via the "+ New Skill" button. Includes slug input and SKILL.md template editor.' },
|
|
867
|
+
].map(e => (
|
|
868
|
+
<div key={e.type} className="guide-edge-card">
|
|
869
|
+
<div className="guide-edge-type" style={{ color: e.color }}>{e.type}</div>
|
|
870
|
+
<div className="guide-edge-desc">{e.desc}</div>
|
|
871
|
+
</div>
|
|
872
|
+
))}
|
|
873
|
+
</div>
|
|
874
|
+
|
|
875
|
+
<h3 className="guide-h3">Network Adaptation Feedback</h3>
|
|
876
|
+
<p className="guide-text">
|
|
877
|
+
After every action, the network adaptation engine analyzes the impact and shows feedback inline:
|
|
878
|
+
</p>
|
|
879
|
+
<ul className="guide-list">
|
|
880
|
+
<li><strong>Status badge:</strong> <span style={{ color: 'var(--green)' }}>ok</span> / <span style={{ color: 'var(--yellow)' }}>warning</span> / <span style={{ color: 'var(--red)' }}>action-needed</span></li>
|
|
881
|
+
<li><strong>Messages:</strong> What the change affects — orphaned skills, broken edges, co-evolved partners</li>
|
|
882
|
+
<li><strong>Suggestions:</strong> Actionable next steps — rewire children, merge with existing skills, run graph rebuild</li>
|
|
883
|
+
</ul>
|
|
884
|
+
<Callout type="info">
|
|
885
|
+
If the network <strong>cannot adapt</strong> to your change (e.g., deleting a parent skill with children),
|
|
886
|
+
it will show “action-needed” status and tell you exactly what to fix before proceeding.
|
|
887
|
+
</Callout>
|
|
888
|
+
</Section>
|
|
889
|
+
|
|
890
|
+
{/* ─── Craft Agent Integration ─── */}
|
|
891
|
+
<Section id="craft" title="Craft Agent Integration" subtitle="Use Helix from within Craft Agent.">
|
|
892
|
+
<p className="guide-text">
|
|
893
|
+
Helix ships with a Craft Agent skill at <code>integrations/craft-agent/</code>.
|
|
894
|
+
Install it to trigger evolution directly from your Craft Agent sessions.
|
|
895
|
+
</p>
|
|
896
|
+
<Step n={1} title="Copy the skill">
|
|
897
|
+
<Code title="Terminal">{`cp -r integrations/craft-agent/skills/skill-evolver ~/.agents/skills/`}</Code>
|
|
898
|
+
</Step>
|
|
899
|
+
<Step n={2} title="Use in a session">
|
|
900
|
+
<Code title="Craft Agent">{`[skill:skill-evolver] Run evolution on my latest session`}</Code>
|
|
901
|
+
</Step>
|
|
902
|
+
<Callout type="tip">
|
|
903
|
+
The skill-evolver skill automatically captures failures from your current Craft Agent session,
|
|
904
|
+
runs evolution, and reports results — all without leaving the conversation.
|
|
905
|
+
</Callout>
|
|
906
|
+
</Section>
|
|
907
|
+
|
|
908
|
+
{/* ─── FAQ ─── */}
|
|
909
|
+
<Section id="faq" title="FAQ" subtitle="Frequently asked questions.">
|
|
910
|
+
<FAQItem q="How many failures do I need before evolution works?">
|
|
911
|
+
By default, 5 unresolved failures are required (<code>minFailuresForEvolution</code>).
|
|
912
|
+
This ensures enough signal for meaningful pattern detection.
|
|
913
|
+
</FAQItem>
|
|
914
|
+
<FAQItem q="What LLM model does Helix use?">
|
|
915
|
+
Helix uses <code>claude --print</code> with configurable models (default: <code>sonnet</code>).
|
|
916
|
+
No API key is needed — it requires a Claude Max plan subscription. Judges and proposals can use different models.
|
|
917
|
+
</FAQItem>
|
|
918
|
+
<FAQItem q="Can I use Helix with other AI agents?">
|
|
919
|
+
Yes. Helix manages standard SKILL.md files with YAML frontmatter. Any agent that reads SKILL.md
|
|
920
|
+
files can benefit from Helix's evolution pipeline.
|
|
921
|
+
</FAQItem>
|
|
922
|
+
<FAQItem q="What happens during canary rollback?">
|
|
923
|
+
If the failure rate for a canary skill exceeds <code>autoRollbackThreshold</code> (default: 1.5x),
|
|
924
|
+
the skill is automatically reverted from the backup. The failed evolution is recorded in history.
|
|
925
|
+
</FAQItem>
|
|
926
|
+
<FAQItem q="How does cross-skill regression work?">
|
|
927
|
+
When Skill A evolves, Helix checks the skill graph for co-evolved, dependent, and enhancing
|
|
928
|
+
partners. It tests their golden cases against Skill A's changes. If partner pass rate drops below 95%,
|
|
929
|
+
the proposal is rejected.
|
|
930
|
+
</FAQItem>
|
|
931
|
+
<FAQItem q="How does the knowledge buffer work?">
|
|
932
|
+
All research discoveries are saved, even if the resulting hypothesis fails. Failed experiments above
|
|
933
|
+
a minimum score are saved as drafts. When the same hypothesis appears again, Helix iterates on
|
|
934
|
+
the draft rather than starting from scratch.
|
|
935
|
+
</FAQItem>
|
|
936
|
+
<FAQItem q="How does helixevo watch detect corrections?">
|
|
937
|
+
Two-stage detection: (1) Fast regex pre-filter checks for correction signals in English and Chinese
|
|
938
|
+
(e.g., "wrong", "not like that", "不对", "改成"). (2) If a signal is detected, an LLM analyzes
|
|
939
|
+
the recent conversation window to extract structured failure records with confidence scores (>0.7 required).
|
|
940
|
+
</FAQItem>
|
|
941
|
+
<FAQItem q="How do I prove Helix is working?">
|
|
942
|
+
Run <code>helixevo metrics</code>. It tracks correction rates per skill over 7-day rolling windows
|
|
943
|
+
and compares before/after rates for each evolution. The verdict shows how many evolutions actually
|
|
944
|
+
reduced corrections. This is closed-loop measurement — not LLM scores judging LLM output.
|
|
945
|
+
</FAQItem>
|
|
946
|
+
<FAQItem q="What is network health?">
|
|
947
|
+
Network health scores the entire skill network as a unit across 4 dimensions: cohesion (connectivity),
|
|
948
|
+
coverage (failure mapping), balance (hierarchy structure), and transfer rate (cross-project reuse).
|
|
949
|
+
It runs after every evolution and is shown in the summary. Run <code>helixevo health</code> to see the full report.
|
|
950
|
+
</FAQItem>
|
|
951
|
+
<FAQItem q="When does auto-generalization trigger?">
|
|
952
|
+
Automatically during <code>helixevo evolve</code> when the network health assessment detects cross-project
|
|
953
|
+
patterns — the same failure type appearing in 2+ projects. It creates a domain-level abstract skill,
|
|
954
|
+
validates with regression tests, and sets up parent/child inheritance with the source project skills.
|
|
955
|
+
</FAQItem>
|
|
956
|
+
</Section>
|
|
957
|
+
|
|
958
|
+
{/* Footer */}
|
|
959
|
+
<div className="guide-footer">
|
|
960
|
+
<div className="guide-footer-content">
|
|
961
|
+
<div style={{ fontSize: 13, fontWeight: 600 }}>Helix v0.2.1</div>
|
|
962
|
+
<div style={{ fontSize: 12, color: 'var(--text-dim)', marginTop: 4 }}>
|
|
963
|
+
Self-evolving skill ecosystem for AI agents · MIT License
|
|
964
|
+
</div>
|
|
965
|
+
</div>
|
|
966
|
+
</div>
|
|
967
|
+
</div>
|
|
968
|
+
</div>
|
|
969
|
+
)
|
|
970
|
+
}
|
|
971
|
+
|
|
972
|
+
// ─── FAQ Item ───────────────────────────────────────────────────
|
|
973
|
+
/**
 * Collapsible FAQ entry: a question row that toggles the visibility of its answer.
 *
 * The entry starts collapsed. Clicking anywhere on the item outside the answer
 * toggles it; the question row is also focusable and can be toggled with
 * Enter or Space.
 *
 * @param q        The question text shown in the always-visible header row.
 * @param children The answer content, rendered only while the entry is open.
 */
function FAQItem({ q, children }: { q: string; children: React.ReactNode }) {
  const [open, setOpen] = useState(false)

  // Functional update so the next state is derived from the latest state
  // rather than from the value captured by this render's closure.
  const toggle = () => setOpen(prev => !prev)

  return (
    <div className={`guide-faq-item ${open ? 'open' : ''}`} onClick={toggle}>
      <div
        className="guide-faq-question"
        role="button"
        tabIndex={0}
        aria-expanded={open}
        onKeyDown={e => {
          // A div with role="button" gets no native keyboard activation, so
          // handle Enter/Space here. preventDefault keeps Space from scrolling the page.
          if (e.key === 'Enter' || e.key === ' ') {
            e.preventDefault()
            toggle()
          }
        }}
      >
        <span>{q}</span>
        {/* The +/− glyph is decorative; the state is conveyed by aria-expanded. */}
        <span className="guide-faq-toggle" aria-hidden="true">{open ? '−' : '+'}</span>
      </div>
      {/* Stop clicks inside the answer (text selection, inline code) from
          bubbling to the item-level handler and collapsing the entry. */}
      {open && (
        <div className="guide-faq-answer" onClick={e => e.stopPropagation()}>
          {children}
        </div>
      )}
    </div>
  )
}
|