npm - helixevo - Versions diffs - 0.2.7 → 0.2.8 - Mend

helixevo 0.2.7 → 0.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

package/README.md +4 -4
package/dashboard/app/api/skills/route.ts +1 -1
package/dashboard/app/guide/page.tsx +18 -18
package/dashboard/lib/data.ts +5 -3
package/dist/cli.js +26 -23
package/package.json +1 -1

package/README.md CHANGED Viewed

@@ -12,7 +12,7 @@ HelixEvo builds on ideas from [EvoSkill](https://arxiv.org/abs/2603.02766) and [
 Every proposed change goes through:
 1. **3 independent LLM judges** (Task Completion, Correction Alignment, Side-Effect Check)
-2. **Regression testing** against golden cases
+2. **Regression testing** against skill tests
 3. **3-day canary deployment** with auto-rollback
 ## Prerequisites
@@ -57,7 +57,7 @@ npm link
 ## Quick Start
 ```bash
-# 1. Initialize — imports existing skills + generates golden cases
+# 1. Initialize — imports existing skills + generates skill tests
 helixevo init
 # 2. Capture failures from a session
@@ -80,7 +80,7 @@ helixevo dashboard
 | `helixevo watch` | Always-on learning: auto-capture + auto-evolve |
 | `helixevo metrics` | Correction rates, skill trends, evolution impact |
 | `helixevo health` | Network health: cohesion, coverage, balance, transfer |
-| `helixevo init` | Import existing skills + generate golden cases |
+| `helixevo init` | Import existing skills + generate skill tests |
 | `helixevo capture <session>` | Extract failures from a session file |
 | `helixevo evolve` | Evolve skills from captured failures |
 | `helixevo generalize` | Promote cross-project patterns ↑ |
@@ -126,7 +126,7 @@ All data is stored in `~/.helix/`:
 ├── failures.jsonl           # Captured failures
 ├── frontier.json            # Pareto frontier (top-k configurations)
 ├── evolution-history.json   # All evolution runs + proposals
-├── golden-cases.jsonl       # Regression test cases
+├── skill-tests.jsonl       # Regression test cases
 ├── skill-graph.json         # Cached network (nodes + edges)
 ├── canary-registry.json     # Active canary deployments
 ├── knowledge-buffer.json    # Research discoveries + drafts

package/dashboard/app/api/skills/route.ts CHANGED Viewed

@@ -122,7 +122,7 @@ function analyzeNetworkAdaptation(
       })
       result.suggestions.push({
         type: 'rewire',
-        description: `Review golden cases for [${partners.join(', ')}] after this change`
+        description: `Review skill tests for [${partners.join(', ')}] after this change`
       })
     }
   }

package/dashboard/app/guide/page.tsx CHANGED Viewed

@@ -148,7 +148,7 @@ function ArchitectureDiagram() {
         <div className="guide-diagram-box guide-diagram-check" style={{ direction: 'ltr' }}>
           <div className="guide-diagram-box-label">Validate</div>
           <div className="guide-diagram-box-title">Regression Tests</div>
-          <div className="guide-diagram-box-desc">Golden cases + cross-skill</div>
+          <div className="guide-diagram-box-desc">Skill tests + cross-skill</div>
         </div>
         <div className="guide-diagram-arrow" style={{ direction: 'ltr' }}>←</div>
         <div className="guide-diagram-box guide-diagram-judge" style={{ direction: 'ltr' }}>
@@ -317,7 +317,7 @@ cd helixevo && npm install && npm run build && npm link`}</Code>
             <Code title="Terminal">{`helixevo init`}</Code>
             <p className="guide-text-sm">
               This scans your existing SKILL.md files (from <code>~/.agents/skills/</code>), imports them into HelixEvo,
-              and generates golden test cases for each skill. It also creates the data directory at <code>~/.helix/</code>.
+              and generates skill tests for each skill. It also creates the data directory at <code>~/.helix/</code>.
             </p>
           </Step>
@@ -370,7 +370,7 @@ helixevo status`}</Code>
               },
               {
                 cmd: 'helixevo init',
-                desc: 'Import existing skills and generate golden test cases. Scans ~/.agents/skills/ and creates the HelixEvo data directory.',
+                desc: 'Import existing skills and generate skill tests. Scans ~/.agents/skills/ and creates the HelixEvo data directory.',
                 flags: ['--verbose'],
               },
               {
@@ -545,7 +545,7 @@ Project B: "Use FlashList not FlatList" (React Native perf)
   → Abstract skill created: "react-native-performance" (domain layer)
   → Project A skill inherits from it
   → Project B skill inherits from it
-  → Domain skill tested against all golden cases
+  → Domain skill tested against all skill tests
   → Deployed if regression passes`}</Code>
           <Callout type="tip">
             Auto-generalization is the key to the <strong>double helix</strong> metaphor: as projects evolve, skills
@@ -601,7 +601,7 @@ Project B: "Use FlashList not FlatList" (React Native perf)
             <div className="guide-pipeline-connector" />
             <PipelineStep
               icon="5" title="Regression Testing" color="var(--red)"
-              desc="The modified skill is tested against all golden cases for that skill AND co-evolved partner skills. Must maintain ≥95% pass rate."
+              desc="The modified skill is tested against all skill tests for that skill AND co-evolved partner skills. Must maintain ≥95% pass rate."
             />
             <div className="guide-pipeline-connector" />
             <PipelineStep
@@ -693,29 +693,29 @@ Project B: "Use FlashList not FlatList" (React Native perf)
         </Section>
         {/* ─── Regression Testing ─── */}
-        <Section id="regression" title="Regression Testing" subtitle="Golden cases and cross-skill validation ensure quality.">
-          <h3 className="guide-h3">Golden Cases</h3>
+        <Section id="regression" title="Regression Testing" subtitle="Skill tests and cross-skill validation ensure quality.">
+          <h3 className="guide-h3">Skill Tests</h3>
           <p className="guide-text">
-            Golden cases are regression test scenarios tied to specific skills. They&apos;re created when:
+            Skill tests are regression test scenarios tied to specific skills. They&apos;re created when:
           </p>
           <ul className="guide-list">
             <li><strong>Init:</strong> Automatically generated from existing SKILL.md files during <code>helixevo init</code></li>
-            <li><strong>Evolution:</strong> When a failure is resolved, the scenario is promoted to a golden case</li>
+            <li><strong>Evolution:</strong> When a failure is resolved, the scenario is promoted to a skill test</li>
           </ul>
           <p className="guide-text">
-            Each golden case stores the input, context, and expected behavior. During regression testing,
+            Each skill test stores the input, context, and expected behavior. During regression testing,
             an LLM judge evaluates whether the modified skill would still handle each scenario correctly.
           </p>
           <h3 className="guide-h3">Cross-Skill Regression</h3>
           <p className="guide-text">
-            When skill A is modified, HelixEvo also tests golden cases from co-evolved, dependent, and enhancing
+            When skill A is modified, HelixEvo also tests skill tests from co-evolved, dependent, and enhancing
             partner skills. This catches silent incompatibilities where changing one skill breaks a related skill&apos;s behavior.
           </p>
           <Code title="How it works">{`Skill A evolves
   → Load skill graph edges
   → Find partners (co-evolves, depends, enhances)
-  → Test partner golden cases against Skill A's changes
+  → Test partner skill tests against Skill A's changes
   → Block if partner pass rate < 95%`}</Code>
         </Section>
@@ -796,10 +796,10 @@ generation: 3
           <div className="guide-params">
             <Param name="quality.judgePassScore" type="number" desc="Minimum judge score to pass (1-10)." def="7" />
             <Param name="quality.judgeConsensusMin" type="number" desc="Minimum judges that must pass." def="2" />
-            <Param name="quality.regressionPassRate" type="number" desc="Minimum golden case pass rate (0-1)." def="0.95" />
+            <Param name="quality.regressionPassRate" type="number" desc="Minimum skill test pass rate (0-1)." def="0.95" />
             <Param name="quality.canaryDurationDays" type="number" desc="Days to monitor canary deployments." def="3" />
             <Param name="quality.autoRollbackThreshold" type="number" desc="Failure rate multiplier triggering rollback." def="1.5" />
-            <Param name="quality.maxGoldenCases" type="number" desc="Maximum golden cases per skill." def="50" />
+            <Param name="quality.maxSkillTests" type="number" desc="Maximum skill tests per skill." def="50" />
           </div>
           <Code title="~/.helix/config.json">{`{
@@ -819,7 +819,7 @@ generation: 3
     "regressionPassRate": 0.95,
     "canaryDurationDays": 3,
     "autoRollbackThreshold": 1.5,
-    "maxGoldenCases": 50
+    "maxSkillTests": 50
   }
 }`}</Code>
         </Section>
@@ -831,7 +831,7 @@ generation: 3
 ├── failures.jsonl           # Captured failure records (append-only)
 ├── frontier.json            # Pareto frontier (top-K programs)
 ├── evolution-history.json   # All evolution iterations + proposals
-├── golden-cases.jsonl       # Regression test cases (append-only)
+├── skill-tests.jsonl       # Regression test cases (append-only)
 ├── skill-graph.json         # Cached network (nodes + edges)
 ├── canary-registry.json     # Active canary deployments
 ├── knowledge-buffer.json    # Research discoveries + drafts
@@ -858,7 +858,7 @@ generation: 3
 }`}</Code>
             </div>
             <div className="guide-data-card">
-              <div className="guide-data-title">Golden Case</div>
+              <div className="guide-data-title">Skill Test</div>
               <Code>{`{
   "id": "gc_react_42",
   "skill": "react-patterns",
@@ -949,7 +949,7 @@ generation: 3
           </FAQItem>
           <FAQItem q="How does cross-skill regression work?">
             When Skill A evolves, HelixEvo checks the skill graph for co-evolved, dependent, and enhancing
-            partners. It tests their golden cases against Skill A&apos;s changes. If partner pass rate drops below 95%,
+            partners. It tests their skill tests against Skill A&apos;s changes. If partner pass rate drops below 95%,
             the proposal is rejected.
           </FAQItem>
           <FAQItem q="How does the knowledge buffer work?">

package/dashboard/lib/data.ts CHANGED Viewed

@@ -89,7 +89,9 @@ export function loadHistory(): { iterations: Iteration[] } {
   return readJson<{ iterations: Iteration[] }>('evolution-history.json', { iterations: [] })
 }
-export function loadGoldenCases(): { id: string; skill: string; input: string }[] {
+export function loadSkillTests(): { id: string; skill: string; input: string }[] {
+  const newFile = readJsonl('skill-tests.jsonl')
+  if (newFile.length > 0) return newFile
   return readJsonl('golden-cases.jsonl')
 }
@@ -126,7 +128,7 @@ export function getDashboardSummary() {
   const history = loadHistory()
   const buffer = loadBuffer()
   const canaries = loadCanaries()
-  const goldenCases = loadGoldenCases()
+  const skillTests = loadSkillTests()
   const evolved = graph.nodes.filter(n => n.generation > 0)
   const totalProposals = history.iterations.flatMap(i => i.proposals)
@@ -141,6 +143,6 @@ export function getDashboardSummary() {
     evolution: { runs: history.iterations.length, accepted: accepted.length, rejected: rejected.length },
     buffer: { discoveries: buffer.discoveries.length, drafts: buffer.drafts.length },
     canaries: canaries.entries.length,
-    goldenCases: goldenCases.length,
+    skillTests: skillTests.length,
   }
 }

package/dist/cli.js CHANGED Viewed

@@ -2129,7 +2129,7 @@ var init_config = __esm(() => {
       regressionPassRate: 0.95,
       canaryDurationDays: 3,
       autoRollbackThreshold: 1.5,
-      maxGoldenCases: 50
+      maxSkillTests: 50
     },
     reporting: {
       schedule: "0 8 * * *",
@@ -9226,11 +9226,14 @@ function loadHistory() {
 function saveHistory(history) {
   writeJson("evolution-history.json", history);
 }
-function loadGoldenCases() {
+function loadSkillTests() {
+  const newFile = readJsonl("skill-tests.jsonl");
+  if (newFile.length > 0)
+    return newFile;
   return readJsonl("golden-cases.jsonl");
 }
-function appendGoldenCase(gc) {
-  appendJsonl("golden-cases.jsonl", gc);
+function appendSkillTest(gc) {
+  appendJsonl("skill-tests.jsonl", gc);
 }
 function loadSkillGraph() {
   return readJson("skill-graph.json", {
@@ -9577,8 +9580,8 @@ import { join as join3 } from "node:path";
 import { homedir as homedir2 } from "node:os";
 import { existsSync as existsSync4, cpSync } from "node:fs";
-// src/prompts/golden-gen.ts
-function buildGoldenGenPrompt(skill) {
+// src/prompts/test-gen.ts
+function buildTestGenPrompt(skill) {
   return `Read this skill and generate 3 typical usage scenarios where the skill should guide correct behavior.
 ## Skill: ${skill.meta.name}
@@ -9650,13 +9653,13 @@ async function initCommand(options) {
   console.log(`
   Imported ${imported} new skills
 `);
-  if (!options.skipGolden) {
+  if (!options.skipTests) {
     const generalSkills = loadAllGeneralSkills();
-    console.log(`  Generating golden cases...
+    console.log(`  Generating skill tests...
 `);
     for (const skill of generalSkills) {
       try {
-        const prompt = buildGoldenGenPrompt(skill);
+        const prompt = buildTestGenPrompt(skill);
         const output = await chatJson({ prompt });
         for (const c of output.cases) {
           const gc = {
@@ -9671,11 +9674,11 @@ async function initCommand(options) {
             lastResult: "pass",
             consecutivePasses: 1
           };
-          appendGoldenCase(gc);
+          appendSkillTest(gc);
         }
-        console.log(`    ✓ ${skill.slug}: ${output.cases.length} golden cases`);
+        console.log(`    ✓ ${skill.slug}: ${output.cases.length} skill tests`);
       } catch (err) {
-        console.log(`    ✗ ${skill.slug}: failed to generate golden cases (${err})`);
+        console.log(`    ✗ ${skill.slug}: failed to generate skill tests (${err})`);
       }
     }
   }
@@ -9989,11 +9992,11 @@ init_config();
 init_llm();
 async function runRegression(skillSlug, newSkillContent, verbose = false) {
   const config = loadConfig();
-  const allCases = loadGoldenCases();
+  const allCases = loadSkillTests();
   const cases = allCases.filter((gc) => gc.skill === skillSlug);
   if (cases.length === 0) {
     if (verbose)
-      console.log(`    No golden cases for ${skillSlug}, skipping regression`);
+      console.log(`    No skill tests for ${skillSlug}, skipping regression`);
     return { total: 0, passed: 0, passRate: 1, failures: [] };
   }
   const failures = [];
@@ -10020,7 +10023,7 @@ async function runRegression(skillSlug, newSkillContent, verbose = false) {
     failures
   };
 }
-function promoteToGoldenCase(failure, skillSlug, replayResult) {
+function promoteToSkillTest(failure, skillSlug, replayResult) {
   const gc = {
     id: `gc_${skillSlug}_${Date.now() % 1e5}`,
     addedAt: new Date().toISOString(),
@@ -10033,7 +10036,7 @@ function promoteToGoldenCase(failure, skillSlug, replayResult) {
     lastResult: "pass",
     consecutivePasses: 1
   };
-  appendGoldenCase(gc);
+  appendSkillTest(gc);
 }
 function buildRegressionJudgePrompt(gc, skillContent) {
   return `You are a regression test judge. Determine if a modified skill can still handle this scenario correctly.
@@ -10072,7 +10075,7 @@ async function runCrossSkillRegression(skillSlug, newSkillContent, verbose = fal
     return { total: 0, passed: 0, passRate: 1, testedSkills: [] };
   }
   const config = loadConfig();
-  const allCases = loadGoldenCases();
+  const allCases = loadSkillTests();
   const partnerCases = allCases.filter((gc) => partners.includes(gc.skill));
   if (partnerCases.length === 0) {
     return { total: 0, passed: 0, passRate: 1, testedSkills: partners };
@@ -10496,7 +10499,7 @@ async function evolveCommand(options) {
       const skillFailureCount = allFailures.filter((f) => f.skillsActive.includes(skillSlug2)).length;
       deployCanary(skillSlug2, `v${generation}`, backupPath, config.quality.canaryDurationDays, skillFailureCount);
       console.log(`  \uD83D\uDC25 Canary deployed: ${config.quality.canaryDurationDays} day monitoring period`);
-      promoteToGoldenCase(testFailure, skillSlug2, replayResult);
+      promoteToSkillTest(testFailure, skillSlug2, replayResult);
       const program2 = {
         id: `gen${generation}-${skillSlug2}`,
         generation,
@@ -10703,7 +10706,7 @@ async function statusCommand() {
   const unresolved = failures.filter((f) => !f.resolved);
   const frontier = loadFrontier();
   const skills = loadAllGeneralSkills();
-  const goldenCases = loadGoldenCases();
+  const skillTests = loadSkillTests();
   const stagnation = getStagnationCount();
   const recentIter = getRecentIterations(7);
   console.log(`\uD83E\uDDEC HelixEvo Status
@@ -10722,7 +10725,7 @@ async function statusCommand() {
   }
   console.log(`
   Failures: ${unresolved.length} unresolved / ${failures.length} total`);
-  console.log(`  Golden cases: ${goldenCases.length}`);
+  console.log(`  Skill tests: ${skillTests.length}`);
   const buffer = getBufferStats();
   console.log(`
   Knowledge Buffer:`);
@@ -12924,12 +12927,12 @@ async function metricsCommand(options) {
 // src/cli.ts
 var program2 = new Command;
-program2.name("helixevo").description("Self-evolving skill ecosystem for AI agents").version("0.2.7").addHelpText("after", `
+program2.name("helixevo").description("Self-evolving skill ecosystem for AI agents").version("0.2.8").addHelpText("after", `
 Examples:
   $ helixevo watch                         Always-on learning (auto-capture + auto-evolve)
   $ helixevo watch --project myapp         Watch with project context
   $ helixevo metrics                       Show correction rates and evolution impact
-  $ helixevo init                          Import skills + generate golden cases
+  $ helixevo init                          Import skills + generate skill tests
   $ helixevo status                        Show system health
   $ helixevo evolve --verbose              Evolve skills from failures
   $ helixevo evolve --dry-run              Preview proposals without applying
@@ -12943,7 +12946,7 @@ Examples:
   $ helixevo graph --optimize              Detect merge/split/conflicts
   $ helixevo report --days 7               Weekly evolution report
   $ helixevo capture <session.json>        Extract failures from session`);
-program2.command("init").description("Import existing skills + generate golden cases").option("--skills-paths <paths...>", "Paths to scan for existing skills").option("--skip-golden", "Skip golden case generation").action(initCommand);
+program2.command("init").description("Import existing skills + generate skill tests").option("--skills-paths <paths...>", "Paths to scan for existing skills").option("--skip-tests", "Skip skill test generation").action(initCommand);
 program2.command("capture").description("Capture failures from a Craft Agent session").argument("<sessionPath>", "Path to session conversation file").option("--project <name>", "Project name override").action(captureCommand);
 program2.command("evolve").description("Evolve skills from failures [--dry-run] [--verbose] [--max-proposals <n>]").option("--dry-run", "Show proposals without applying").option("--verbose", "Show detailed LLM interactions").option("--max-proposals <n>", "Max proposals per run", "5").action(evolveCommand);
 program2.command("generalize").description("Promote cross-skill patterns to higher layer ↑ [--dry-run] [--verbose]").option("--dry-run", "Show candidates without applying").option("--verbose", "Show detailed analysis").action(generalizeCommand);

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "helixevo",
-  "version": "0.2.7",
+  "version": "0.2.8",
   "description": "Self-evolving skill ecosystem for AI agents. Skills and projects co-evolve through multi-judge evaluation and a Pareto frontier.",
   "type": "module",
   "bin": {