selftune 0.2.23 → 0.2.25
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/README.md +93 -15
- package/apps/local-dashboard/dist/assets/index-DgY2KGP-.css +1 -0
- package/apps/local-dashboard/dist/assets/index-Dhgv5BQO.js +15 -0
- package/apps/local-dashboard/dist/assets/vendor-react-C5oyHiV1.js +11 -0
- package/apps/local-dashboard/dist/assets/{vendor-table-BIiI3YhS.js → vendor-table-Bc_bbKd8.js} +1 -1
- package/apps/local-dashboard/dist/assets/vendor-ui-B3BPIYy7.js +1 -0
- package/apps/local-dashboard/dist/index.html +5 -5
- package/cli/selftune/adapters/codex/install.ts +310 -78
- package/cli/selftune/adapters/opencode/install.ts +3 -4
- package/cli/selftune/alpha-upload/build-payloads.ts +3 -3
- package/cli/selftune/alpha-upload/stage-canonical.ts +17 -11
- package/cli/selftune/auto-update.ts +200 -8
- package/cli/selftune/canonical-export.ts +55 -25
- package/cli/selftune/command-surface.ts +397 -0
- package/cli/selftune/contribute/contribute.ts +64 -13
- package/cli/selftune/contribution-config.ts +57 -3
- package/cli/selftune/contribution-preferences.ts +117 -0
- package/cli/selftune/contribution-signals.ts +8 -4
- package/cli/selftune/contribution-staging.ts +13 -2
- package/cli/selftune/contributions.ts +55 -121
- package/cli/selftune/creator-contributions.ts +29 -10
- package/cli/selftune/cron/setup.ts +7 -3
- package/cli/selftune/dashboard-contract.ts +73 -0
- package/cli/selftune/dashboard-server.ts +168 -17
- package/cli/selftune/dashboard.ts +350 -17
- package/cli/selftune/eval/baseline.ts +21 -5
- package/cli/selftune/eval/execution-eval.ts +170 -0
- package/cli/selftune/eval/family-overlap.ts +2 -2
- package/cli/selftune/eval/hooks-to-evals.ts +228 -82
- package/cli/selftune/eval/import-skillsbench.ts +2 -2
- package/cli/selftune/eval/invocation-classifier.ts +56 -0
- package/cli/selftune/eval/synthetic-evals.ts +5 -3
- package/cli/selftune/eval/unit-test-cli.ts +7 -4
- package/cli/selftune/evolution/apply-proposal.ts +295 -0
- package/cli/selftune/evolution/engines/replay-engine.ts +79 -57
- package/cli/selftune/evolution/evolve-body.ts +100 -39
- package/cli/selftune/evolution/evolve.ts +244 -52
- package/cli/selftune/evolution/rollback.ts +0 -1
- package/cli/selftune/evolution/validate-body.ts +68 -42
- package/cli/selftune/evolution/validate-host-replay.ts +510 -60
- package/cli/selftune/evolution/validate-proposal.ts +11 -150
- package/cli/selftune/evolution/validate-routing.ts +43 -41
- package/cli/selftune/evolution/validation-contract.ts +91 -0
- package/cli/selftune/grading/auto-grade.ts +11 -7
- package/cli/selftune/grading/grade-session.ts +10 -16
- package/cli/selftune/index.ts +35 -10
- package/cli/selftune/ingestors/claude-replay.ts +15 -10
- package/cli/selftune/ingestors/codex-wrapper.ts +3 -3
- package/cli/selftune/ingestors/opencode-ingest.ts +2 -2
- package/cli/selftune/ingestors/pi-ingest.ts +3 -2
- package/cli/selftune/init.ts +27 -3
- package/cli/selftune/localdb/direct-write.ts +35 -1
- package/cli/selftune/localdb/queries/cron.ts +34 -0
- package/cli/selftune/localdb/queries/dashboard.ts +834 -0
- package/cli/selftune/localdb/queries/evolution.ts +158 -0
- package/cli/selftune/localdb/queries/execution.ts +133 -0
- package/cli/selftune/localdb/queries/json.ts +18 -0
- package/cli/selftune/localdb/queries/monitoring.ts +263 -0
- package/cli/selftune/localdb/queries/raw.ts +95 -0
- package/cli/selftune/localdb/queries/staging.ts +270 -0
- package/cli/selftune/localdb/queries/trust.ts +392 -0
- package/cli/selftune/localdb/queries.ts +60 -2288
- package/cli/selftune/localdb/schema.ts +21 -0
- package/cli/selftune/monitoring/watch.ts +96 -29
- package/cli/selftune/normalization.ts +3 -0
- package/cli/selftune/observability.ts +4 -2
- package/cli/selftune/orchestrate/cli.ts +161 -0
- package/cli/selftune/orchestrate/execute.ts +295 -0
- package/cli/selftune/orchestrate/finalize.ts +157 -0
- package/cli/selftune/orchestrate/locks.ts +40 -0
- package/cli/selftune/orchestrate/plan.ts +131 -0
- package/cli/selftune/orchestrate/post-run.ts +59 -0
- package/cli/selftune/orchestrate/prepare.ts +334 -0
- package/cli/selftune/orchestrate/report.ts +182 -0
- package/cli/selftune/orchestrate/runtime.ts +120 -0
- package/cli/selftune/orchestrate/signals.ts +48 -0
- package/cli/selftune/orchestrate.ts +150 -1173
- package/cli/selftune/repair/skill-usage.ts +5 -2
- package/cli/selftune/routes/overview.ts +5 -2
- package/cli/selftune/routes/skill-report.ts +15 -2
- package/cli/selftune/schedule.ts +5 -5
- package/cli/selftune/status.ts +39 -2
- package/cli/selftune/testing-readiness.ts +597 -0
- package/cli/selftune/types.ts +44 -4
- package/cli/selftune/uninstall.ts +2 -1
- package/cli/selftune/utils/canonical-log.ts +1 -9
- package/cli/selftune/utils/cli-error.ts +9 -0
- package/cli/selftune/utils/llm-call.ts +126 -6
- package/cli/selftune/utils/skill-discovery.ts +2 -0
- package/cli/selftune/workflows/proposals.ts +184 -0
- package/cli/selftune/workflows/skill-scaffold.ts +241 -0
- package/cli/selftune/workflows/workflows.ts +100 -26
- package/node_modules/@selftune/telemetry-contract/fixtures/complete-push.ts +1 -1
- package/node_modules/@selftune/telemetry-contract/fixtures/evidence-only-push.ts +1 -1
- package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
- package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +1 -1
- package/node_modules/@selftune/telemetry-contract/src/schemas.ts +41 -1
- package/node_modules/@selftune/telemetry-contract/src/types.ts +103 -2
- package/package.json +25 -9
- package/packages/dashboard-core/AGENTS.md +18 -0
- package/packages/dashboard-core/README.md +30 -0
- package/packages/dashboard-core/index.ts +3 -0
- package/packages/dashboard-core/package.json +39 -0
- package/packages/dashboard-core/src/chrome/DashboardChrome.tsx +74 -0
- package/packages/dashboard-core/src/chrome/DashboardHeader.tsx +200 -0
- package/packages/dashboard-core/src/chrome/DashboardSidebar.tsx +219 -0
- package/packages/dashboard-core/src/chrome/RuntimeBadge.tsx +46 -0
- package/packages/dashboard-core/src/chrome/index.ts +14 -0
- package/packages/dashboard-core/src/chrome/types.ts +81 -0
- package/packages/dashboard-core/src/chrome/utils.ts +23 -0
- package/packages/dashboard-core/src/gates/FeatureGate.tsx +11 -0
- package/packages/dashboard-core/src/gates/LockedRoute.tsx +29 -0
- package/packages/dashboard-core/src/gates/UpgradeCard.tsx +89 -0
- package/packages/dashboard-core/src/gates/index.ts +3 -0
- package/packages/dashboard-core/src/host/DashboardHostProvider.tsx +62 -0
- package/packages/dashboard-core/src/host/adapter.ts +47 -0
- package/packages/dashboard-core/src/host/capabilities.ts +55 -0
- package/packages/dashboard-core/src/host/index.ts +3 -0
- package/packages/dashboard-core/src/models/analytics.ts +39 -0
- package/packages/dashboard-core/src/models/index.ts +4 -0
- package/packages/dashboard-core/src/models/overview.ts +98 -0
- package/packages/dashboard-core/src/models/runtime.ts +7 -0
- package/packages/dashboard-core/src/models/skills.ts +34 -0
- package/packages/dashboard-core/src/routes/index.ts +2 -0
- package/packages/dashboard-core/src/routes/manifest.test.ts +70 -0
- package/packages/dashboard-core/src/routes/manifest.ts +451 -0
- package/packages/dashboard-core/src/routes/types.ts +39 -0
- package/packages/dashboard-core/src/screens/analytics/AnalyticsScreen.tsx +278 -0
- package/packages/dashboard-core/src/screens/analytics/index.ts +1 -0
- package/packages/dashboard-core/src/screens/index.ts +37 -0
- package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.test.ts +101 -0
- package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.tsx +393 -0
- package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.test.tsx +113 -0
- package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.tsx +72 -0
- package/packages/dashboard-core/src/screens/overview/OverviewCoreSurface.tsx +71 -0
- package/packages/dashboard-core/src/screens/overview/OverviewOnboardingBanner.tsx +90 -0
- package/packages/dashboard-core/src/screens/overview/OverviewRunSummary.tsx +40 -0
- package/packages/dashboard-core/src/screens/overview/index.ts +16 -0
- package/packages/dashboard-core/src/screens/overview/types.ts +13 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportDailyBreakdownSection.tsx +99 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportDataQualityTabContent.tsx +35 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceRail.tsx +71 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceSection.tsx +63 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceTabContent.tsx +25 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportInvocationsSection.tsx +24 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportMissedQueriesSection.tsx +79 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportScaffold.tsx +150 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportSections.test.tsx +224 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.test.tsx +76 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.tsx +88 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportTrendSection.tsx +33 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportTrustBadge.tsx +67 -0
- package/packages/dashboard-core/src/screens/skill-report/index.ts +45 -0
- package/packages/dashboard-core/src/screens/skills/SkillsLibraryScreen.tsx +162 -0
- package/packages/dashboard-core/src/screens/skills/index.ts +6 -0
- package/packages/telemetry-contract/fixtures/complete-push.ts +1 -1
- package/packages/telemetry-contract/fixtures/evidence-only-push.ts +1 -1
- package/packages/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
- package/packages/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +1 -1
- package/packages/telemetry-contract/src/schemas.ts +41 -1
- package/packages/telemetry-contract/src/types.ts +103 -2
- package/packages/ui/src/components/EvidenceViewer.tsx +80 -25
- package/packages/ui/src/components/OverviewPanels.tsx +67 -26
- package/packages/ui/src/primitives/tabs.tsx +7 -6
- package/packages/ui/src/types.ts +10 -0
- package/skill/SKILL.md +130 -332
- package/skill/agents/diagnosis-analyst.md +3 -3
- package/skill/agents/evolution-reviewer.md +3 -3
- package/skill/agents/integration-guide.md +3 -3
- package/skill/agents/pattern-analyst.md +2 -2
- package/skill/references/cli-quick-reference.md +89 -0
- package/skill/references/creator-playbook.md +131 -0
- package/skill/references/examples.md +48 -0
- package/skill/references/troubleshooting.md +47 -0
- package/skill/references/version-history.md +1 -1
- package/skill/selftune.contribute.json +11 -0
- package/skill/{Workflows → workflows}/Baseline.md +20 -1
- package/skill/{Workflows → workflows}/Contribute.md +23 -10
- package/skill/{Workflows → workflows}/Contributions.md +13 -5
- package/skill/workflows/CreateTestDeploy.md +170 -0
- package/skill/{Workflows → workflows}/CreatorContributions.md +18 -6
- package/skill/{Workflows → workflows}/Cron.md +1 -1
- package/skill/{Workflows → workflows}/Dashboard.md +20 -0
- package/skill/{Workflows → workflows}/Doctor.md +1 -1
- package/skill/{Workflows → workflows}/Evals.md +67 -2
- package/skill/{Workflows → workflows}/Evolve.md +119 -30
- package/skill/{Workflows → workflows}/EvolveBody.md +41 -1
- package/skill/{Workflows → workflows}/Grade.md +1 -1
- package/skill/{Workflows → workflows}/Initialize.md +8 -4
- package/skill/{Workflows → workflows}/Orchestrate.md +13 -3
- package/skill/{Workflows → workflows}/Schedule.md +3 -3
- package/skill/workflows/SignalsDashboard.md +87 -0
- package/skill/{Workflows → workflows}/UnitTest.md +19 -0
- package/skill/{Workflows → workflows}/Watch.md +42 -2
- package/skill/{Workflows → workflows}/Workflows.md +39 -2
- package/apps/local-dashboard/dist/assets/index-CwOtTrUS.css +0 -1
- package/apps/local-dashboard/dist/assets/index-f1HQpbeH.js +0 -59
- package/apps/local-dashboard/dist/assets/vendor-react-CKkiCskZ.js +0 -11
- package/apps/local-dashboard/dist/assets/vendor-ui-jVSaIZey.js +0 -12
- /package/skill/{Workflows → workflows}/AlphaUpload.md +0 -0
- /package/skill/{Workflows → workflows}/AutoActivation.md +0 -0
- /package/skill/{Workflows → workflows}/Badge.md +0 -0
- /package/skill/{Workflows → workflows}/Composability.md +0 -0
- /package/skill/{Workflows → workflows}/EvolutionMemory.md +0 -0
- /package/skill/{Workflows → workflows}/ExportCanonical.md +0 -0
- /package/skill/{Workflows → workflows}/Hook.md +0 -0
- /package/skill/{Workflows → workflows}/ImportSkillsBench.md +0 -0
- /package/skill/{Workflows → workflows}/Ingest.md +0 -0
- /package/skill/{Workflows → workflows}/PlatformHooks.md +0 -0
- /package/skill/{Workflows → workflows}/Quickstart.md +0 -0
- /package/skill/{Workflows → workflows}/Recover.md +0 -0
- /package/skill/{Workflows → workflows}/Registry.md +0 -0
- /package/skill/{Workflows → workflows}/RepairSkillUsage.md +0 -0
- /package/skill/{Workflows → workflows}/Replay.md +0 -0
- /package/skill/{Workflows → workflows}/Rollback.md +0 -0
- /package/skill/{Workflows → workflows}/Sync.md +0 -0
- /package/skill/{Workflows → workflows}/Telemetry.md +0 -0
- /package/skill/{Workflows → workflows}/Uninstall.md +0 -0
|
@@ -10,9 +10,10 @@
|
|
|
10
10
|
|
|
11
11
|
import { parseArgs } from "node:util";
|
|
12
12
|
|
|
13
|
+
import { writeGradingBaseline } from "../localdb/direct-write.js";
|
|
13
14
|
import type { BaselineResult, EvalEntry } from "../types.js";
|
|
14
15
|
import { CLIError, handleCLIError } from "../utils/cli-error.js";
|
|
15
|
-
import { callLlm } from "../utils/llm-call.js";
|
|
16
|
+
import { callLlm, detectLlmAgent } from "../utils/llm-call.js";
|
|
16
17
|
import { buildTriggerCheckPrompt, parseTriggerResponse } from "../utils/trigger-check.js";
|
|
17
18
|
|
|
18
19
|
// ---------------------------------------------------------------------------
|
|
@@ -208,7 +209,6 @@ Options:
|
|
|
208
209
|
}
|
|
209
210
|
|
|
210
211
|
// Detect agent
|
|
211
|
-
const { detectAgent } = await import("../utils/llm-call.js");
|
|
212
212
|
const requestedAgent = values.agent;
|
|
213
213
|
if (requestedAgent && !Bun.which(requestedAgent)) {
|
|
214
214
|
throw new CLIError(
|
|
@@ -217,12 +217,12 @@ Options:
|
|
|
217
217
|
"Install it or omit --agent to use auto-detection",
|
|
218
218
|
);
|
|
219
219
|
}
|
|
220
|
-
const agent = requestedAgent ??
|
|
220
|
+
const agent = requestedAgent ?? detectLlmAgent();
|
|
221
221
|
if (!agent) {
|
|
222
222
|
throw new CLIError(
|
|
223
|
-
"No agent CLI (claude/codex/opencode) found in PATH",
|
|
223
|
+
"No agent CLI (claude/codex/opencode/pi) found in PATH",
|
|
224
224
|
"AGENT_NOT_FOUND",
|
|
225
|
-
"Install Claude Code, Codex, or
|
|
225
|
+
"Install Claude Code, Codex, OpenCode, or Pi",
|
|
226
226
|
);
|
|
227
227
|
}
|
|
228
228
|
|
|
@@ -233,6 +233,22 @@ Options:
|
|
|
233
233
|
agent,
|
|
234
234
|
});
|
|
235
235
|
|
|
236
|
+
writeGradingBaseline({
|
|
237
|
+
skill_name: values.skill,
|
|
238
|
+
proposal_id: null,
|
|
239
|
+
measured_at: result.measured_at,
|
|
240
|
+
pass_rate: result.with_skill_pass_rate,
|
|
241
|
+
mean_score: null,
|
|
242
|
+
sample_size: evalSet.length,
|
|
243
|
+
grading_results_json: JSON.stringify({
|
|
244
|
+
baseline_pass_rate: result.baseline_pass_rate,
|
|
245
|
+
with_skill_pass_rate: result.with_skill_pass_rate,
|
|
246
|
+
lift: result.lift,
|
|
247
|
+
adds_value: result.adds_value,
|
|
248
|
+
per_entry: result.per_entry,
|
|
249
|
+
}),
|
|
250
|
+
});
|
|
251
|
+
|
|
236
252
|
console.log(JSON.stringify(result, null, 2));
|
|
237
253
|
process.exit(result.adds_value ? 0 : 1);
|
|
238
254
|
}
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* execution-eval.ts
|
|
3
|
+
*
|
|
4
|
+
* Experimental execution eval harness — runs assertion-based evals
|
|
5
|
+
* in a staged skill workspace. Phase 2 of eval system gap closure.
|
|
6
|
+
*
|
|
7
|
+
* Behind experimental flag: must be explicitly opted into.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import { copyFileSync, existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync } from "node:fs";
|
|
11
|
+
import { tmpdir } from "node:os";
|
|
12
|
+
import { join } from "node:path";
|
|
13
|
+
import type { ExecutionAssertion, ExecutionEvalEntry } from "../types.js";
|
|
14
|
+
|
|
15
|
+
// ---------------------------------------------------------------------------
|
|
16
|
+
// Result types
|
|
17
|
+
// ---------------------------------------------------------------------------
|
|
18
|
+
|
|
19
|
+
/**
 * Outcome of running a single execution eval entry.
 */
export interface ExecutionEvalResult {
  /** The eval entry this result belongs to. */
  entry: ExecutionEvalEntry;

  /** True only when every assertion of the entry passed. */
  passed: boolean;

  /** One result per assertion, in the same order as the entry's assertions. */
  assertion_results: AssertionResult[];

  /** Path of the staged workspace, present only when one was created for the entry. */
  workspace_path?: string;

  /** Time spent evaluating the entry, in milliseconds. */
  elapsed_ms: number;
}
|
|
26
|
+
|
|
27
|
+
/**
 * Outcome of checking a single execution assertion.
 */
export interface AssertionResult {
  /** The assertion that was checked. */
  assertion: ExecutionAssertion;

  /** Whether the assertion held (after applying any negation). */
  passed: boolean;

  /** Short description of what was observed, when the check could be carried out. */
  actual?: string;

  /** Why the assertion could not be evaluated or is not supported. */
  error?: string;
}
|
|
33
|
+
|
|
34
|
+
// ---------------------------------------------------------------------------
|
|
35
|
+
// Staged workspace management
|
|
36
|
+
// ---------------------------------------------------------------------------
|
|
37
|
+
|
|
38
|
+
/**
 * Create a staged workspace for execution eval isolation.
 *
 * A fresh temp directory (prefix `selftune-exec-eval-`) is created with a
 * `skill/` subdirectory. When `skillPath` exists it is copied to
 * `skill/SKILL.md`, so assertions run against the copy rather than the real
 * skill file. A non-existent `skillPath` yields a workspace with an empty
 * `skill/` directory.
 *
 * @param skillPath - Path of the skill file to stage.
 * @returns Path of the newly created workspace root.
 * @throws Any filesystem error raised while staging. The partially built
 *   workspace is removed before the error propagates.
 */
export function createStagedWorkspace(skillPath: string): string {
  const workspace = mkdtempSync(join(tmpdir(), "selftune-exec-eval-"));

  try {
    const skillDir = join(workspace, "skill");
    mkdirSync(skillDir, { recursive: true });

    if (existsSync(skillPath)) {
      copyFileSync(skillPath, join(skillDir, "SKILL.md"));
    }
  } catch (err) {
    // The caller never receives the path when staging fails, so this is the
    // only place the temp directory can be removed.
    try {
      rmSync(workspace, { recursive: true, force: true });
    } catch {
      // Keep the original staging error; a failed cleanup must not mask it.
    }
    throw err;
  }

  return workspace;
}
|
|
53
|
+
|
|
54
|
+
/**
 * Remove a staged workspace directory together with everything inside it.
 *
 * Removal is best-effort: any error raised while deleting is ignored, so this
 * function never throws.
 *
 * @param workspacePath - Root directory of the workspace to delete.
 */
export function cleanupStagedWorkspace(workspacePath: string): void {
  const removal = { recursive: true, force: true };

  try {
    rmSync(workspacePath, removal);
  } catch {
    // Intentionally ignored — a leftover temp directory is non-fatal.
  }
}
|
|
64
|
+
|
|
65
|
+
// ---------------------------------------------------------------------------
|
|
66
|
+
// Assertion runner
|
|
67
|
+
// ---------------------------------------------------------------------------
|
|
68
|
+
|
|
69
|
+
/**
 * Run a single execution assertion against a workspace.
 *
 * Supported assertion types:
 * - `file_exists`: passes when `target`, joined onto `workspacePath`, exists.
 * - `file_contains`: passes when the target file's UTF-8 content matches the
 *   regular expression given in `expected`. A missing target file counts as
 *   "does not contain". A missing or empty `expected` is reported as an error,
 *   because an empty pattern matches any content and would make the assertion
 *   pass (or, negated, fail) regardless of the file.
 * - `command_output`, `skill_triggered`, `custom`: not implemented yet; these
 *   always fail with an explanatory `error`.
 *
 * `negated` inverts the outcome of `file_exists` and `file_contains`.
 *
 * NOTE(review): `target` is joined without normalisation checks, so `..`
 * segments can resolve outside `workspacePath` — confirm targets are trusted.
 *
 * @param assertion - The assertion to evaluate.
 * @param workspacePath - Directory that `assertion.target` is resolved against.
 * @returns The assertion outcome. This function does not throw: any exception
 *   (unreadable file, invalid regular expression, ...) becomes a failed result
 *   carrying the stringified error.
 */
export function runAssertion(
  assertion: ExecutionAssertion,
  workspacePath: string,
): AssertionResult {
  try {
    switch (assertion.type) {
      case "file_exists": {
        const target = join(workspacePath, assertion.target);
        const exists = existsSync(target);
        const passed = assertion.negated ? !exists : exists;
        return { assertion, passed, actual: exists ? "exists" : "not found" };
      }

      case "file_contains": {
        // Reject a pattern-less assertion up front: `new RegExp("")` matches
        // everything, so the check would not depend on the file at all.
        if (assertion.expected == null || assertion.expected === "") {
          return {
            assertion,
            passed: false,
            error: 'Assertion type "file_contains" requires a non-empty "expected" pattern',
          };
        }

        const target = join(workspacePath, assertion.target);
        if (!existsSync(target)) {
          // A missing file contains nothing, which satisfies a negated check.
          return { assertion, passed: !!assertion.negated, actual: "file not found" };
        }

        const content = readFileSync(target, "utf-8");
        const pattern = new RegExp(assertion.expected);
        const matches = pattern.test(content);
        const passed = assertion.negated ? !matches : matches;
        return { assertion, passed, actual: matches ? "matched" : "no match" };
      }

      case "command_output":
      case "skill_triggered":
      case "custom": {
        return {
          assertion,
          passed: false,
          error: `Assertion type "${assertion.type}" not yet implemented`,
        };
      }

      default:
        return { assertion, passed: false, error: "Unknown assertion type" };
    }
  } catch (err) {
    return { assertion, passed: false, error: String(err) };
  }
}
|
|
114
|
+
|
|
115
|
+
// ---------------------------------------------------------------------------
|
|
116
|
+
// Execution eval runner
|
|
117
|
+
// ---------------------------------------------------------------------------
|
|
118
|
+
|
|
119
|
+
/**
 * Evaluate one entry: stage a workspace when the entry asks for one, run each
 * of its assertions in order, and always remove the workspace afterwards.
 */
function evaluateEntry(entry: ExecutionEvalEntry, skillPath: string): ExecutionEvalResult {
  const startedAt = Date.now();
  let stagedPath: string | undefined;

  try {
    if (entry.requires_workspace) {
      stagedPath = createStagedWorkspace(skillPath);
    }

    // Without a staged workspace, assertion targets resolve against the cwd.
    const root = stagedPath ?? ".";
    const assertionResults: AssertionResult[] = [];
    for (const assertion of entry.assertions) {
      assertionResults.push(runAssertion(assertion, root));
    }

    return {
      entry,
      passed: !assertionResults.some((outcome) => !outcome.passed),
      assertion_results: assertionResults,
      workspace_path: stagedPath,
      elapsed_ms: Date.now() - startedAt,
    };
  } finally {
    if (stagedPath) {
      cleanupStagedWorkspace(stagedPath);
    }
  }
}

/**
 * Run execution evals for a list of entries, one after another.
 *
 * Each entry passes only when all of its assertions pass (an entry with no
 * assertions therefore passes). When `options.gateDeployment` is set, the gate
 * passes only if no entry failed; otherwise the gate always passes.
 *
 * Staged workspaces are removed before this function returns, so any
 * `workspace_path` in the results refers to a directory that no longer exists.
 *
 * NOTE(review): the harness is described as experimental / opt-in, but this
 * function does not inspect any experimental flag on the entries — confirm the
 * opt-in is enforced by the caller.
 *
 * @param entries - Eval entries to run.
 * @param skillPath - Skill file staged into each entry's workspace when the
 *   entry sets `requires_workspace`.
 * @param options - `gateDeployment`: make `gate_passed` depend on failures.
 * @returns Per-entry results, the gate verdict, and pass/fail totals.
 */
export async function runExecutionEvals(
  entries: ExecutionEvalEntry[],
  skillPath: string,
  options?: { gateDeployment?: boolean },
): Promise<{
  results: ExecutionEvalResult[];
  gate_passed: boolean;
  summary: { total: number; passed: number; failed: number };
}> {
  const results: ExecutionEvalResult[] = [];
  let passed = 0;

  for (const entry of entries) {
    const outcome = evaluateEntry(entry, skillPath);
    results.push(outcome);
    if (outcome.passed) {
      passed += 1;
    }
  }

  const failed = results.length - passed;
  const blocked = Boolean(options?.gateDeployment) && failed > 0;

  return {
    results,
    gate_passed: !blocked,
    summary: { total: results.length, passed, failed },
  };
}
|
|
@@ -438,7 +438,7 @@ function buildRefactorProposal(
|
|
|
438
438
|
return {
|
|
439
439
|
workflow_name: workflowName,
|
|
440
440
|
source_skill: skillName,
|
|
441
|
-
suggested_path: `
|
|
441
|
+
suggested_path: `workflows/${workflowName}.md`,
|
|
442
442
|
};
|
|
443
443
|
});
|
|
444
444
|
|
|
@@ -453,7 +453,7 @@ function buildRefactorProposal(
|
|
|
453
453
|
migration_notes: [
|
|
454
454
|
`Create a parent skill \`${parentSkillName}\` whose SKILL.md routes into internal workflows instead of exposing each family member as a primary top-level trigger surface.`,
|
|
455
455
|
"Keep the existing sibling skills as thin compatibility aliases for at least one release cycle while usage shifts to the parent skill.",
|
|
456
|
-
"Move execution-specific instructions into internal
|
|
456
|
+
"Move execution-specific instructions into internal workflows/ or references/ files so the parent SKILL.md stays focused on routing and progressive disclosure.",
|
|
457
457
|
"Use the compatibility aliases to measure whether trigger quality improves before removing the old skill entry points.",
|
|
458
458
|
],
|
|
459
459
|
};
|