selftune 0.2.23 → 0.2.25
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/README.md +93 -15
- package/apps/local-dashboard/dist/assets/index-DgY2KGP-.css +1 -0
- package/apps/local-dashboard/dist/assets/index-Dhgv5BQO.js +15 -0
- package/apps/local-dashboard/dist/assets/vendor-react-C5oyHiV1.js +11 -0
- package/apps/local-dashboard/dist/assets/{vendor-table-BIiI3YhS.js → vendor-table-Bc_bbKd8.js} +1 -1
- package/apps/local-dashboard/dist/assets/vendor-ui-B3BPIYy7.js +1 -0
- package/apps/local-dashboard/dist/index.html +5 -5
- package/cli/selftune/adapters/codex/install.ts +310 -78
- package/cli/selftune/adapters/opencode/install.ts +3 -4
- package/cli/selftune/alpha-upload/build-payloads.ts +3 -3
- package/cli/selftune/alpha-upload/stage-canonical.ts +17 -11
- package/cli/selftune/auto-update.ts +200 -8
- package/cli/selftune/canonical-export.ts +55 -25
- package/cli/selftune/command-surface.ts +397 -0
- package/cli/selftune/contribute/contribute.ts +64 -13
- package/cli/selftune/contribution-config.ts +57 -3
- package/cli/selftune/contribution-preferences.ts +117 -0
- package/cli/selftune/contribution-signals.ts +8 -4
- package/cli/selftune/contribution-staging.ts +13 -2
- package/cli/selftune/contributions.ts +55 -121
- package/cli/selftune/creator-contributions.ts +29 -10
- package/cli/selftune/cron/setup.ts +7 -3
- package/cli/selftune/dashboard-contract.ts +73 -0
- package/cli/selftune/dashboard-server.ts +168 -17
- package/cli/selftune/dashboard.ts +350 -17
- package/cli/selftune/eval/baseline.ts +21 -5
- package/cli/selftune/eval/execution-eval.ts +170 -0
- package/cli/selftune/eval/family-overlap.ts +2 -2
- package/cli/selftune/eval/hooks-to-evals.ts +228 -82
- package/cli/selftune/eval/import-skillsbench.ts +2 -2
- package/cli/selftune/eval/invocation-classifier.ts +56 -0
- package/cli/selftune/eval/synthetic-evals.ts +5 -3
- package/cli/selftune/eval/unit-test-cli.ts +7 -4
- package/cli/selftune/evolution/apply-proposal.ts +295 -0
- package/cli/selftune/evolution/engines/replay-engine.ts +79 -57
- package/cli/selftune/evolution/evolve-body.ts +100 -39
- package/cli/selftune/evolution/evolve.ts +244 -52
- package/cli/selftune/evolution/rollback.ts +0 -1
- package/cli/selftune/evolution/validate-body.ts +68 -42
- package/cli/selftune/evolution/validate-host-replay.ts +510 -60
- package/cli/selftune/evolution/validate-proposal.ts +11 -150
- package/cli/selftune/evolution/validate-routing.ts +43 -41
- package/cli/selftune/evolution/validation-contract.ts +91 -0
- package/cli/selftune/grading/auto-grade.ts +11 -7
- package/cli/selftune/grading/grade-session.ts +10 -16
- package/cli/selftune/index.ts +35 -10
- package/cli/selftune/ingestors/claude-replay.ts +15 -10
- package/cli/selftune/ingestors/codex-wrapper.ts +3 -3
- package/cli/selftune/ingestors/opencode-ingest.ts +2 -2
- package/cli/selftune/ingestors/pi-ingest.ts +3 -2
- package/cli/selftune/init.ts +27 -3
- package/cli/selftune/localdb/direct-write.ts +35 -1
- package/cli/selftune/localdb/queries/cron.ts +34 -0
- package/cli/selftune/localdb/queries/dashboard.ts +834 -0
- package/cli/selftune/localdb/queries/evolution.ts +158 -0
- package/cli/selftune/localdb/queries/execution.ts +133 -0
- package/cli/selftune/localdb/queries/json.ts +18 -0
- package/cli/selftune/localdb/queries/monitoring.ts +263 -0
- package/cli/selftune/localdb/queries/raw.ts +95 -0
- package/cli/selftune/localdb/queries/staging.ts +270 -0
- package/cli/selftune/localdb/queries/trust.ts +392 -0
- package/cli/selftune/localdb/queries.ts +60 -2288
- package/cli/selftune/localdb/schema.ts +21 -0
- package/cli/selftune/monitoring/watch.ts +96 -29
- package/cli/selftune/normalization.ts +3 -0
- package/cli/selftune/observability.ts +4 -2
- package/cli/selftune/orchestrate/cli.ts +161 -0
- package/cli/selftune/orchestrate/execute.ts +295 -0
- package/cli/selftune/orchestrate/finalize.ts +157 -0
- package/cli/selftune/orchestrate/locks.ts +40 -0
- package/cli/selftune/orchestrate/plan.ts +131 -0
- package/cli/selftune/orchestrate/post-run.ts +59 -0
- package/cli/selftune/orchestrate/prepare.ts +334 -0
- package/cli/selftune/orchestrate/report.ts +182 -0
- package/cli/selftune/orchestrate/runtime.ts +120 -0
- package/cli/selftune/orchestrate/signals.ts +48 -0
- package/cli/selftune/orchestrate.ts +150 -1173
- package/cli/selftune/repair/skill-usage.ts +5 -2
- package/cli/selftune/routes/overview.ts +5 -2
- package/cli/selftune/routes/skill-report.ts +15 -2
- package/cli/selftune/schedule.ts +5 -5
- package/cli/selftune/status.ts +39 -2
- package/cli/selftune/testing-readiness.ts +597 -0
- package/cli/selftune/types.ts +44 -4
- package/cli/selftune/uninstall.ts +2 -1
- package/cli/selftune/utils/canonical-log.ts +1 -9
- package/cli/selftune/utils/cli-error.ts +9 -0
- package/cli/selftune/utils/llm-call.ts +126 -6
- package/cli/selftune/utils/skill-discovery.ts +2 -0
- package/cli/selftune/workflows/proposals.ts +184 -0
- package/cli/selftune/workflows/skill-scaffold.ts +241 -0
- package/cli/selftune/workflows/workflows.ts +100 -26
- package/node_modules/@selftune/telemetry-contract/fixtures/complete-push.ts +1 -1
- package/node_modules/@selftune/telemetry-contract/fixtures/evidence-only-push.ts +1 -1
- package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
- package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +1 -1
- package/node_modules/@selftune/telemetry-contract/src/schemas.ts +41 -1
- package/node_modules/@selftune/telemetry-contract/src/types.ts +103 -2
- package/package.json +25 -9
- package/packages/dashboard-core/AGENTS.md +18 -0
- package/packages/dashboard-core/README.md +30 -0
- package/packages/dashboard-core/index.ts +3 -0
- package/packages/dashboard-core/package.json +39 -0
- package/packages/dashboard-core/src/chrome/DashboardChrome.tsx +74 -0
- package/packages/dashboard-core/src/chrome/DashboardHeader.tsx +200 -0
- package/packages/dashboard-core/src/chrome/DashboardSidebar.tsx +219 -0
- package/packages/dashboard-core/src/chrome/RuntimeBadge.tsx +46 -0
- package/packages/dashboard-core/src/chrome/index.ts +14 -0
- package/packages/dashboard-core/src/chrome/types.ts +81 -0
- package/packages/dashboard-core/src/chrome/utils.ts +23 -0
- package/packages/dashboard-core/src/gates/FeatureGate.tsx +11 -0
- package/packages/dashboard-core/src/gates/LockedRoute.tsx +29 -0
- package/packages/dashboard-core/src/gates/UpgradeCard.tsx +89 -0
- package/packages/dashboard-core/src/gates/index.ts +3 -0
- package/packages/dashboard-core/src/host/DashboardHostProvider.tsx +62 -0
- package/packages/dashboard-core/src/host/adapter.ts +47 -0
- package/packages/dashboard-core/src/host/capabilities.ts +55 -0
- package/packages/dashboard-core/src/host/index.ts +3 -0
- package/packages/dashboard-core/src/models/analytics.ts +39 -0
- package/packages/dashboard-core/src/models/index.ts +4 -0
- package/packages/dashboard-core/src/models/overview.ts +98 -0
- package/packages/dashboard-core/src/models/runtime.ts +7 -0
- package/packages/dashboard-core/src/models/skills.ts +34 -0
- package/packages/dashboard-core/src/routes/index.ts +2 -0
- package/packages/dashboard-core/src/routes/manifest.test.ts +70 -0
- package/packages/dashboard-core/src/routes/manifest.ts +451 -0
- package/packages/dashboard-core/src/routes/types.ts +39 -0
- package/packages/dashboard-core/src/screens/analytics/AnalyticsScreen.tsx +278 -0
- package/packages/dashboard-core/src/screens/analytics/index.ts +1 -0
- package/packages/dashboard-core/src/screens/index.ts +37 -0
- package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.test.ts +101 -0
- package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.tsx +393 -0
- package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.test.tsx +113 -0
- package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.tsx +72 -0
- package/packages/dashboard-core/src/screens/overview/OverviewCoreSurface.tsx +71 -0
- package/packages/dashboard-core/src/screens/overview/OverviewOnboardingBanner.tsx +90 -0
- package/packages/dashboard-core/src/screens/overview/OverviewRunSummary.tsx +40 -0
- package/packages/dashboard-core/src/screens/overview/index.ts +16 -0
- package/packages/dashboard-core/src/screens/overview/types.ts +13 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportDailyBreakdownSection.tsx +99 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportDataQualityTabContent.tsx +35 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceRail.tsx +71 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceSection.tsx +63 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceTabContent.tsx +25 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportInvocationsSection.tsx +24 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportMissedQueriesSection.tsx +79 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportScaffold.tsx +150 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportSections.test.tsx +224 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.test.tsx +76 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.tsx +88 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportTrendSection.tsx +33 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportTrustBadge.tsx +67 -0
- package/packages/dashboard-core/src/screens/skill-report/index.ts +45 -0
- package/packages/dashboard-core/src/screens/skills/SkillsLibraryScreen.tsx +162 -0
- package/packages/dashboard-core/src/screens/skills/index.ts +6 -0
- package/packages/telemetry-contract/fixtures/complete-push.ts +1 -1
- package/packages/telemetry-contract/fixtures/evidence-only-push.ts +1 -1
- package/packages/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
- package/packages/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +1 -1
- package/packages/telemetry-contract/src/schemas.ts +41 -1
- package/packages/telemetry-contract/src/types.ts +103 -2
- package/packages/ui/src/components/EvidenceViewer.tsx +80 -25
- package/packages/ui/src/components/OverviewPanels.tsx +67 -26
- package/packages/ui/src/primitives/tabs.tsx +7 -6
- package/packages/ui/src/types.ts +10 -0
- package/skill/SKILL.md +130 -332
- package/skill/agents/diagnosis-analyst.md +3 -3
- package/skill/agents/evolution-reviewer.md +3 -3
- package/skill/agents/integration-guide.md +3 -3
- package/skill/agents/pattern-analyst.md +2 -2
- package/skill/references/cli-quick-reference.md +89 -0
- package/skill/references/creator-playbook.md +131 -0
- package/skill/references/examples.md +48 -0
- package/skill/references/troubleshooting.md +47 -0
- package/skill/references/version-history.md +1 -1
- package/skill/selftune.contribute.json +11 -0
- package/skill/{Workflows → workflows}/Baseline.md +20 -1
- package/skill/{Workflows → workflows}/Contribute.md +23 -10
- package/skill/{Workflows → workflows}/Contributions.md +13 -5
- package/skill/workflows/CreateTestDeploy.md +170 -0
- package/skill/{Workflows → workflows}/CreatorContributions.md +18 -6
- package/skill/{Workflows → workflows}/Cron.md +1 -1
- package/skill/{Workflows → workflows}/Dashboard.md +20 -0
- package/skill/{Workflows → workflows}/Doctor.md +1 -1
- package/skill/{Workflows → workflows}/Evals.md +67 -2
- package/skill/{Workflows → workflows}/Evolve.md +119 -30
- package/skill/{Workflows → workflows}/EvolveBody.md +41 -1
- package/skill/{Workflows → workflows}/Grade.md +1 -1
- package/skill/{Workflows → workflows}/Initialize.md +8 -4
- package/skill/{Workflows → workflows}/Orchestrate.md +13 -3
- package/skill/{Workflows → workflows}/Schedule.md +3 -3
- package/skill/workflows/SignalsDashboard.md +87 -0
- package/skill/{Workflows → workflows}/UnitTest.md +19 -0
- package/skill/{Workflows → workflows}/Watch.md +42 -2
- package/skill/{Workflows → workflows}/Workflows.md +39 -2
- package/apps/local-dashboard/dist/assets/index-CwOtTrUS.css +0 -1
- package/apps/local-dashboard/dist/assets/index-f1HQpbeH.js +0 -59
- package/apps/local-dashboard/dist/assets/vendor-react-CKkiCskZ.js +0 -11
- package/apps/local-dashboard/dist/assets/vendor-ui-jVSaIZey.js +0 -12
- /package/skill/{Workflows → workflows}/AlphaUpload.md +0 -0
- /package/skill/{Workflows → workflows}/AutoActivation.md +0 -0
- /package/skill/{Workflows → workflows}/Badge.md +0 -0
- /package/skill/{Workflows → workflows}/Composability.md +0 -0
- /package/skill/{Workflows → workflows}/EvolutionMemory.md +0 -0
- /package/skill/{Workflows → workflows}/ExportCanonical.md +0 -0
- /package/skill/{Workflows → workflows}/Hook.md +0 -0
- /package/skill/{Workflows → workflows}/ImportSkillsBench.md +0 -0
- /package/skill/{Workflows → workflows}/Ingest.md +0 -0
- /package/skill/{Workflows → workflows}/PlatformHooks.md +0 -0
- /package/skill/{Workflows → workflows}/Quickstart.md +0 -0
- /package/skill/{Workflows → workflows}/Recover.md +0 -0
- /package/skill/{Workflows → workflows}/Registry.md +0 -0
- /package/skill/{Workflows → workflows}/RepairSkillUsage.md +0 -0
- /package/skill/{Workflows → workflows}/Replay.md +0 -0
- /package/skill/{Workflows → workflows}/Rollback.md +0 -0
- /package/skill/{Workflows → workflows}/Sync.md +0 -0
- /package/skill/{Workflows → workflows}/Telemetry.md +0 -0
- /package/skill/{Workflows → workflows}/Uninstall.md +0 -0
|
@@ -14,14 +14,20 @@ import { basename, dirname, isAbsolute, join } from "node:path";
|
|
|
14
14
|
|
|
15
15
|
import type { EvalEntry, RoutingReplayEntryResult, RoutingReplayFixture } from "../types.js";
|
|
16
16
|
import { parseFrontmatter } from "../utils/frontmatter.js";
|
|
17
|
-
import {
|
|
18
|
-
|
|
17
|
+
import {
|
|
18
|
+
containsWholeSkillMention,
|
|
19
|
+
extractExplicitSkillMentions,
|
|
20
|
+
extractSkillNamesFromPathReferences,
|
|
21
|
+
findGitRepositoryRoot,
|
|
22
|
+
} from "../utils/skill-discovery.js";
|
|
19
23
|
import {
|
|
20
24
|
extractWhenToUseLines,
|
|
21
25
|
jaccardSimilarity,
|
|
22
26
|
tokenizeText,
|
|
23
27
|
} from "../utils/text-similarity.js";
|
|
24
|
-
import { replaceSection } from "./deploy-proposal.js";
|
|
28
|
+
import { replaceBody, replaceSection } from "./deploy-proposal.js";
|
|
29
|
+
import { replaceDescription } from "../utils/frontmatter.js";
|
|
30
|
+
import type { ReplayValidationOptions } from "./engines/replay-engine.js";
|
|
25
31
|
|
|
26
32
|
interface ReplaySkillSurface {
|
|
27
33
|
skillName: string;
|
|
@@ -31,29 +37,34 @@ interface ReplaySkillSurface {
|
|
|
31
37
|
|
|
32
38
|
/**
 * Locations inside the throwaway workspace that a runtime replay runs against.
 *
 * The workspace is created by `buildRuntimeReplayWorkspace` and removed again by
 * `cleanupRuntimeReplayWorkspace`, which deletes `rootDir` recursively.
 */
interface ReplayWorkspace {
  /** Temporary directory that holds the whole staged workspace. */
  rootDir: string;
  /** Platform-specific skill registry directory located under `rootDir`. */
  skillRegistryDir: string;
  /** Path of the staged copy of the skill under evaluation. */
  targetSkillPath: string;
  /** Paths of the staged copies of the competing skills. */
  competingSkillPaths: Array<string>;
}
|
|
37
44
|
|
|
38
|
-
export
|
|
45
|
+
/**
 * Which part of the target skill file a replayed piece of content replaces:
 * the "Workflow Routing" section (`"routing"`), the description
 * (`"description"`), or the body (`"body"`).
 */
export type RuntimeReplayContentTarget =
  | "routing"
  | "description"
  | "body";
|
|
46
|
+
|
|
47
|
+
/**
 * Everything a {@link RuntimeReplayInvoker} receives for a single replayed query.
 */
export interface RuntimeReplayInvokerInput {
  /** The query text to replay against the host runtime. */
  query: string;
  /** Host platform the replay targets. */
  platform: RoutingReplayFixture["platform"];
  /** Root directory of the replay workspace. */
  workspaceRoot: string;
  /** Skill registry directory used for this replay. */
  skillRegistryDir: string;
  /** Name of the skill under evaluation. */
  targetSkillName: string;
  /** Path of the target skill's file (presumably its SKILL.md; confirm with caller). */
  targetSkillPath: string;
  /**
   * Paths of the competing skills' files; the name of each path's parent
   * directory is treated as that skill's name.
   */
  competingSkillPaths: Array<string>;
}
|
|
45
56
|
|
|
46
|
-
export interface
|
|
47
|
-
|
|
57
|
+
/**
 * What a parser extracted from the raw output of one runtime replay.
 */
export interface RuntimeReplayObservation {
  /** De-duplicated names of the skills the output shows were selected. */
  triggeredSkillNames: Array<string>;
  /** De-duplicated skill file paths that the output shows were read or referenced. */
  readSkillPaths: Array<string>;
  /** The unparsed runtime output exactly as it was handed to the parser. */
  rawOutput: string;
  /** Session or thread identifier reported by the runtime, when one was found. */
  sessionId?: string;
  /** Error message extracted from the output, when one was found. */
  runtimeError?: string;
}
|
|
53
64
|
|
|
54
|
-
export type
|
|
55
|
-
input:
|
|
56
|
-
) => Promise<
|
|
65
|
+
/**
 * Asynchronous function that replays one query and resolves with what was
 * observed in the runtime's output.
 */
export type RuntimeReplayInvoker = (
  input: RuntimeReplayInvokerInput,
) => Promise<RuntimeReplayObservation>;
|
|
57
68
|
|
|
58
69
|
/**
|
|
59
70
|
* Minimum score needed before replay treats routing text or skill-surface overlap
|
|
@@ -64,6 +75,13 @@ const HOST_REPLAY_MATCH_THRESHOLD = 0.18;
|
|
|
64
75
|
const CLAUDE_RUNTIME_REPLAY_TIMEOUT_MS = 30_000;
|
|
65
76
|
const CLAUDE_RUNTIME_ROUTING_PROMPT =
|
|
66
77
|
"You are being evaluated only on skill routing. Do not solve the user's task. If a local project skill is relevant, invoke exactly one skill immediately. If no local project skill fits, respond with NO_SKILL and do not browse unrelated files.";
|
|
78
|
+
/**
 * Timeout, in milliseconds, for a host runtime replay.
 * NOTE(review): the call site is not visible in this chunk; presumably this
 * bounds the non-Claude host invocations. Confirm against the invokers.
 */
const HOST_RUNTIME_REPLAY_TIMEOUT_MS = 45 * 1_000;

/**
 * Routing-only instruction text for hosts without a dedicated prompt: the agent
 * must not solve the task, should open only the single relevant skill's
 * SKILL.md, and must answer NO_SKILL when nothing fits.
 */
const GENERIC_RUNTIME_ROUTING_PROMPT =
  "You are being evaluated only on local skill routing. " +
  "Do not solve the user's task. " +
  "If exactly one local project skill is relevant, open only that skill's SKILL.md immediately and stop after selecting it. " +
  "If no local project skill fits, reply with NO_SKILL and do not browse unrelated files.";
|
|
67
85
|
|
|
68
86
|
function resolveReplayPath(path: string): string {
|
|
69
87
|
try {
|
|
@@ -105,6 +123,26 @@ function listCompetingSkillPaths(targetSkillPath: string): string[] {
|
|
|
105
123
|
return competingPaths.sort((a, b) => a.localeCompare(b));
|
|
106
124
|
}
|
|
107
125
|
|
|
126
|
+
/**
 * Map a replay platform to the workspace-relative directory that holds its
 * project-local skills.
 *
 * @param platform - Host platform the replay workspace is built for.
 * @returns The relative registry directory: `.claude/skills` for `claude_code`,
 *   `.agents/skills` for `codex`, `.opencode/skills` for `opencode`.
 * @throws {Error} When `platform` is not one of the supported values. Without
 *   this guard an unexpected value would make the function return `undefined`
 *   at runtime and fail later with a far less descriptive path error.
 */
function getRuntimeReplayRegistryRelativeDir(platform: RoutingReplayFixture["platform"]): string {
  switch (platform) {
    case "claude_code":
      return join(".claude", "skills");
    case "codex":
      return join(".agents", "skills");
    case "opencode":
      return join(".opencode", "skills");
    default:
      // Reachable only when an untyped value slips past the compiler.
      throw new Error(`Unsupported runtime replay platform: ${String(platform)}`);
  }
}
|
|
136
|
+
|
|
137
|
+
/**
 * Translate an agent identifier into the platform value used by routing
 * replay fixtures.
 *
 * @param agent - Agent identifier; matched exactly (case-sensitive).
 * @returns `"claude_code"` for `"claude"`, `"codex"` for `"codex"`,
 *   `"opencode"` for `"opencode"`, and `undefined` for anything else,
 *   including `null` and `undefined`.
 */
export function resolveRuntimeReplayPlatform(
  agent: string | null | undefined,
): RoutingReplayFixture["platform"] | undefined {
  switch (agent) {
    case "claude":
      return "claude_code";
    case "codex":
      return "codex";
    case "opencode":
      return "opencode";
    default:
      return undefined;
  }
}
|
|
145
|
+
|
|
108
146
|
export function buildRoutingReplayFixture(options: {
|
|
109
147
|
skillName: string;
|
|
110
148
|
skillPath: string;
|
|
@@ -127,9 +165,19 @@ export function buildRoutingReplayFixture(options: {
|
|
|
127
165
|
};
|
|
128
166
|
}
|
|
129
167
|
|
|
130
|
-
function buildRuntimeReplayTargetContent(
|
|
168
|
+
/**
 * Produce the text of the target skill file with one part swapped for the
 * content being replayed.
 *
 * @param skillPath - Path of the skill file whose current text is read from disk.
 * @param content - Replacement text; surrounding whitespace is trimmed before use.
 * @param contentTarget - Which part to replace: `"body"` goes through
 *   `replaceBody`, `"description"` through `replaceDescription`, and any other
 *   value (i.e. `"routing"`) replaces the "Workflow Routing" section.
 * @returns The full skill file text with the replacement applied.
 */
function buildRuntimeReplayTargetContent(
  skillPath: string,
  content: string,
  contentTarget: RuntimeReplayContentTarget,
): string {
  const currentContent = readFileSync(skillPath, "utf8");
  const replacement = content.trim();

  switch (contentTarget) {
    case "body":
      return replaceBody(currentContent, replacement);
    case "description":
      return replaceDescription(currentContent, replacement);
    default:
      return replaceSection(currentContent, "Workflow Routing", replacement);
  }
}
|
|
134
182
|
|
|
135
183
|
function stageReplaySkill(
|
|
@@ -148,18 +196,19 @@ function stageReplaySkill(
|
|
|
148
196
|
|
|
149
197
|
function buildRuntimeReplayWorkspace(
|
|
150
198
|
fixture: RoutingReplayFixture,
|
|
151
|
-
|
|
199
|
+
content: string,
|
|
200
|
+
contentTarget: RuntimeReplayContentTarget,
|
|
152
201
|
): ReplayWorkspace {
|
|
153
202
|
const rootDir = mkdtempSync(join(tmpdir(), "selftune-runtime-replay-"));
|
|
154
203
|
try {
|
|
155
|
-
const registryDir = join(rootDir,
|
|
204
|
+
const registryDir = join(rootDir, getRuntimeReplayRegistryRelativeDir(fixture.platform));
|
|
156
205
|
mkdirSync(join(rootDir, ".git"), { recursive: true });
|
|
157
206
|
mkdirSync(registryDir, { recursive: true });
|
|
158
207
|
|
|
159
208
|
const targetSkillPath = stageReplaySkill(
|
|
160
209
|
registryDir,
|
|
161
210
|
fixture.target_skill_path,
|
|
162
|
-
buildRuntimeReplayTargetContent(fixture.target_skill_path,
|
|
211
|
+
buildRuntimeReplayTargetContent(fixture.target_skill_path, content, contentTarget),
|
|
163
212
|
);
|
|
164
213
|
const competingSkillPaths = fixture.competing_skill_paths.map((skillPath) =>
|
|
165
214
|
stageReplaySkill(registryDir, skillPath),
|
|
@@ -167,6 +216,7 @@ function buildRuntimeReplayWorkspace(
|
|
|
167
216
|
|
|
168
217
|
return {
|
|
169
218
|
rootDir,
|
|
219
|
+
skillRegistryDir: registryDir,
|
|
170
220
|
targetSkillPath,
|
|
171
221
|
competingSkillPaths,
|
|
172
222
|
};
|
|
@@ -180,8 +230,8 @@ function cleanupRuntimeReplayWorkspace(workspace: ReplayWorkspace): void {
|
|
|
180
230
|
rmSync(workspace.rootDir, { recursive: true, force: true });
|
|
181
231
|
}
|
|
182
232
|
|
|
183
|
-
function parseClaudeRuntimeReplayOutput(rawOutput: string):
|
|
184
|
-
const
|
|
233
|
+
function parseClaudeRuntimeReplayOutput(rawOutput: string): RuntimeReplayObservation {
|
|
234
|
+
const triggeredSkillNames = new Set<string>();
|
|
185
235
|
const readSkillPaths = new Set<string>();
|
|
186
236
|
let sessionId: string | undefined;
|
|
187
237
|
let runtimeError: string | undefined;
|
|
@@ -227,7 +277,7 @@ function parseClaudeRuntimeReplayOutput(rawOutput: string): ClaudeRuntimeReplayO
|
|
|
227
277
|
if (toolName === "Skill") {
|
|
228
278
|
const skillName = input.skill;
|
|
229
279
|
if (typeof skillName === "string" && skillName.trim()) {
|
|
230
|
-
|
|
280
|
+
triggeredSkillNames.add(skillName.trim());
|
|
231
281
|
}
|
|
232
282
|
}
|
|
233
283
|
|
|
@@ -241,7 +291,268 @@ function parseClaudeRuntimeReplayOutput(rawOutput: string): ClaudeRuntimeReplayO
|
|
|
241
291
|
}
|
|
242
292
|
|
|
243
293
|
return {
|
|
244
|
-
|
|
294
|
+
triggeredSkillNames: [...triggeredSkillNames],
|
|
295
|
+
readSkillPaths: [...readSkillPaths],
|
|
296
|
+
rawOutput,
|
|
297
|
+
...(sessionId ? { sessionId } : {}),
|
|
298
|
+
...(runtimeError ? { runtimeError } : {}),
|
|
299
|
+
};
|
|
300
|
+
}
|
|
301
|
+
|
|
302
|
+
/**
 * Collect the names of every skill taking part in a replay.
 *
 * @param input - Invoker input supplying the target skill name and the
 *   competing skill file paths.
 * @returns A set holding the trimmed target skill name followed by, for each
 *   competing path, the trimmed name of the directory that contains the file.
 */
function buildKnownSkillNames(input: RuntimeReplayInvokerInput): Set<string> {
  const knownNames = new Set<string>();
  knownNames.add(input.targetSkillName.trim());

  for (const competingPath of input.competingSkillPaths) {
    // The skill's name is the directory that holds its skill file.
    knownNames.add(basename(dirname(competingPath)).trim());
  }

  return knownNames;
}
|
|
308
|
+
|
|
309
|
+
/**
 * Find every skill file path mentioned in a piece of text.
 *
 * A path counts when it ends in `/SKILL.md` (case-insensitive) directly under
 * one skill directory of a recognised registry: `/etc/codex/skills/`,
 * `.agents/skills/`, `.codex/skills/` (optionally `.system/`),
 * `.opencode/skills/` or `.claude/skills/`. It must also be delimited by
 * whitespace, a quote, a backtick, or the start/end of the text.
 *
 * @param text - Arbitrary text such as a shell command or a message.
 * @returns The distinct matching paths in first-seen order; empty when `text`
 *   is empty or contains no match.
 */
function extractReplaySkillPathReferences(text: string): string[] {
  if (!text) return [];

  const skillFilePattern =
    /(?:^|[\s"'`])((?:\/etc\/codex\/skills\/[^/\s"'`]+|[^"'`\s]*?\.agents\/skills\/[^/\s"'`]+|[^"'`\s]*?\.codex\/skills\/(?:\.system\/)?[^/\s"'`]+|[^"'`\s]*?\.opencode\/skills\/[^/\s"'`]+|[^"'`\s]*?\.claude\/skills\/[^/\s"'`]+)\/SKILL\.md)(?=[\s"'`]|$)/gi;

  const uniquePaths = new Set<string>();
  for (const hit of text.matchAll(skillFilePattern)) {
    // Group 1 is the path itself, without the leading delimiter.
    const candidate = hit[1]?.trim();
    if (candidate) {
      uniquePaths.add(candidate);
    }
  }

  return Array.from(uniquePaths);
}
|
|
330
|
+
|
|
331
|
+
/**
 * Bring an event or item type label into one canonical spelling so that
 * variants such as `item.completed` and `item_completed` compare equal.
 *
 * @param value - Raw type field taken from a parsed event; may be anything.
 * @returns The label with every `.` and `_` replaced by `-`, trimmed and
 *   lower-cased, or an empty string when `value` is not a string.
 */
function normalizeReplayEventType(value: unknown): string {
  if (typeof value !== "string") {
    return "";
  }

  const dashed = value.replace(/[._]/g, "-");
  return dashed.trim().toLowerCase();
}
|
|
334
|
+
|
|
335
|
+
/**
 * Parse the newline-delimited JSON output of a Codex runtime replay into a
 * routing observation.
 *
 * Lines that are blank, are not valid JSON, or do not decode to a JSON object
 * are ignored. From the remaining events the parser records:
 * - `thread_id` as the session id (the last one seen wins);
 * - errors from a string `error` field, from `error.message` on `turn-failed`
 *   events, or from `message` on `error` events;
 * - skill paths and names referenced by `command-execution` items, by
 *   `function-call` arguments, by message content parts and by
 *   `agent-reasoning` text;
 * - explicit skill mentions, but only inside messages whose role is `user`;
 * - a non-zero numeric command exit code, if no other error was recorded yet.
 *
 * @param rawOutput - Full stdout text of the replay, one JSON event per line.
 * @param knownSkillNames - Skill names that may be reported as triggered.
 * @returns The de-duplicated triggered skill names and read skill paths, the
 *   untouched `rawOutput`, and `sessionId` / `runtimeError` when found.
 */
export function parseCodexRuntimeReplayOutput(
  rawOutput: string,
  knownSkillNames: Set<string>,
): RuntimeReplayObservation {
  const triggeredSkillNames = new Set<string>();
  const readSkillPaths = new Set<string>();
  let sessionId: string | undefined;
  let runtimeError: string | undefined;

  // Records skill file paths found in `text`, plus the known skills they belong to.
  const noteSkillPathsAndNames = (text: unknown): void => {
    if (typeof text !== "string" || !text) return;

    for (const filePath of extractReplaySkillPathReferences(text)) {
      readSkillPaths.add(filePath);
    }

    for (const skillName of extractSkillNamesFromPathReferences(text, knownSkillNames)) {
      triggeredSkillNames.add(skillName);
    }
  };

  // Records known skills that `text` mentions explicitly.
  const noteExplicitMentions = (text: unknown): void => {
    if (typeof text !== "string" || !text) return;
    for (const skillName of extractExplicitSkillMentions(text, knownSkillNames)) {
      triggeredSkillNames.add(skillName);
    }
  };

  for (const line of rawOutput.split("\n")) {
    const trimmed = line.trim();
    if (!trimmed) continue;

    let decoded: unknown;
    try {
      decoded = JSON.parse(trimmed);
    } catch {
      // Non-JSON noise (banners, logs) is expected in the stream; skip it.
      continue;
    }

    // JSON.parse also accepts bare `null`, numbers, strings and booleans.
    // None of them is an event, and reading a property of `null` would throw.
    if (typeof decoded !== "object" || decoded === null) continue;
    const parsed = decoded as Record<string, unknown>;

    const eventType = normalizeReplayEventType(parsed.type);

    const threadId = parsed.thread_id;
    if (typeof threadId === "string" && threadId) {
      sessionId = threadId;
    }

    if (typeof parsed.error === "string" && parsed.error) {
      runtimeError = parsed.error;
    } else if (eventType === "turn-failed") {
      const error = parsed.error;
      if (typeof error === "object" && error !== null) {
        const message = (error as Record<string, unknown>).message;
        if (typeof message === "string") {
          runtimeError = message;
        }
      }
    } else if (eventType === "error" && typeof parsed.message === "string" && parsed.message) {
      runtimeError = parsed.message;
    }

    if (
      eventType === "item-completed" ||
      eventType === "item-started" ||
      eventType === "item-updated"
    ) {
      const item =
        typeof parsed.item === "object" && parsed.item !== null
          ? (parsed.item as Record<string, unknown>)
          : undefined;
      const itemType = normalizeReplayEventType(item?.item_type ?? item?.type);

      if (itemType === "command-execution") {
        noteSkillPathsAndNames(item?.command);
        // A missing or null exit code means no exit status was reported
        // (presumably the command is still in progress), so only a numeric
        // non-zero code is a failure. An earlier error is never overwritten.
        const exitCode = item?.exit_code;
        if (typeof exitCode === "number" && exitCode !== 0 && !runtimeError) {
          runtimeError = `command execution exited with code ${String(exitCode)}`;
        }
      }
    }

    if (eventType === "response-item") {
      const payload =
        typeof parsed.payload === "object" && parsed.payload !== null
          ? (parsed.payload as Record<string, unknown>)
          : undefined;
      const payloadType = normalizeReplayEventType(payload?.type);

      if (payloadType === "function-call") {
        noteSkillPathsAndNames(payload?.arguments);
      } else if (payloadType === "message") {
        const role = payload?.role;
        const content = Array.isArray(payload?.content)
          ? (payload.content as Array<Record<string, unknown>>)
          : [];
        for (const part of content) {
          const text = part?.text;
          noteSkillPathsAndNames(text);
          // Only user-authored text counts as an explicit skill mention.
          if (role === "user") {
            noteExplicitMentions(text);
          }
        }
      } else if (payloadType === "agent-reasoning") {
        noteSkillPathsAndNames(payload?.text);
      }
    }
  }

  return {
    triggeredSkillNames: [...triggeredSkillNames],
    readSkillPaths: [...readSkillPaths],
    rawOutput,
    ...(sessionId ? { sessionId } : {}),
    ...(runtimeError ? { runtimeError } : {}),
  };
}
|
|
449
|
+
|
|
450
|
+
export function parseOpenCodeRuntimeReplayOutput(
|
|
451
|
+
rawOutput: string,
|
|
452
|
+
knownSkillNames: Set<string>,
|
|
453
|
+
): RuntimeReplayObservation {
|
|
454
|
+
const triggeredSkillNames = new Set<string>();
|
|
455
|
+
const readSkillPaths = new Set<string>();
|
|
456
|
+
let sessionId: string | undefined;
|
|
457
|
+
let runtimeError: string | undefined;
|
|
458
|
+
|
|
459
|
+
const noteSkillPathsAndNames = (text: unknown): void => {
|
|
460
|
+
if (typeof text !== "string" || !text) return;
|
|
461
|
+
|
|
462
|
+
for (const filePath of extractReplaySkillPathReferences(text)) {
|
|
463
|
+
readSkillPaths.add(filePath);
|
|
464
|
+
}
|
|
465
|
+
|
|
466
|
+
for (const skillName of extractSkillNamesFromPathReferences(text, knownSkillNames)) {
|
|
467
|
+
triggeredSkillNames.add(skillName);
|
|
468
|
+
}
|
|
469
|
+
};
|
|
470
|
+
|
|
471
|
+
const noteExplicitMentions = (text: unknown): void => {
|
|
472
|
+
if (typeof text !== "string" || !text) return;
|
|
473
|
+
for (const skillName of extractExplicitSkillMentions(text, knownSkillNames)) {
|
|
474
|
+
triggeredSkillNames.add(skillName);
|
|
475
|
+
}
|
|
476
|
+
};
|
|
477
|
+
|
|
478
|
+
for (const line of rawOutput.split("\n")) {
|
|
479
|
+
const trimmed = line.trim();
|
|
480
|
+
if (!trimmed) continue;
|
|
481
|
+
|
|
482
|
+
let parsed: Record<string, unknown>;
|
|
483
|
+
try {
|
|
484
|
+
parsed = JSON.parse(trimmed);
|
|
485
|
+
} catch {
|
|
486
|
+
continue;
|
|
487
|
+
}
|
|
488
|
+
|
|
489
|
+
const nestedPart =
|
|
490
|
+
typeof parsed.part === "object" && parsed.part !== null
|
|
491
|
+
? (parsed.part as Record<string, unknown>)
|
|
492
|
+
: undefined;
|
|
493
|
+
const eventType = normalizeReplayEventType(nestedPart?.type ?? parsed.type);
|
|
494
|
+
const payload =
|
|
495
|
+
nestedPart &&
|
|
496
|
+
(nestedPart.tool !== undefined || nestedPart.state !== undefined || nestedPart.text)
|
|
497
|
+
? nestedPart
|
|
498
|
+
: parsed;
|
|
499
|
+
|
|
500
|
+
const possibleSessionId = parsed.sessionID ?? parsed.session_id ?? payload.sessionID;
|
|
501
|
+
if (typeof possibleSessionId === "string" && possibleSessionId) {
|
|
502
|
+
sessionId = possibleSessionId;
|
|
503
|
+
}
|
|
504
|
+
|
|
505
|
+
if (typeof parsed.error === "string" && parsed.error) {
|
|
506
|
+
runtimeError = parsed.error;
|
|
507
|
+
} else if (typeof payload.error === "string" && payload.error) {
|
|
508
|
+
runtimeError = payload.error;
|
|
509
|
+
}
|
|
510
|
+
|
|
511
|
+
if (eventType === "tool") {
|
|
512
|
+
const toolName = normalizeReplayEventType(payload.tool ?? payload.name);
|
|
513
|
+
const state =
|
|
514
|
+
typeof payload.state === "object" && payload.state !== null
|
|
515
|
+
? (payload.state as Record<string, unknown>)
|
|
516
|
+
: {};
|
|
517
|
+
const input =
|
|
518
|
+
typeof state.input === "object" && state.input !== null
|
|
519
|
+
? (state.input as Record<string, unknown>)
|
|
520
|
+
: {};
|
|
521
|
+
const status = normalizeReplayEventType(state.status);
|
|
522
|
+
|
|
523
|
+
if (toolName === "read" || toolName === "read-file") {
|
|
524
|
+
const filePath = input.filePath ?? input.file_path ?? input.path;
|
|
525
|
+
if (typeof filePath === "string" && basename(filePath).toUpperCase() === "SKILL.MD") {
|
|
526
|
+
readSkillPaths.add(filePath);
|
|
527
|
+
triggeredSkillNames.add(basename(dirname(filePath)));
|
|
528
|
+
}
|
|
529
|
+
} else if (toolName === "bash" || toolName === "execute-bash") {
|
|
530
|
+
noteSkillPathsAndNames(input.command ?? input.cmd);
|
|
531
|
+
}
|
|
532
|
+
|
|
533
|
+
const metadata =
|
|
534
|
+
typeof state.metadata === "object" && state.metadata !== null
|
|
535
|
+
? (state.metadata as Record<string, unknown>)
|
|
536
|
+
: {};
|
|
537
|
+
const exitCode = metadata.exit;
|
|
538
|
+
if (status === "completed" && exitCode !== undefined && exitCode !== 0 && !runtimeError) {
|
|
539
|
+
runtimeError = `tool exited with code ${String(exitCode)}`;
|
|
540
|
+
}
|
|
541
|
+
} else if (eventType === "text" || eventType === "reasoning") {
|
|
542
|
+
noteSkillPathsAndNames(payload.text);
|
|
543
|
+
noteExplicitMentions(payload.text);
|
|
544
|
+
} else if (eventType === "error" && typeof payload.message === "string" && payload.message) {
|
|
545
|
+
runtimeError = payload.message;
|
|
546
|
+
} else if (eventType === "step-finish") {
|
|
547
|
+
const reason = payload.reason;
|
|
548
|
+
if (typeof reason === "string" && reason.toLowerCase() === "error" && !runtimeError) {
|
|
549
|
+
runtimeError = "step finished with error";
|
|
550
|
+
}
|
|
551
|
+
}
|
|
552
|
+
}
|
|
553
|
+
|
|
554
|
+
return {
|
|
555
|
+
triggeredSkillNames: [...triggeredSkillNames],
|
|
245
556
|
readSkillPaths: [...readSkillPaths],
|
|
246
557
|
rawOutput,
|
|
247
558
|
...(sessionId ? { sessionId } : {}),
|
|
@@ -250,8 +561,8 @@ function parseClaudeRuntimeReplayOutput(rawOutput: string): ClaudeRuntimeReplayO
|
|
|
250
561
|
}
|
|
251
562
|
|
|
252
563
|
async function invokeClaudeRuntimeReplay(
|
|
253
|
-
input:
|
|
254
|
-
): Promise<
|
|
564
|
+
input: RuntimeReplayInvokerInput,
|
|
565
|
+
): Promise<RuntimeReplayObservation> {
|
|
255
566
|
const command = [
|
|
256
567
|
"claude",
|
|
257
568
|
"-p",
|
|
@@ -289,7 +600,7 @@ async function invokeClaudeRuntimeReplay(
|
|
|
289
600
|
const observation = parseClaudeRuntimeReplayOutput(stdoutText);
|
|
290
601
|
const combinedError = [observation.runtimeError, stderrText.trim()].filter(Boolean).join(" | ");
|
|
291
602
|
const hasRoutingSignal =
|
|
292
|
-
observation.
|
|
603
|
+
observation.triggeredSkillNames.length > 0 || observation.readSkillPaths.length > 0;
|
|
293
604
|
|
|
294
605
|
if (exitCode !== 0 && !hasRoutingSignal) {
|
|
295
606
|
throw new Error(combinedError || `claude runtime replay exited with code ${exitCode}`);
|
|
@@ -301,20 +612,101 @@ async function invokeClaudeRuntimeReplay(
|
|
|
301
612
|
};
|
|
302
613
|
}
|
|
303
614
|
|
|
304
|
-
function
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
615
|
+
/**
 * Replays one query through the Codex CLI and reports which skills it selected.
 *
 * Spawns `codex exec --json --skip-git-repo-check --sandbox read-only -C <workspaceRoot>`
 * with the generic routing prompt followed by the user request, then parses stdout with
 * `parseCodexRuntimeReplayOutput`.
 *
 * @param input - Replay request. `query` and `workspaceRoot` are read here; the whole
 *   object is also forwarded to `buildKnownSkillNames`.
 * @returns The parsed observation. When the parser reported an error or stderr is
 *   non-empty, `runtimeError` is replaced by those parts joined with " | ".
 * @throws Error when the process exits non-zero (including being killed by the timeout)
 *   and the output contains neither a triggered skill name nor a skill read.
 */
async function invokeCodexRuntimeReplay(
  input: RuntimeReplayInvokerInput,
): Promise<RuntimeReplayObservation> {
  const prompt = `${GENERIC_RUNTIME_ROUTING_PROMPT}\n\nUser request: ${input.query}`;
  const command = [
    "codex",
    "exec",
    "--json",
    "--skip-git-repo-check",
    "--sandbox",
    "read-only",
    "-C",
    input.workspaceRoot,
    prompt,
  ];

  const proc = Bun.spawn(command, {
    cwd: input.workspaceRoot,
    stdout: "pipe",
    stderr: "pipe",
    env: { ...process.env, CLAUDECODE: "" },
  });

  // Remember whether the kill came from our own timer so the failure can say so.
  let timedOut = false;
  const timeout = setTimeout(() => {
    timedOut = true;
    proc.kill();
  }, HOST_RUNTIME_REPLAY_TIMEOUT_MS);

  let outputs: [string, string, number];
  try {
    outputs = await Promise.all([
      new Response(proc.stdout).text(),
      new Response(proc.stderr).text(),
      proc.exited,
    ]);
  } finally {
    // Always release the timer, even if reading the pipes rejects.
    clearTimeout(timeout);
  }
  const [stdoutText, stderrText, exitCode] = outputs;

  const observation = parseCodexRuntimeReplayOutput(stdoutText, buildKnownSkillNames(input));
  const combinedError = [observation.runtimeError, stderrText.trim()].filter(Boolean).join(" | ");
  const hasRoutingSignal =
    observation.triggeredSkillNames.length > 0 || observation.readSkillPaths.length > 0;

  // A non-zero exit is tolerated as long as the output still carried a routing signal.
  if (exitCode !== 0 && !hasRoutingSignal) {
    const timeoutNote = timedOut
      ? `codex runtime replay timed out after ${HOST_RUNTIME_REPLAY_TIMEOUT_MS}ms`
      : "";
    throw new Error(
      [timeoutNote, combinedError].filter(Boolean).join(" | ") ||
        `codex runtime replay exited with code ${exitCode}`,
    );
  }

  return {
    ...observation,
    ...(combinedError ? { runtimeError: combinedError } : {}),
  };
}
|
|
660
|
+
|
|
661
|
+
/**
 * Replays one query through the OpenCode CLI and reports which skills it selected.
 *
 * Spawns `opencode run --format json --dir <workspaceRoot> --dangerously-skip-permissions`
 * with the generic routing prompt followed by the user request, then parses stdout with
 * `parseOpenCodeRuntimeReplayOutput`.
 *
 * @param input - Replay request. `query` and `workspaceRoot` are read here; the whole
 *   object is also forwarded to `buildKnownSkillNames`.
 * @returns The parsed observation. When the parser reported an error or stderr is
 *   non-empty, `runtimeError` is replaced by those parts joined with " | ".
 * @throws Error when the process exits non-zero (including being killed by the timeout)
 *   and the output contains neither a triggered skill name nor a skill read.
 */
async function invokeOpenCodeRuntimeReplay(
  input: RuntimeReplayInvokerInput,
): Promise<RuntimeReplayObservation> {
  const prompt = `${GENERIC_RUNTIME_ROUTING_PROMPT}\n\nUser request: ${input.query}`;
  // NOTE(review): permissions are skipped here; presumably acceptable because
  // workspaceRoot is a throwaway replay workspace — confirm against the caller.
  const command = [
    "opencode",
    "run",
    "--format",
    "json",
    "--dir",
    input.workspaceRoot,
    "--dangerously-skip-permissions",
    prompt,
  ];

  const proc = Bun.spawn(command, {
    cwd: input.workspaceRoot,
    stdout: "pipe",
    stderr: "pipe",
    env: { ...process.env, CLAUDECODE: "" },
  });

  // Remember whether the kill came from our own timer so the failure can say so.
  let timedOut = false;
  const timeout = setTimeout(() => {
    timedOut = true;
    proc.kill();
  }, HOST_RUNTIME_REPLAY_TIMEOUT_MS);

  let outputs: [string, string, number];
  try {
    outputs = await Promise.all([
      new Response(proc.stdout).text(),
      new Response(proc.stderr).text(),
      proc.exited,
    ]);
  } finally {
    // Always release the timer, even if reading the pipes rejects.
    clearTimeout(timeout);
  }
  const [stdoutText, stderrText, exitCode] = outputs;

  const observation = parseOpenCodeRuntimeReplayOutput(stdoutText, buildKnownSkillNames(input));
  const combinedError = [observation.runtimeError, stderrText.trim()].filter(Boolean).join(" | ");
  const hasRoutingSignal =
    observation.triggeredSkillNames.length > 0 || observation.readSkillPaths.length > 0;

  // A non-zero exit is tolerated as long as the output still carried a routing signal.
  if (exitCode !== 0 && !hasRoutingSignal) {
    const timeoutNote = timedOut
      ? `opencode runtime replay timed out after ${HOST_RUNTIME_REPLAY_TIMEOUT_MS}ms`
      : "";
    throw new Error(
      [timeoutNote, combinedError].filter(Boolean).join(" | ") ||
        `opencode runtime replay exited with code ${exitCode}`,
    );
  }

  return {
    ...observation,
    ...(combinedError ? { runtimeError: combinedError } : {}),
  };
}
|
|
313
705
|
|
|
314
706
|
function evaluateRuntimeReplayObservation(
|
|
315
707
|
entry: EvalEntry,
|
|
316
708
|
fixture: RoutingReplayFixture,
|
|
317
|
-
observation:
|
|
709
|
+
observation: RuntimeReplayObservation,
|
|
318
710
|
workspace: ReplayWorkspace,
|
|
319
711
|
): RoutingReplayEntryResult {
|
|
320
712
|
const normalizedReadPaths = new Set(
|
|
@@ -325,14 +717,14 @@ function evaluateRuntimeReplayObservation(
|
|
|
325
717
|
...workspace.competingSkillPaths.map(resolveReplayPath),
|
|
326
718
|
]);
|
|
327
719
|
const targetSkillName = fixture.target_skill_name.trim();
|
|
328
|
-
const
|
|
329
|
-
const
|
|
720
|
+
const targetTriggered = observation.triggeredSkillNames.includes(targetSkillName);
|
|
721
|
+
const competingTriggered = observation.triggeredSkillNames.find((skillName) =>
|
|
330
722
|
fixture.competing_skill_paths.some(
|
|
331
723
|
(skillPath) => basename(dirname(skillPath)).trim() === skillName.trim(),
|
|
332
724
|
),
|
|
333
725
|
);
|
|
334
|
-
const
|
|
335
|
-
(skillName) => skillName.trim() !== targetSkillName && skillName.trim() !==
|
|
726
|
+
const unrelatedTriggered = observation.triggeredSkillNames.find(
|
|
727
|
+
(skillName) => skillName.trim() !== targetSkillName && skillName.trim() !== competingTriggered,
|
|
336
728
|
);
|
|
337
729
|
const unrelatedReadPaths = [...normalizedReadPaths].filter((path) => !allowedReadPaths.has(path));
|
|
338
730
|
const targetRead = normalizedReadPaths.has(resolveReplayPath(workspace.targetSkillPath));
|
|
@@ -342,43 +734,43 @@ function evaluateRuntimeReplayObservation(
|
|
|
342
734
|
const sessionPrefix = observation.sessionId
|
|
343
735
|
? `runtime replay session ${observation.sessionId}`
|
|
344
736
|
: "runtime replay";
|
|
345
|
-
if (observation.
|
|
737
|
+
if (observation.triggeredSkillNames.length > 1) {
|
|
346
738
|
return {
|
|
347
739
|
query: entry.query,
|
|
348
740
|
should_trigger: entry.should_trigger,
|
|
349
741
|
triggered: false,
|
|
350
742
|
passed: false,
|
|
351
|
-
evidence: `${sessionPrefix}
|
|
743
|
+
evidence: `${sessionPrefix} selected multiple skills: ${observation.triggeredSkillNames.join(", ")}`,
|
|
352
744
|
};
|
|
353
745
|
}
|
|
354
746
|
|
|
355
|
-
if (
|
|
747
|
+
if (targetTriggered) {
|
|
356
748
|
return {
|
|
357
749
|
query: entry.query,
|
|
358
750
|
should_trigger: entry.should_trigger,
|
|
359
751
|
triggered: true,
|
|
360
752
|
passed: entry.should_trigger,
|
|
361
|
-
evidence: `${sessionPrefix}
|
|
753
|
+
evidence: `${sessionPrefix} selected target skill: ${targetSkillName}`,
|
|
362
754
|
};
|
|
363
755
|
}
|
|
364
756
|
|
|
365
|
-
if (
|
|
757
|
+
if (competingTriggered) {
|
|
366
758
|
return {
|
|
367
759
|
query: entry.query,
|
|
368
760
|
should_trigger: entry.should_trigger,
|
|
369
761
|
triggered: false,
|
|
370
762
|
passed: !entry.should_trigger,
|
|
371
|
-
evidence: `${sessionPrefix}
|
|
763
|
+
evidence: `${sessionPrefix} selected competing skill: ${competingTriggered}`,
|
|
372
764
|
};
|
|
373
765
|
}
|
|
374
766
|
|
|
375
|
-
if (
|
|
767
|
+
if (unrelatedTriggered) {
|
|
376
768
|
return {
|
|
377
769
|
query: entry.query,
|
|
378
770
|
should_trigger: entry.should_trigger,
|
|
379
771
|
triggered: false,
|
|
380
772
|
passed: false,
|
|
381
|
-
evidence: `${sessionPrefix}
|
|
773
|
+
evidence: `${sessionPrefix} selected unrelated skill: ${unrelatedTriggered}`,
|
|
382
774
|
};
|
|
383
775
|
}
|
|
384
776
|
|
|
@@ -398,7 +790,7 @@ function evaluateRuntimeReplayObservation(
|
|
|
398
790
|
should_trigger: entry.should_trigger,
|
|
399
791
|
triggered: false,
|
|
400
792
|
passed: !entry.should_trigger,
|
|
401
|
-
evidence: `${sessionPrefix} only read the target skill without
|
|
793
|
+
evidence: `${sessionPrefix} only read the target skill without selecting it`,
|
|
402
794
|
};
|
|
403
795
|
}
|
|
404
796
|
|
|
@@ -408,7 +800,7 @@ function evaluateRuntimeReplayObservation(
|
|
|
408
800
|
should_trigger: entry.should_trigger,
|
|
409
801
|
triggered: false,
|
|
410
802
|
passed: !entry.should_trigger,
|
|
411
|
-
evidence: `${sessionPrefix} only read a competing skill without
|
|
803
|
+
evidence: `${sessionPrefix} only read a competing skill without selecting it`,
|
|
412
804
|
};
|
|
413
805
|
}
|
|
414
806
|
|
|
@@ -421,7 +813,7 @@ function evaluateRuntimeReplayObservation(
|
|
|
421
813
|
should_trigger: entry.should_trigger,
|
|
422
814
|
triggered: false,
|
|
423
815
|
passed: !entry.should_trigger,
|
|
424
|
-
evidence: `${sessionPrefix} did not
|
|
816
|
+
evidence: `${sessionPrefix} did not select any local project skill`,
|
|
425
817
|
};
|
|
426
818
|
}
|
|
427
819
|
|
|
@@ -578,33 +970,75 @@ export function runHostReplayFixture(options: {
|
|
|
578
970
|
});
|
|
579
971
|
}
|
|
580
972
|
|
|
581
|
-
|
|
973
|
+
/**
 * Picks the built-in runtime replay invoker for a fixture platform.
 *
 * @param platform - Platform tag taken from the routing replay fixture.
 * @returns The invoker for `claude_code`, `codex`, or `opencode`.
 * @throws Error when `platform` holds a value outside the handled cases at runtime
 *   (for example a fixture deserialized from untyped data), instead of silently
 *   returning `undefined` and failing later at the call site.
 */
function getDefaultRuntimeReplayInvoker(
  platform: RoutingReplayFixture["platform"],
): RuntimeReplayInvoker {
  switch (platform) {
    case "claude_code":
      return invokeClaudeRuntimeReplay;
    case "codex":
      return invokeCodexRuntimeReplay;
    case "opencode":
      return invokeOpenCodeRuntimeReplay;
    default: {
      // Compile-time exhaustiveness guard: a new platform must get its own case.
      const unsupported: never = platform;
      throw new Error(`unsupported runtime replay platform: ${String(unsupported)}`);
    }
  }
}
|
|
985
|
+
|
|
986
|
+
/**
 * Assembles replay validation options backed by a host runtime replay.
 *
 * Resolves the replay platform from `options.agent` and builds a routing replay
 * fixture for the skill. The returned runner delegates to `runHostRuntimeReplayFixture`,
 * using `options.contentTarget` or "routing" when none is given.
 *
 * @returns The fixture plus runner, or `undefined` when no platform can be resolved
 *   or building the fixture throws.
 */
export function buildRuntimeReplayValidationOptions(options: {
  skillName: string;
  skillPath: string;
  agent: string | null | undefined;
  contentTarget?: RuntimeReplayContentTarget;
}): ReplayValidationOptions | undefined {
  const platform = resolveRuntimeReplayPlatform(options.agent);
  if (!platform) {
    return undefined;
  }

  try {
    const { skillName, skillPath } = options;
    const replayFixture = buildRoutingReplayFixture({ skillName, skillPath, platform });

    return {
      replayFixture,
      replayRunner: async (request) => {
        // The content target is resolved on each run, not when the options are built.
        const contentTarget = options.contentTarget ?? "routing";
        return await runHostRuntimeReplayFixture({
          routing: request.routing,
          evalSet: request.evalSet,
          fixture: request.fixture,
          contentTarget,
        });
      },
    };
  } catch {
    // Best effort: any failure while preparing the fixture means "no replay options".
    return undefined;
  }
}
|
|
1016
|
+
|
|
1017
|
+
export async function runHostRuntimeReplayFixture(options: {
|
|
582
1018
|
routing: string;
|
|
583
1019
|
evalSet: EvalEntry[];
|
|
584
1020
|
fixture: RoutingReplayFixture;
|
|
585
|
-
|
|
1021
|
+
contentTarget?: RuntimeReplayContentTarget;
|
|
1022
|
+
runtimeInvoker?: RuntimeReplayInvoker;
|
|
586
1023
|
}): Promise<RoutingReplayEntryResult[]> {
|
|
587
|
-
const
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
if (options.fixture.platform !== "claude_code") {
|
|
591
|
-
return prefixReplayEvidence(
|
|
592
|
-
runHostReplayFixture(options),
|
|
593
|
-
fallbackReason(`unsupported platform ${options.fixture.platform}`),
|
|
594
|
-
);
|
|
595
|
-
}
|
|
596
|
-
|
|
597
|
-
const invokeRuntime = options.runtimeInvoker ?? invokeClaudeRuntimeReplay;
|
|
1024
|
+
const invokeRuntime =
|
|
1025
|
+
options.runtimeInvoker ?? getDefaultRuntimeReplayInvoker(options.fixture.platform);
|
|
598
1026
|
let workspace: ReplayWorkspace | undefined;
|
|
599
1027
|
|
|
600
1028
|
try {
|
|
601
|
-
workspace = buildRuntimeReplayWorkspace(
|
|
1029
|
+
workspace = buildRuntimeReplayWorkspace(
|
|
1030
|
+
options.fixture,
|
|
1031
|
+
options.routing,
|
|
1032
|
+
options.contentTarget ?? "routing",
|
|
1033
|
+
);
|
|
602
1034
|
const results: RoutingReplayEntryResult[] = [];
|
|
603
1035
|
|
|
604
1036
|
for (const entry of options.evalSet) {
|
|
605
1037
|
const observation = await invokeRuntime({
|
|
606
1038
|
query: entry.query,
|
|
1039
|
+
platform: options.fixture.platform,
|
|
607
1040
|
workspaceRoot: workspace.rootDir,
|
|
1041
|
+
skillRegistryDir: workspace.skillRegistryDir,
|
|
608
1042
|
targetSkillName: options.fixture.target_skill_name,
|
|
609
1043
|
targetSkillPath: workspace.targetSkillPath,
|
|
610
1044
|
competingSkillPaths: workspace.competingSkillPaths,
|
|
@@ -617,8 +1051,24 @@ export async function runClaudeRuntimeReplayFixture(options: {
|
|
|
617
1051
|
return results;
|
|
618
1052
|
} catch (error) {
|
|
619
1053
|
const message = error instanceof Error ? error.message : String(error);
|
|
620
|
-
|
|
1054
|
+
throw new Error(message);
|
|
621
1055
|
} finally {
|
|
622
1056
|
if (workspace) cleanupRuntimeReplayWorkspace(workspace);
|
|
623
1057
|
}
|
|
624
1058
|
}
|
|
1059
|
+
|
|
1060
|
+
/**
 * Claude-only entry point for runtime replay.
 *
 * Forwards the options unchanged to `runHostRuntimeReplayFixture` when the fixture
 * targets `claude_code`.
 *
 * @throws Error (as a rejected promise) when the fixture platform is anything else.
 */
export async function runClaudeRuntimeReplayFixture(options: {
  routing: string;
  evalSet: EvalEntry[];
  fixture: RoutingReplayFixture;
  contentTarget?: RuntimeReplayContentTarget;
  runtimeInvoker?: RuntimeReplayInvoker;
}): Promise<RoutingReplayEntryResult[]> {
  const { platform } = options.fixture;

  if (platform === "claude_code") {
    return runHostRuntimeReplayFixture(options);
  }

  throw new Error(
    `runtime replay is only supported for claude_code fixtures (received ${platform})`,
  );
}
|