selftune 0.1.4 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/diagnosis-analyst.md +156 -0
- package/.claude/agents/evolution-reviewer.md +180 -0
- package/.claude/agents/integration-guide.md +212 -0
- package/.claude/agents/pattern-analyst.md +160 -0
- package/CHANGELOG.md +46 -1
- package/README.md +105 -257
- package/apps/local-dashboard/dist/assets/geist-cyrillic-wght-normal-CHSlOQsW.woff2 +0 -0
- package/apps/local-dashboard/dist/assets/geist-latin-ext-wght-normal-DMtmJ5ZE.woff2 +0 -0
- package/apps/local-dashboard/dist/assets/geist-latin-wght-normal-Dm3htQBi.woff2 +0 -0
- package/apps/local-dashboard/dist/assets/index-C4EOTFZ2.js +15 -0
- package/apps/local-dashboard/dist/assets/index-bl-Webyd.css +1 -0
- package/apps/local-dashboard/dist/assets/vendor-react-U7zYD9Rg.js +60 -0
- package/apps/local-dashboard/dist/assets/vendor-table-B7VF2Ipl.js +26 -0
- package/apps/local-dashboard/dist/assets/vendor-ui-D7_zX_qy.js +346 -0
- package/apps/local-dashboard/dist/favicon.png +0 -0
- package/apps/local-dashboard/dist/index.html +17 -0
- package/apps/local-dashboard/dist/logo.png +0 -0
- package/apps/local-dashboard/dist/logo.svg +9 -0
- package/assets/BeforeAfter.gif +0 -0
- package/assets/FeedbackLoop.gif +0 -0
- package/assets/logo.svg +9 -0
- package/assets/skill-health-badge.svg +20 -0
- package/cli/selftune/activation-rules.ts +171 -0
- package/cli/selftune/badge/badge-data.ts +108 -0
- package/cli/selftune/badge/badge-svg.ts +212 -0
- package/cli/selftune/badge/badge.ts +99 -0
- package/cli/selftune/canonical-export.ts +183 -0
- package/cli/selftune/constants.ts +103 -1
- package/cli/selftune/contribute/bundle.ts +314 -0
- package/cli/selftune/contribute/contribute.ts +214 -0
- package/cli/selftune/contribute/sanitize.ts +162 -0
- package/cli/selftune/cron/setup.ts +266 -0
- package/cli/selftune/dashboard-contract.ts +202 -0
- package/cli/selftune/dashboard-server.ts +1049 -0
- package/cli/selftune/dashboard.ts +43 -156
- package/cli/selftune/eval/baseline.ts +248 -0
- package/cli/selftune/eval/composability-v2.ts +273 -0
- package/cli/selftune/eval/composability.ts +117 -0
- package/cli/selftune/eval/generate-unit-tests.ts +143 -0
- package/cli/selftune/eval/hooks-to-evals.ts +101 -16
- package/cli/selftune/eval/import-skillsbench.ts +221 -0
- package/cli/selftune/eval/synthetic-evals.ts +172 -0
- package/cli/selftune/eval/unit-test-cli.ts +152 -0
- package/cli/selftune/eval/unit-test.ts +196 -0
- package/cli/selftune/evolution/deploy-proposal.ts +142 -1
- package/cli/selftune/evolution/evidence.ts +26 -0
- package/cli/selftune/evolution/evolve-body.ts +586 -0
- package/cli/selftune/evolution/evolve.ts +825 -116
- package/cli/selftune/evolution/extract-patterns.ts +105 -16
- package/cli/selftune/evolution/pareto.ts +314 -0
- package/cli/selftune/evolution/propose-body.ts +171 -0
- package/cli/selftune/evolution/propose-description.ts +100 -2
- package/cli/selftune/evolution/propose-routing.ts +166 -0
- package/cli/selftune/evolution/refine-body.ts +141 -0
- package/cli/selftune/evolution/rollback.ts +21 -4
- package/cli/selftune/evolution/validate-body.ts +254 -0
- package/cli/selftune/evolution/validate-proposal.ts +257 -35
- package/cli/selftune/evolution/validate-routing.ts +177 -0
- package/cli/selftune/grading/auto-grade.ts +200 -0
- package/cli/selftune/grading/grade-session.ts +513 -42
- package/cli/selftune/grading/pre-gates.ts +104 -0
- package/cli/selftune/grading/results.ts +42 -0
- package/cli/selftune/hooks/auto-activate.ts +185 -0
- package/cli/selftune/hooks/evolution-guard.ts +165 -0
- package/cli/selftune/hooks/prompt-log.ts +172 -2
- package/cli/selftune/hooks/session-stop.ts +123 -3
- package/cli/selftune/hooks/skill-change-guard.ts +112 -0
- package/cli/selftune/hooks/skill-eval.ts +119 -3
- package/cli/selftune/index.ts +415 -48
- package/cli/selftune/ingestors/claude-replay.ts +377 -0
- package/cli/selftune/ingestors/codex-rollout.ts +345 -46
- package/cli/selftune/ingestors/codex-wrapper.ts +207 -39
- package/cli/selftune/ingestors/openclaw-ingest.ts +573 -0
- package/cli/selftune/ingestors/opencode-ingest.ts +193 -17
- package/cli/selftune/init.ts +376 -16
- package/cli/selftune/last.ts +14 -5
- package/cli/selftune/localdb/db.ts +63 -0
- package/cli/selftune/localdb/materialize.ts +428 -0
- package/cli/selftune/localdb/queries.ts +376 -0
- package/cli/selftune/localdb/schema.ts +204 -0
- package/cli/selftune/memory/writer.ts +447 -0
- package/cli/selftune/monitoring/watch.ts +90 -16
- package/cli/selftune/normalization.ts +682 -0
- package/cli/selftune/observability.ts +19 -44
- package/cli/selftune/orchestrate.ts +1073 -0
- package/cli/selftune/quickstart.ts +203 -0
- package/cli/selftune/repair/skill-usage.ts +576 -0
- package/cli/selftune/schedule.ts +561 -0
- package/cli/selftune/status.ts +59 -33
- package/cli/selftune/sync.ts +627 -0
- package/cli/selftune/types.ts +525 -5
- package/cli/selftune/utils/canonical-log.ts +45 -0
- package/cli/selftune/utils/frontmatter.ts +217 -0
- package/cli/selftune/utils/hooks.ts +41 -0
- package/cli/selftune/utils/html.ts +27 -0
- package/cli/selftune/utils/llm-call.ts +103 -19
- package/cli/selftune/utils/math.ts +10 -0
- package/cli/selftune/utils/query-filter.ts +139 -0
- package/cli/selftune/utils/skill-discovery.ts +340 -0
- package/cli/selftune/utils/skill-log.ts +68 -0
- package/cli/selftune/utils/skill-usage-confidence.ts +18 -0
- package/cli/selftune/utils/transcript.ts +307 -26
- package/cli/selftune/utils/trigger-check.ts +89 -0
- package/cli/selftune/utils/tui.ts +156 -0
- package/cli/selftune/workflows/discover.ts +254 -0
- package/cli/selftune/workflows/skill-md-writer.ts +288 -0
- package/cli/selftune/workflows/workflows.ts +188 -0
- package/package.json +28 -11
- package/packages/telemetry-contract/README.md +11 -0
- package/packages/telemetry-contract/fixtures/golden.json +87 -0
- package/packages/telemetry-contract/fixtures/golden.test.ts +42 -0
- package/packages/telemetry-contract/index.ts +1 -0
- package/packages/telemetry-contract/package.json +19 -0
- package/packages/telemetry-contract/src/index.ts +2 -0
- package/packages/telemetry-contract/src/types.ts +163 -0
- package/packages/telemetry-contract/src/validators.ts +109 -0
- package/skill/SKILL.md +180 -33
- package/skill/Workflows/AutoActivation.md +145 -0
- package/skill/Workflows/Badge.md +124 -0
- package/skill/Workflows/Baseline.md +144 -0
- package/skill/Workflows/Composability.md +107 -0
- package/skill/Workflows/Contribute.md +94 -0
- package/skill/Workflows/Cron.md +132 -0
- package/skill/Workflows/Dashboard.md +214 -0
- package/skill/Workflows/Doctor.md +63 -14
- package/skill/Workflows/Evals.md +110 -18
- package/skill/Workflows/EvolutionMemory.md +154 -0
- package/skill/Workflows/Evolve.md +181 -21
- package/skill/Workflows/EvolveBody.md +159 -0
- package/skill/Workflows/Grade.md +36 -31
- package/skill/Workflows/ImportSkillsBench.md +117 -0
- package/skill/Workflows/Ingest.md +142 -21
- package/skill/Workflows/Initialize.md +91 -23
- package/skill/Workflows/Orchestrate.md +139 -0
- package/skill/Workflows/Replay.md +91 -0
- package/skill/Workflows/Rollback.md +23 -4
- package/skill/Workflows/Schedule.md +61 -0
- package/skill/Workflows/Sync.md +88 -0
- package/skill/Workflows/UnitTest.md +150 -0
- package/skill/Workflows/Watch.md +33 -1
- package/skill/Workflows/Workflows.md +129 -0
- package/skill/assets/activation-rules-default.json +26 -0
- package/skill/assets/multi-skill-settings.json +63 -0
- package/skill/assets/single-skill-settings.json +57 -0
- package/skill/references/invocation-taxonomy.md +2 -2
- package/skill/references/logs.md +164 -2
- package/skill/references/setup-patterns.md +65 -0
- package/skill/references/version-history.md +40 -0
- package/skill/settings_snippet.json +23 -0
- package/templates/activation-rules-default.json +27 -0
- package/templates/multi-skill-settings.json +64 -0
- package/templates/single-skill-settings.json +58 -0
- package/dashboard/index.html +0 -1119
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* validate-routing.ts
|
|
3
|
+
*
|
|
4
|
+
* Validates a routing table evolution proposal by checking structural validity
|
|
5
|
+
* and running trigger accuracy checks against an eval set.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import type { BodyEvolutionProposal, BodyValidationResult, EvalEntry } from "../types.js";
|
|
9
|
+
import { callLlm } from "../utils/llm-call.js";
|
|
10
|
+
import { buildTriggerCheckPrompt, parseTriggerResponse } from "../utils/trigger-check.js";
|
|
11
|
+
|
|
12
|
+
// ---------------------------------------------------------------------------
|
|
13
|
+
// Structural validation
|
|
14
|
+
// ---------------------------------------------------------------------------
|
|
15
|
+
|
|
16
|
+
/**
 * Check that a routing table is valid markdown table syntax with
 * `| Trigger | Workflow |` columns.
 *
 * Blank lines are ignored and every remaining line is trimmed before it is
 * inspected. The table must consist of:
 *   1. a pipe-delimited header row mentioning both "trigger" and "workflow"
 *      (case-insensitive),
 *   2. a pipe-delimited separator row containing `---` and nothing other than
 *      pipes, dashes, colons and whitespace,
 *   3. at least one pipe-delimited data row.
 *
 * @param routing - Raw markdown text of the routing table.
 * @returns `valid` together with a `reason` describing the first problem
 *   found, or confirming that the table is valid.
 */
export function validateRoutingStructure(routing: string): { valid: boolean; reason: string } {
  const lines = routing
    .split("\n")
    .map((line) => line.trim())
    .filter((line) => line.length > 0);

  if (lines.length < 2) {
    return { valid: false, reason: "Routing table must have at least a header and one data row" };
  }

  const isTableRow = (line: string): boolean => line.startsWith("|") && line.endsWith("|");

  // Check header row contains Trigger and Workflow columns
  const headerLine = lines[0];
  if (!isTableRow(headerLine)) {
    return {
      valid: false,
      reason: "Header row must be a markdown table row starting and ending with |",
    };
  }

  const headerLower = headerLine.toLowerCase();
  if (!headerLower.includes("trigger") || !headerLower.includes("workflow")) {
    return { valid: false, reason: "Header must contain 'Trigger' and 'Workflow' columns" };
  }

  // Check separator row (line 2). Merely containing "---" is not enough: a
  // prose line such as "see --- below" would otherwise pass as a separator.
  // Require the same pipe delimiting as every other row and allow only the
  // characters a markdown delimiter row can contain.
  const separatorLine = lines[1];
  const isSeparator =
    separatorLine.includes("---") &&
    isTableRow(separatorLine) &&
    /^[|\-:\s]+$/.test(separatorLine);
  if (!isSeparator) {
    return { valid: false, reason: "Second row must be a markdown table separator (contains ---)" };
  }

  // Check at least one data row
  if (lines.length < 3) {
    return { valid: false, reason: "Routing table must have at least one data row" };
  }

  // Check data rows are pipe-delimited; rows are reported 1-based
  for (let i = 2; i < lines.length; i++) {
    if (!isTableRow(lines[i])) {
      return { valid: false, reason: `Data row ${i - 1} is not a valid markdown table row` };
    }
  }

  return { valid: true, reason: "Valid markdown routing table" };
}
|
|
65
|
+
|
|
66
|
+
// ---------------------------------------------------------------------------
|
|
67
|
+
// Trigger accuracy validation
|
|
68
|
+
// ---------------------------------------------------------------------------
|
|
69
|
+
|
|
70
|
+
/**
 * Compare trigger accuracy of the original and the proposed routing content
 * over an eval set.
 *
 * For each eval entry a trigger-check prompt is built for the original
 * routing and then for the proposed routing; each prompt is sent through
 * `callLlm` and the reply is interpreted with `parseTriggerResponse`. An
 * entry passes when the parsed answer agrees with `entry.should_trigger`.
 * Calls are awaited one at a time, original first.
 *
 * @param originalRouting - Routing content currently in use.
 * @param proposedRouting - Candidate routing content.
 * @param evalSet - Entries providing `query` and `should_trigger`.
 * @param agent - Agent identifier forwarded to `callLlm`.
 * @param modelFlag - Optional model flag forwarded to `callLlm`.
 * @returns Pass rates (fraction of entries passed) for both routings, and
 *   `improved` when the proposed rate is strictly higher. An empty eval set
 *   yields zero rates and `improved: false` without any LLM call.
 */
export async function validateRoutingTriggerAccuracy(
  originalRouting: string,
  proposedRouting: string,
  evalSet: EvalEntry[],
  agent: string,
  modelFlag?: string,
): Promise<{ before_pass_rate: number; after_pass_rate: number; improved: boolean }> {
  const total = evalSet.length;
  if (total === 0) {
    return { before_pass_rate: 0, after_pass_rate: 0, improved: false };
  }

  const systemPrompt = "You are an evaluation assistant. Answer only YES or NO.";

  // Runs one trigger check and reports whether it matched the expectation.
  const matchesExpectation = async (routing: string, entry: EvalEntry): Promise<boolean> => {
    const prompt = buildTriggerCheckPrompt(routing, entry.query);
    const rawAnswer = await callLlm(systemPrompt, prompt, agent, modelFlag);
    const triggered = parseTriggerResponse(rawAnswer);
    return entry.should_trigger ? Boolean(triggered) : !triggered;
  };

  let originalHits = 0;
  let proposedHits = 0;

  for (const entry of evalSet) {
    // Original first, then proposed — kept sequential on purpose.
    const originalOk = await matchesExpectation(originalRouting, entry);
    const proposedOk = await matchesExpectation(proposedRouting, entry);

    if (originalOk) originalHits += 1;
    if (proposedOk) proposedHits += 1;
  }

  const before_pass_rate = originalHits / total;
  const after_pass_rate = proposedHits / total;

  return {
    before_pass_rate,
    after_pass_rate,
    improved: after_pass_rate > before_pass_rate,
  };
}
|
|
118
|
+
|
|
119
|
+
// ---------------------------------------------------------------------------
|
|
120
|
+
// Full routing validation
|
|
121
|
+
// ---------------------------------------------------------------------------
|
|
122
|
+
|
|
123
|
+
/**
 * Validate a routing table proposal: structural check + trigger accuracy.
 *
 * Gate 1 ("structural") runs `validateRoutingStructure` on
 * `proposal.proposed_body`; when it fails the function returns immediately
 * with zero gates passed and the trigger check is not run. Gate 2
 * ("trigger_accuracy") runs `validateRoutingTriggerAccuracy` with the
 * original and proposed bodies and passes only when accuracy improved.
 *
 * @param proposal - Proposal supplying `proposal_id`, `original_body` and
 *   `proposed_body`.
 * @param evalSet - Eval entries forwarded to the trigger accuracy check.
 * @param agent - Agent identifier forwarded to the trigger accuracy check.
 * @param modelFlag - Optional model flag forwarded to the trigger accuracy check.
 * @returns Gate-by-gate results; `improved` is true only when both gates
 *   passed. `regressions` is always an empty array here.
 */
export async function validateRoutingProposal(
  proposal: BodyEvolutionProposal,
  evalSet: EvalEntry[],
  agent: string,
  modelFlag?: string,
): Promise<BodyValidationResult> {
  const gateResults: Array<{ gate: string; passed: boolean; reason: string }> = [];

  // Gate 1: Structural validation
  const { valid: structureOk, reason: structureReason } = validateRoutingStructure(
    proposal.proposed_body,
  );
  gateResults.push({ gate: "structural", passed: structureOk, reason: structureReason });

  if (!structureOk) {
    // A malformed table is not worth spending LLM calls on.
    return {
      proposal_id: proposal.proposal_id,
      gates_passed: 0,
      gates_total: 2,
      gate_results: gateResults,
      improved: false,
      regressions: [],
    };
  }

  // Gate 2: Trigger accuracy
  const {
    before_pass_rate: beforeRate,
    after_pass_rate: afterRate,
    improved: accuracyImproved,
  } = await validateRoutingTriggerAccuracy(
    proposal.original_body,
    proposal.proposed_body,
    evalSet,
    agent,
    modelFlag,
  );

  const asPercent = (rate: number): string => `${(rate * 100).toFixed(1)}%`;
  const verdict = accuracyImproved ? "Improved" : "Not improved";
  gateResults.push({
    gate: "trigger_accuracy",
    passed: accuracyImproved,
    reason: `${verdict}: ${asPercent(beforeRate)} -> ${asPercent(afterRate)}`,
  });

  let gatesPassed = 0;
  for (const gate of gateResults) {
    if (gate.passed) gatesPassed += 1;
  }

  return {
    proposal_id: proposal.proposal_id,
    gates_passed: gatesPassed,
    gates_total: 2,
    gate_results: gateResults,
    improved: gatesPassed === 2,
    regressions: [],
  };
}
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
/**
|
|
3
|
+
* auto-grade.ts
|
|
4
|
+
*
|
|
5
|
+
* Frictionless grading command that auto-finds the most recent real session
|
|
6
|
+
* for a skill, auto-derives expectations from SKILL.md, grades, and outputs results.
|
|
7
|
+
*
|
|
8
|
+
* Usage:
|
|
9
|
+
* selftune auto-grade --skill <name> [--skill-path <path>] [--output <path>] [--agent <agent>]
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import { mkdirSync, writeFileSync } from "node:fs";
|
|
13
|
+
import { dirname } from "node:path";
|
|
14
|
+
import { parseArgs } from "node:util";
|
|
15
|
+
|
|
16
|
+
import { AGENT_CANDIDATES, TELEMETRY_LOG } from "../constants.js";
|
|
17
|
+
import type { GradingResult, SessionTelemetryRecord } from "../types.js";
|
|
18
|
+
import { readJsonl } from "../utils/jsonl.js";
|
|
19
|
+
import { detectAgent as _detectAgent } from "../utils/llm-call.js";
|
|
20
|
+
import { readEffectiveSkillUsageRecords } from "../utils/skill-log.js";
|
|
21
|
+
import { readExcerpt } from "../utils/transcript.js";
|
|
22
|
+
import {
|
|
23
|
+
buildDefaultGradingOutputPath,
|
|
24
|
+
deriveExpectationsFromSkill,
|
|
25
|
+
gradeSession,
|
|
26
|
+
resolveLatestSessionForSkill,
|
|
27
|
+
resolveSessionById,
|
|
28
|
+
} from "./grade-session.js";
|
|
29
|
+
|
|
30
|
+
/**
 * Entry point for `selftune auto-grade`.
 *
 * Steps, in order:
 *   1. Parse CLI flags (strict mode); `--help` prints usage and exits 0.
 *   2. Require `--skill`.
 *   3. Pick the agent: a validated `--agent` value, otherwise the result of
 *      `detectAgent()`.
 *   4. Read the telemetry log and effective skill-usage records, then
 *      resolve the session — by `--session-id` when given, otherwise the
 *      latest session for the skill.
 *   5. Read a transcript excerpt (optionally printing it).
 *   6. Derive expectations for the skill and grade the session.
 *   7. Write the grading result as JSON and print a summary.
 *
 * Progress and error messages go to stderr via `console.error`; the
 * transcript excerpt and result summary go to stdout. Every failure path
 * handled here ends with `process.exit(1)`.
 */
export async function cliMain(): Promise<void> {
  const { values } = parseArgs({
    options: {
      skill: { type: "string" },
      "skill-path": { type: "string" },
      "session-id": { type: "string" },
      "telemetry-log": { type: "string", default: TELEMETRY_LOG },
      output: { type: "string" },
      agent: { type: "string" },
      "show-transcript": { type: "boolean", default: false },
      help: { type: "boolean", short: "h", default: false },
    },
    strict: true,
  });

  if (values.help) {
    const helpText = `selftune auto-grade — Frictionless skill session grading

Usage:
selftune auto-grade --skill <name> [options]

Options:
--skill Skill name (required)
--skill-path Path to SKILL.md (auto-detected if omitted)
--session-id Grade a specific session (auto-detects most recent if omitted)
--telemetry-log Path to telemetry log (default: ~/.claude/session_telemetry_log.jsonl)
--output Output path for grading JSON (default: ~/.selftune/grading/result-<session>.json)
--agent Agent CLI to use (${AGENT_CANDIDATES.join(", ")})
--show-transcript Print transcript excerpt before grading
-h, --help Show this help message`;
    console.log(helpText);
    process.exit(0);
  }

  const { skill } = values;
  if (!skill) {
    console.error("[ERROR] --skill is required");
    process.exit(1);
  }

  // --- Determine agent ---
  const requestedAgent = values.agent;
  const validAgents = [...AGENT_CANDIDATES];
  if (requestedAgent && !validAgents.includes(requestedAgent)) {
    console.error(
      `[ERROR] Invalid --agent '${requestedAgent}'. Expected one of: ${validAgents.join(", ")}`,
    );
    process.exit(1);
  }
  // An explicit (already validated) --agent wins; otherwise fall back to detection.
  const agent: string | null = requestedAgent ? requestedAgent : _detectAgent();

  if (!agent) {
    console.error(
      `[ERROR] No supported agent CLI (${AGENT_CANDIDATES.join("/")}) found in PATH.\nInstall one of the supported agent CLIs.`,
    );
    process.exit(1);
  }

  console.error(`[INFO] Auto-grade via agent: ${agent}`);

  // --- Auto-find session ---
  const telemetryLogPath = values["telemetry-log"] ?? TELEMETRY_LOG;
  const telemetryRecords = readJsonl<SessionTelemetryRecord>(telemetryLogPath);
  const usageRecords = readEffectiveSkillUsageRecords();

  let telemetry: SessionTelemetryRecord;
  let sessionId: string;
  let transcriptPath: string;

  const explicitSessionId = values["session-id"];
  if (explicitSessionId) {
    sessionId = explicitSessionId;
    const match = resolveSessionById(telemetryRecords, sessionId);
    if (!match) {
      console.error(
        `[ERROR] Session '${sessionId}' not found in telemetry or recoverable transcript data. Check the session ID or omit --session-id to auto-select the latest matching session.`,
      );
      process.exit(1);
    }
    telemetry = match.telemetry;
    transcriptPath = match.transcriptPath;
  } else {
    const latest = resolveLatestSessionForSkill(telemetryRecords, usageRecords, skill);
    if (!latest) {
      console.error(
        `[ERROR] No session found for skill '${skill}'. Run the skill first, or pass --session-id.`,
      );
      process.exit(1);
    }
    telemetry = latest.telemetry;
    sessionId = latest.sessionId ?? "unknown";
    transcriptPath = latest.transcriptPath ?? "";
    // Mention where the session came from unless it is plain telemetry.
    const sourceNote =
      latest.source === "telemetry" ? "" : ` (${latest.source.replaceAll("_", " ")})`;
    console.error(`[INFO] Found most recent '${skill}' session: ${sessionId}${sourceNote}`);
  }

  const transcriptExcerpt = transcriptPath ? readExcerpt(transcriptPath) : "(no transcript)";

  if (values["show-transcript"]) {
    console.log("=== TRANSCRIPT EXCERPT ===");
    console.log(transcriptExcerpt);
    console.log("==========================\n");
  }

  // --- Auto-derive expectations ---
  const derivation = deriveExpectationsFromSkill(skill, values["skill-path"]);
  const expectations = derivation.expectations;
  if (derivation.derived) {
    console.error(
      `[INFO] Auto-derived ${expectations.length} expectations from ${derivation.source}`,
    );
  } else {
    console.error(`[WARN] Using generic expectations (${derivation.source})`);
  }

  let result: GradingResult;
  try {
    result = await gradeSession({
      expectations,
      telemetry,
      sessionId,
      skillName: skill,
      transcriptExcerpt,
      transcriptPath,
      agent,
    });
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    console.error(`[ERROR] ${message}`);
    process.exit(1);
  }

  // --- Persist result ---
  const outputPath = values.output ?? buildDefaultGradingOutputPath(sessionId);
  const outputDir = dirname(outputPath);
  if (outputDir !== ".") {
    mkdirSync(outputDir, { recursive: true });
  }
  writeFileSync(outputPath, JSON.stringify(result, null, 2), "utf-8");

  // Print summary
  const { summary } = result;
  const passRate = summary.pass_rate ?? 0;
  const meanSuffix =
    summary.mean_score != null ? ` | mean score: ${summary.mean_score.toFixed(2)}` : "";
  console.log(
    `\nResults: ${summary.passed}/${summary.total} passed (${Math.round(passRate * 100)}%)${meanSuffix}`,
  );

  for (const expectation of result.expectations ?? []) {
    const icon = expectation.passed ? "\u2713" : "\u2717";
    const scoreStr = expectation.score != null ? ` [${expectation.score.toFixed(1)}]` : "";
    const sourceStr = expectation.source ? ` (${expectation.source})` : "";
    const text = String(expectation.text ?? "").slice(0, 70);
    console.log(` ${icon}${scoreStr}${sourceStr} ${text}`);
    if (!expectation.passed) {
      // Show the grader's evidence for failures only.
      const evidence = String(expectation.evidence ?? "").slice(0, 100);
      console.log(` -> ${evidence}`);
    }
  }

  console.log(`\nWrote ${outputPath}`);
}
|
|
193
|
+
|
|
194
|
+
// Guard: only run the CLI when this file is invoked directly, not when it is
// imported by another module.
if (import.meta.main) {
  // Any rejection escaping cliMain() is reported and turned into exit status 1.
  const reportFatal = (err: unknown) => {
    console.error(`[FATAL] ${err}`);
    process.exit(1);
  };
  cliMain().catch(reportFatal);
}
|