selftune 0.2.23 → 0.2.25
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/README.md +93 -15
- package/apps/local-dashboard/dist/assets/index-DgY2KGP-.css +1 -0
- package/apps/local-dashboard/dist/assets/index-Dhgv5BQO.js +15 -0
- package/apps/local-dashboard/dist/assets/vendor-react-C5oyHiV1.js +11 -0
- package/apps/local-dashboard/dist/assets/{vendor-table-BIiI3YhS.js → vendor-table-Bc_bbKd8.js} +1 -1
- package/apps/local-dashboard/dist/assets/vendor-ui-B3BPIYy7.js +1 -0
- package/apps/local-dashboard/dist/index.html +5 -5
- package/cli/selftune/adapters/codex/install.ts +310 -78
- package/cli/selftune/adapters/opencode/install.ts +3 -4
- package/cli/selftune/alpha-upload/build-payloads.ts +3 -3
- package/cli/selftune/alpha-upload/stage-canonical.ts +17 -11
- package/cli/selftune/auto-update.ts +200 -8
- package/cli/selftune/canonical-export.ts +55 -25
- package/cli/selftune/command-surface.ts +397 -0
- package/cli/selftune/contribute/contribute.ts +64 -13
- package/cli/selftune/contribution-config.ts +57 -3
- package/cli/selftune/contribution-preferences.ts +117 -0
- package/cli/selftune/contribution-signals.ts +8 -4
- package/cli/selftune/contribution-staging.ts +13 -2
- package/cli/selftune/contributions.ts +55 -121
- package/cli/selftune/creator-contributions.ts +29 -10
- package/cli/selftune/cron/setup.ts +7 -3
- package/cli/selftune/dashboard-contract.ts +73 -0
- package/cli/selftune/dashboard-server.ts +168 -17
- package/cli/selftune/dashboard.ts +350 -17
- package/cli/selftune/eval/baseline.ts +21 -5
- package/cli/selftune/eval/execution-eval.ts +170 -0
- package/cli/selftune/eval/family-overlap.ts +2 -2
- package/cli/selftune/eval/hooks-to-evals.ts +228 -82
- package/cli/selftune/eval/import-skillsbench.ts +2 -2
- package/cli/selftune/eval/invocation-classifier.ts +56 -0
- package/cli/selftune/eval/synthetic-evals.ts +5 -3
- package/cli/selftune/eval/unit-test-cli.ts +7 -4
- package/cli/selftune/evolution/apply-proposal.ts +295 -0
- package/cli/selftune/evolution/engines/replay-engine.ts +79 -57
- package/cli/selftune/evolution/evolve-body.ts +100 -39
- package/cli/selftune/evolution/evolve.ts +244 -52
- package/cli/selftune/evolution/rollback.ts +0 -1
- package/cli/selftune/evolution/validate-body.ts +68 -42
- package/cli/selftune/evolution/validate-host-replay.ts +510 -60
- package/cli/selftune/evolution/validate-proposal.ts +11 -150
- package/cli/selftune/evolution/validate-routing.ts +43 -41
- package/cli/selftune/evolution/validation-contract.ts +91 -0
- package/cli/selftune/grading/auto-grade.ts +11 -7
- package/cli/selftune/grading/grade-session.ts +10 -16
- package/cli/selftune/index.ts +35 -10
- package/cli/selftune/ingestors/claude-replay.ts +15 -10
- package/cli/selftune/ingestors/codex-wrapper.ts +3 -3
- package/cli/selftune/ingestors/opencode-ingest.ts +2 -2
- package/cli/selftune/ingestors/pi-ingest.ts +3 -2
- package/cli/selftune/init.ts +27 -3
- package/cli/selftune/localdb/direct-write.ts +35 -1
- package/cli/selftune/localdb/queries/cron.ts +34 -0
- package/cli/selftune/localdb/queries/dashboard.ts +834 -0
- package/cli/selftune/localdb/queries/evolution.ts +158 -0
- package/cli/selftune/localdb/queries/execution.ts +133 -0
- package/cli/selftune/localdb/queries/json.ts +18 -0
- package/cli/selftune/localdb/queries/monitoring.ts +263 -0
- package/cli/selftune/localdb/queries/raw.ts +95 -0
- package/cli/selftune/localdb/queries/staging.ts +270 -0
- package/cli/selftune/localdb/queries/trust.ts +392 -0
- package/cli/selftune/localdb/queries.ts +60 -2288
- package/cli/selftune/localdb/schema.ts +21 -0
- package/cli/selftune/monitoring/watch.ts +96 -29
- package/cli/selftune/normalization.ts +3 -0
- package/cli/selftune/observability.ts +4 -2
- package/cli/selftune/orchestrate/cli.ts +161 -0
- package/cli/selftune/orchestrate/execute.ts +295 -0
- package/cli/selftune/orchestrate/finalize.ts +157 -0
- package/cli/selftune/orchestrate/locks.ts +40 -0
- package/cli/selftune/orchestrate/plan.ts +131 -0
- package/cli/selftune/orchestrate/post-run.ts +59 -0
- package/cli/selftune/orchestrate/prepare.ts +334 -0
- package/cli/selftune/orchestrate/report.ts +182 -0
- package/cli/selftune/orchestrate/runtime.ts +120 -0
- package/cli/selftune/orchestrate/signals.ts +48 -0
- package/cli/selftune/orchestrate.ts +150 -1173
- package/cli/selftune/repair/skill-usage.ts +5 -2
- package/cli/selftune/routes/overview.ts +5 -2
- package/cli/selftune/routes/skill-report.ts +15 -2
- package/cli/selftune/schedule.ts +5 -5
- package/cli/selftune/status.ts +39 -2
- package/cli/selftune/testing-readiness.ts +597 -0
- package/cli/selftune/types.ts +44 -4
- package/cli/selftune/uninstall.ts +2 -1
- package/cli/selftune/utils/canonical-log.ts +1 -9
- package/cli/selftune/utils/cli-error.ts +9 -0
- package/cli/selftune/utils/llm-call.ts +126 -6
- package/cli/selftune/utils/skill-discovery.ts +2 -0
- package/cli/selftune/workflows/proposals.ts +184 -0
- package/cli/selftune/workflows/skill-scaffold.ts +241 -0
- package/cli/selftune/workflows/workflows.ts +100 -26
- package/node_modules/@selftune/telemetry-contract/fixtures/complete-push.ts +1 -1
- package/node_modules/@selftune/telemetry-contract/fixtures/evidence-only-push.ts +1 -1
- package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
- package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +1 -1
- package/node_modules/@selftune/telemetry-contract/src/schemas.ts +41 -1
- package/node_modules/@selftune/telemetry-contract/src/types.ts +103 -2
- package/package.json +25 -9
- package/packages/dashboard-core/AGENTS.md +18 -0
- package/packages/dashboard-core/README.md +30 -0
- package/packages/dashboard-core/index.ts +3 -0
- package/packages/dashboard-core/package.json +39 -0
- package/packages/dashboard-core/src/chrome/DashboardChrome.tsx +74 -0
- package/packages/dashboard-core/src/chrome/DashboardHeader.tsx +200 -0
- package/packages/dashboard-core/src/chrome/DashboardSidebar.tsx +219 -0
- package/packages/dashboard-core/src/chrome/RuntimeBadge.tsx +46 -0
- package/packages/dashboard-core/src/chrome/index.ts +14 -0
- package/packages/dashboard-core/src/chrome/types.ts +81 -0
- package/packages/dashboard-core/src/chrome/utils.ts +23 -0
- package/packages/dashboard-core/src/gates/FeatureGate.tsx +11 -0
- package/packages/dashboard-core/src/gates/LockedRoute.tsx +29 -0
- package/packages/dashboard-core/src/gates/UpgradeCard.tsx +89 -0
- package/packages/dashboard-core/src/gates/index.ts +3 -0
- package/packages/dashboard-core/src/host/DashboardHostProvider.tsx +62 -0
- package/packages/dashboard-core/src/host/adapter.ts +47 -0
- package/packages/dashboard-core/src/host/capabilities.ts +55 -0
- package/packages/dashboard-core/src/host/index.ts +3 -0
- package/packages/dashboard-core/src/models/analytics.ts +39 -0
- package/packages/dashboard-core/src/models/index.ts +4 -0
- package/packages/dashboard-core/src/models/overview.ts +98 -0
- package/packages/dashboard-core/src/models/runtime.ts +7 -0
- package/packages/dashboard-core/src/models/skills.ts +34 -0
- package/packages/dashboard-core/src/routes/index.ts +2 -0
- package/packages/dashboard-core/src/routes/manifest.test.ts +70 -0
- package/packages/dashboard-core/src/routes/manifest.ts +451 -0
- package/packages/dashboard-core/src/routes/types.ts +39 -0
- package/packages/dashboard-core/src/screens/analytics/AnalyticsScreen.tsx +278 -0
- package/packages/dashboard-core/src/screens/analytics/index.ts +1 -0
- package/packages/dashboard-core/src/screens/index.ts +37 -0
- package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.test.ts +101 -0
- package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.tsx +393 -0
- package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.test.tsx +113 -0
- package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.tsx +72 -0
- package/packages/dashboard-core/src/screens/overview/OverviewCoreSurface.tsx +71 -0
- package/packages/dashboard-core/src/screens/overview/OverviewOnboardingBanner.tsx +90 -0
- package/packages/dashboard-core/src/screens/overview/OverviewRunSummary.tsx +40 -0
- package/packages/dashboard-core/src/screens/overview/index.ts +16 -0
- package/packages/dashboard-core/src/screens/overview/types.ts +13 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportDailyBreakdownSection.tsx +99 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportDataQualityTabContent.tsx +35 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceRail.tsx +71 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceSection.tsx +63 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceTabContent.tsx +25 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportInvocationsSection.tsx +24 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportMissedQueriesSection.tsx +79 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportScaffold.tsx +150 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportSections.test.tsx +224 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.test.tsx +76 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.tsx +88 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportTrendSection.tsx +33 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportTrustBadge.tsx +67 -0
- package/packages/dashboard-core/src/screens/skill-report/index.ts +45 -0
- package/packages/dashboard-core/src/screens/skills/SkillsLibraryScreen.tsx +162 -0
- package/packages/dashboard-core/src/screens/skills/index.ts +6 -0
- package/packages/telemetry-contract/fixtures/complete-push.ts +1 -1
- package/packages/telemetry-contract/fixtures/evidence-only-push.ts +1 -1
- package/packages/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
- package/packages/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +1 -1
- package/packages/telemetry-contract/src/schemas.ts +41 -1
- package/packages/telemetry-contract/src/types.ts +103 -2
- package/packages/ui/src/components/EvidenceViewer.tsx +80 -25
- package/packages/ui/src/components/OverviewPanels.tsx +67 -26
- package/packages/ui/src/primitives/tabs.tsx +7 -6
- package/packages/ui/src/types.ts +10 -0
- package/skill/SKILL.md +130 -332
- package/skill/agents/diagnosis-analyst.md +3 -3
- package/skill/agents/evolution-reviewer.md +3 -3
- package/skill/agents/integration-guide.md +3 -3
- package/skill/agents/pattern-analyst.md +2 -2
- package/skill/references/cli-quick-reference.md +89 -0
- package/skill/references/creator-playbook.md +131 -0
- package/skill/references/examples.md +48 -0
- package/skill/references/troubleshooting.md +47 -0
- package/skill/references/version-history.md +1 -1
- package/skill/selftune.contribute.json +11 -0
- package/skill/{Workflows → workflows}/Baseline.md +20 -1
- package/skill/{Workflows → workflows}/Contribute.md +23 -10
- package/skill/{Workflows → workflows}/Contributions.md +13 -5
- package/skill/workflows/CreateTestDeploy.md +170 -0
- package/skill/{Workflows → workflows}/CreatorContributions.md +18 -6
- package/skill/{Workflows → workflows}/Cron.md +1 -1
- package/skill/{Workflows → workflows}/Dashboard.md +20 -0
- package/skill/{Workflows → workflows}/Doctor.md +1 -1
- package/skill/{Workflows → workflows}/Evals.md +67 -2
- package/skill/{Workflows → workflows}/Evolve.md +119 -30
- package/skill/{Workflows → workflows}/EvolveBody.md +41 -1
- package/skill/{Workflows → workflows}/Grade.md +1 -1
- package/skill/{Workflows → workflows}/Initialize.md +8 -4
- package/skill/{Workflows → workflows}/Orchestrate.md +13 -3
- package/skill/{Workflows → workflows}/Schedule.md +3 -3
- package/skill/workflows/SignalsDashboard.md +87 -0
- package/skill/{Workflows → workflows}/UnitTest.md +19 -0
- package/skill/{Workflows → workflows}/Watch.md +42 -2
- package/skill/{Workflows → workflows}/Workflows.md +39 -2
- package/apps/local-dashboard/dist/assets/index-CwOtTrUS.css +0 -1
- package/apps/local-dashboard/dist/assets/index-f1HQpbeH.js +0 -59
- package/apps/local-dashboard/dist/assets/vendor-react-CKkiCskZ.js +0 -11
- package/apps/local-dashboard/dist/assets/vendor-ui-jVSaIZey.js +0 -12
- /package/skill/{Workflows → workflows}/AlphaUpload.md +0 -0
- /package/skill/{Workflows → workflows}/AutoActivation.md +0 -0
- /package/skill/{Workflows → workflows}/Badge.md +0 -0
- /package/skill/{Workflows → workflows}/Composability.md +0 -0
- /package/skill/{Workflows → workflows}/EvolutionMemory.md +0 -0
- /package/skill/{Workflows → workflows}/ExportCanonical.md +0 -0
- /package/skill/{Workflows → workflows}/Hook.md +0 -0
- /package/skill/{Workflows → workflows}/ImportSkillsBench.md +0 -0
- /package/skill/{Workflows → workflows}/Ingest.md +0 -0
- /package/skill/{Workflows → workflows}/PlatformHooks.md +0 -0
- /package/skill/{Workflows → workflows}/Quickstart.md +0 -0
- /package/skill/{Workflows → workflows}/Recover.md +0 -0
- /package/skill/{Workflows → workflows}/Registry.md +0 -0
- /package/skill/{Workflows → workflows}/RepairSkillUsage.md +0 -0
- /package/skill/{Workflows → workflows}/Replay.md +0 -0
- /package/skill/{Workflows → workflows}/Rollback.md +0 -0
- /package/skill/{Workflows → workflows}/Sync.md +0 -0
- /package/skill/{Workflows → workflows}/Telemetry.md +0 -0
- /package/skill/{Workflows → workflows}/Uninstall.md +0 -0
|
@@ -23,6 +23,7 @@
|
|
|
23
23
|
import { writeFileSync } from "node:fs";
|
|
24
24
|
import { parseArgs } from "node:util";
|
|
25
25
|
|
|
26
|
+
import { PUBLIC_COMMAND_SURFACES, renderCommandHelp } from "../command-surface.js";
|
|
26
27
|
import { GENERIC_NEGATIVES, QUERY_LOG, SKILL_LOG, TELEMETRY_LOG } from "../constants.js";
|
|
27
28
|
import { getDb } from "../localdb/db.js";
|
|
28
29
|
import {
|
|
@@ -32,27 +33,31 @@ import {
|
|
|
32
33
|
} from "../localdb/queries.js";
|
|
33
34
|
import type {
|
|
34
35
|
EvalEntry,
|
|
35
|
-
|
|
36
|
+
EvalSourceStats,
|
|
36
37
|
QueryLogRecord,
|
|
37
38
|
SessionTelemetryRecord,
|
|
38
39
|
SkillUsageRecord,
|
|
39
40
|
} from "../types.js";
|
|
40
41
|
import { CLIError, handleCLIError } from "../utils/cli-error.js";
|
|
41
|
-
import {
|
|
42
|
+
import { detectLlmAgent } from "../utils/llm-call.js";
|
|
42
43
|
import {
|
|
43
44
|
filterActionableQueryRecords,
|
|
44
45
|
filterActionableSkillUsageRecords,
|
|
45
46
|
} from "../utils/query-filter.js";
|
|
46
47
|
import { seededShuffle } from "../utils/seeded-random.js";
|
|
47
48
|
import {
|
|
48
|
-
escapeRegExp,
|
|
49
49
|
findInstalledSkillNames,
|
|
50
50
|
findInstalledSkillPath,
|
|
51
51
|
findRepositoryClaudeSkillDirs,
|
|
52
52
|
findRepositorySkillDirs,
|
|
53
53
|
} from "../utils/skill-discovery.js";
|
|
54
54
|
import { isHighConfidencePositiveSkillRecord } from "../utils/skill-usage-confidence.js";
|
|
55
|
+
import { readJsonl } from "../utils/jsonl.js";
|
|
56
|
+
import { classifyInvocation } from "./invocation-classifier.js";
|
|
55
57
|
import { generateSyntheticEvals } from "./synthetic-evals.js";
|
|
58
|
+
import { writeCanonicalEvalSet } from "../testing-readiness.js";
|
|
59
|
+
|
|
60
|
+
export { classifyInvocation } from "./invocation-classifier.js";
|
|
56
61
|
|
|
57
62
|
// ---------------------------------------------------------------------------
|
|
58
63
|
// Query truncation
|
|
@@ -64,69 +69,6 @@ function truncateQuery(query: string): string {
|
|
|
64
69
|
return query.length > MAX_QUERY_LENGTH ? query.slice(0, MAX_QUERY_LENGTH) : query;
|
|
65
70
|
}
|
|
66
71
|
|
|
67
|
-
// ---------------------------------------------------------------------------
|
|
68
|
-
// Invocation taxonomy classifier
|
|
69
|
-
// ---------------------------------------------------------------------------
|
|
70
|
-
|
|
71
|
-
export function classifyInvocation(query: string, skillName: string): InvocationType {
|
|
72
|
-
const qLower = query.toLowerCase();
|
|
73
|
-
const skillLower = skillName.toLowerCase();
|
|
74
|
-
|
|
75
|
-
// --- Explicit checks ---
|
|
76
|
-
|
|
77
|
-
// Explicit: mentions skill name or $skill syntax
|
|
78
|
-
if (
|
|
79
|
-
qLower.includes(`$${skillLower}`) ||
|
|
80
|
-
query.includes(`$${skillName}`) ||
|
|
81
|
-
qLower.includes(skillLower)
|
|
82
|
-
) {
|
|
83
|
-
return "explicit";
|
|
84
|
-
}
|
|
85
|
-
|
|
86
|
-
// Handle hyphenated skill names: check if all parts appear
|
|
87
|
-
if (skillLower.includes("-")) {
|
|
88
|
-
const parts = skillLower.split("-");
|
|
89
|
-
if (parts.every((part) => new RegExp(`\\b${escapeRegExp(part)}\\b`, "i").test(query))) {
|
|
90
|
-
return "explicit";
|
|
91
|
-
}
|
|
92
|
-
}
|
|
93
|
-
|
|
94
|
-
// Convert skill-name to camelCase and check
|
|
95
|
-
const camelCase = skillLower.replace(/-([a-z])/g, (_, c) => c.toUpperCase());
|
|
96
|
-
if (camelCase !== skillLower && qLower.includes(camelCase.toLowerCase())) {
|
|
97
|
-
return "explicit";
|
|
98
|
-
}
|
|
99
|
-
|
|
100
|
-
// --- Contextual checks ---
|
|
101
|
-
|
|
102
|
-
const wordCount = query.split(/\s+/).length;
|
|
103
|
-
const hasProperNoun = /\b[A-Z][a-z]{2,}\b/.test(query);
|
|
104
|
-
|
|
105
|
-
// Temporal references suggest domain context
|
|
106
|
-
const hasTemporalRef =
|
|
107
|
-
/\b(next week|last week|tomorrow|yesterday|Q[1-4]|monday|tuesday|wednesday|thursday|friday|january|february|march|april|may|june|july|august|september|october|november|december)\b/i.test(
|
|
108
|
-
query,
|
|
109
|
-
);
|
|
110
|
-
|
|
111
|
-
// Filenames suggest contextual usage
|
|
112
|
-
const hasFilename = /\b\w+\.\w{2,4}\b/.test(query);
|
|
113
|
-
|
|
114
|
-
// Email addresses suggest contextual usage
|
|
115
|
-
const hasEmail = /\b\S+@\S+\.\S+\b/.test(query);
|
|
116
|
-
|
|
117
|
-
if (wordCount > 15 || hasProperNoun || hasTemporalRef || hasFilename || hasEmail) {
|
|
118
|
-
return "contextual";
|
|
119
|
-
}
|
|
120
|
-
|
|
121
|
-
// Borderline: 10-15 words with domain signals (multi-digit numbers, uppercase acronyms)
|
|
122
|
-
const hasDomainSignal = /\b\d{2,}\b/.test(query) || /[A-Z]{2,}/.test(query);
|
|
123
|
-
if (wordCount >= 10 && hasDomainSignal) {
|
|
124
|
-
return "contextual";
|
|
125
|
-
}
|
|
126
|
-
|
|
127
|
-
return "implicit";
|
|
128
|
-
}
|
|
129
|
-
|
|
130
72
|
// ---------------------------------------------------------------------------
|
|
131
73
|
// Build eval set
|
|
132
74
|
// ---------------------------------------------------------------------------
|
|
@@ -144,6 +86,7 @@ export function buildEvalSet(
|
|
|
144
86
|
const actionableQueryRecords = filterActionableQueryRecords(queryRecords);
|
|
145
87
|
const effectiveMaxPerSide = Number.isNaN(maxPerSide) || maxPerSide <= 0 ? 50 : maxPerSide;
|
|
146
88
|
const effectiveSeed = Number.isNaN(seed) ? 42 : seed;
|
|
89
|
+
const buildTimestamp = new Date().toISOString();
|
|
147
90
|
|
|
148
91
|
// Build set of positive query texts (for exclusion from negatives)
|
|
149
92
|
const positiveQueries = new Set<string>();
|
|
@@ -166,7 +109,12 @@ export function buildEvalSet(
|
|
|
166
109
|
const q = (r.query ?? "").trim();
|
|
167
110
|
if (!q || q === "(query not found)" || seen.has(q)) continue;
|
|
168
111
|
seen.add(q);
|
|
169
|
-
const entry: EvalEntry = {
|
|
112
|
+
const entry: EvalEntry = {
|
|
113
|
+
query: truncateQuery(q),
|
|
114
|
+
should_trigger: true,
|
|
115
|
+
source: "log",
|
|
116
|
+
created_at: buildTimestamp,
|
|
117
|
+
};
|
|
170
118
|
if (annotateTaxonomy) {
|
|
171
119
|
entry.invocation_type = classifyInvocation(q, skillName);
|
|
172
120
|
}
|
|
@@ -189,7 +137,12 @@ export function buildEvalSet(
|
|
|
189
137
|
|
|
190
138
|
const shuffledNeg = seededShuffle(negCandidates, effectiveSeed).slice(0, effectiveMaxPerSide);
|
|
191
139
|
negatives = shuffledNeg.map((q) => {
|
|
192
|
-
const entry: EvalEntry = {
|
|
140
|
+
const entry: EvalEntry = {
|
|
141
|
+
query: truncateQuery(q),
|
|
142
|
+
should_trigger: false,
|
|
143
|
+
source: "log",
|
|
144
|
+
created_at: buildTimestamp,
|
|
145
|
+
};
|
|
193
146
|
if (annotateTaxonomy) {
|
|
194
147
|
entry.invocation_type = "negative";
|
|
195
148
|
}
|
|
@@ -202,7 +155,12 @@ export function buildEvalSet(
|
|
|
202
155
|
const fallbacks: EvalEntry[] = [];
|
|
203
156
|
for (const q of GENERIC_NEGATIVES) {
|
|
204
157
|
if (negSeen.has(q) || positiveQueries.has(q)) continue;
|
|
205
|
-
const entry: EvalEntry = {
|
|
158
|
+
const entry: EvalEntry = {
|
|
159
|
+
query: q,
|
|
160
|
+
should_trigger: false,
|
|
161
|
+
source: "log",
|
|
162
|
+
created_at: buildTimestamp,
|
|
163
|
+
};
|
|
206
164
|
if (annotateTaxonomy) {
|
|
207
165
|
entry.invocation_type = "negative";
|
|
208
166
|
}
|
|
@@ -215,6 +173,116 @@ export function buildEvalSet(
|
|
|
215
173
|
return [...shuffledPositives, ...negatives];
|
|
216
174
|
}
|
|
217
175
|
|
|
176
|
+
// ---------------------------------------------------------------------------
|
|
177
|
+
// Normalized Levenshtein distance
|
|
178
|
+
// ---------------------------------------------------------------------------
|
|
179
|
+
|
|
180
|
+
/**
 * Compute the Levenshtein (edit) distance between two strings.
 *
 * The distance is the minimum number of single UTF-16 code unit insertions,
 * deletions, and substitutions needed to turn `a` into `b`.
 *
 * Uses the two-row dynamic-programming formulation. Because the distance is
 * symmetric, the shorter string is placed on the row axis so the working
 * memory is O(min(a.length, b.length)).
 *
 * @param a - First string.
 * @param b - Second string.
 * @returns The edit distance (0 when the strings are identical).
 */
function levenshteinDistance(a: string, b: string): number {
  if (a === b) return 0;

  // Symmetric metric: iterate over the longer string, keep rows sized by the shorter one.
  const [longer, shorter] = a.length >= b.length ? [a, b] : [b, a];
  const rowCount = longer.length;
  const colCount = shorter.length;

  // Turning a string into the empty string costs one deletion per code unit.
  if (colCount === 0) return rowCount;

  // prev[j] = distance between the first (i - 1) units of `longer` and the first j units of `shorter`.
  let prev = Array.from({ length: colCount + 1 }, (_, j) => j);
  let curr = new Array<number>(colCount + 1).fill(0);

  for (let i = 1; i <= rowCount; i++) {
    curr[0] = i;
    const unit = longer.charCodeAt(i - 1);
    for (let j = 1; j <= colCount; j++) {
      const cost = unit === shorter.charCodeAt(j - 1) ? 0 : 1;
      curr[j] = Math.min(
        prev[j] + 1, // deletion
        curr[j - 1] + 1, // insertion
        prev[j - 1] + cost, // substitution (or match)
      );
    }
    // Reuse the two buffers instead of allocating a new row per iteration.
    [prev, curr] = [curr, prev];
  }

  return prev[colCount];
}
|
|
203
|
+
|
|
204
|
+
/**
 * Edit distance between `a` and `b` divided by the length of the longer string.
 *
 * Returns 0 when both strings are empty (avoids dividing by zero); otherwise
 * 0 means identical and larger values mean more dissimilar.
 */
function normalizedLevenshtein(a: string, b: string): number {
  const longest = a.length > b.length ? a.length : b.length;
  return longest === 0 ? 0 : levenshteinDistance(a, b) / longest;
}
|
|
209
|
+
|
|
210
|
+
// ---------------------------------------------------------------------------
|
|
211
|
+
// Blend eval sets (log + synthetic)
|
|
212
|
+
// ---------------------------------------------------------------------------
|
|
213
|
+
|
|
214
|
+
/**
 * Blend log-based and synthetic eval entries.
 *
 * Policy:
 * - Keep ALL log-based entries, in their original order.
 * - Append synthetic entries that are not near-duplicates of any log entry:
 *   a synthetic entry is dropped when the normalized Levenshtein distance
 *   between its query and some log query (both lower-cased and trimmed) is < 0.3.
 * - Surviving synthetic entries are copied with `source: "blended"`.
 * - The total is capped at 2x the log-based count, so at most
 *   `logEntries.length` synthetic entries are added.
 *
 * With no log entries the cap is 0 and the result is empty; with no synthetic
 * entries the result is a copy of `logEntries`.
 *
 * @param logEntries - Entries built from real usage logs.
 * @param syntheticEntries - Generated entries considered as gap-fillers.
 * @returns A new array; the inputs are not mutated.
 */
export function blendEvalSets(logEntries: EvalEntry[], syntheticEntries: EvalEntry[]): EvalEntry[] {
  const SIMILARITY_THRESHOLD = 0.3;
  const logCount = logEntries.length;
  const cap = logCount * 2;

  if (logCount === 0 || syntheticEntries.length === 0) {
    return logEntries.slice(0, cap);
  }

  // Normalize log queries once for comparison.
  const logQueries = logEntries.map((e) => e.query.toLowerCase().trim());

  const isNearDuplicateOfLog = (candidate: string): boolean =>
    logQueries.some((logQ) => {
      const maxLen = Math.max(candidate.length, logQ.length);
      // The edit distance is never smaller than the length difference, so once the
      // length gap alone reaches the threshold the pair cannot be "too similar".
      // Skipping here avoids the quadratic distance computation without changing results.
      if (
        maxLen > 0 &&
        Math.abs(candidate.length - logQ.length) / maxLen >= SIMILARITY_THRESHOLD
      ) {
        return false;
      }
      return normalizedLevenshtein(candidate, logQ) < SIMILARITY_THRESHOLD;
    });

  // Collect survivors in input order, stopping as soon as the cap is filled.
  const slotsAvailable = cap - logCount;
  const survivors: EvalEntry[] = [];
  for (const synth of syntheticEntries) {
    if (survivors.length >= slotsAvailable) break;
    if (!isNearDuplicateOfLog(synth.query.toLowerCase().trim())) {
      survivors.push({ ...synth, source: "blended" });
    }
  }

  return [...logEntries, ...survivors];
}
|
|
261
|
+
|
|
262
|
+
// ---------------------------------------------------------------------------
|
|
263
|
+
// Eval source stats
|
|
264
|
+
// ---------------------------------------------------------------------------
|
|
265
|
+
|
|
266
|
+
/**
 * Summarize an eval set by entry source and creation time.
 *
 * Counts entries whose `source` is "synthetic", "log", or "blended" (any other
 * or missing source only contributes to `total`). `oldest` and `newest` are the
 * smallest and largest non-empty `created_at` strings under plain string
 * comparison; both are left unset when no entry carries a timestamp.
 *
 * @param entries - Eval entries to summarize.
 * @returns Per-source counts plus the oldest/newest timestamps when available.
 */
export function computeEvalSourceStats(entries: EvalEntry[]): EvalSourceStats {
  const stats: EvalSourceStats = { total: entries.length, synthetic: 0, log: 0, blended: 0 };
  let oldest: string | undefined;
  let newest: string | undefined;

  for (const entry of entries) {
    switch (entry.source) {
      case "synthetic":
        stats.synthetic++;
        break;
      case "log":
        stats.log++;
        break;
      case "blended":
        stats.blended++;
        break;
      default:
        break;
    }

    const createdAt = entry.created_at;
    if (!createdAt) continue;

    // Track min/max in one pass instead of collecting and sorting every timestamp.
    if (oldest === undefined || createdAt < oldest) oldest = createdAt;
    if (newest === undefined || createdAt > newest) newest = createdAt;
  }

  if (oldest !== undefined && newest !== undefined) {
    stats.oldest = oldest;
    stats.newest = newest;
  }

  return stats;
}
|
|
285
|
+
|
|
218
286
|
// ---------------------------------------------------------------------------
|
|
219
287
|
// Installed skill discovery / readiness
|
|
220
288
|
// ---------------------------------------------------------------------------
|
|
@@ -505,33 +573,40 @@ export async function cliMain(): Promise<void> {
|
|
|
505
573
|
"telemetry-log": { type: "string", default: TELEMETRY_LOG },
|
|
506
574
|
synthetic: { type: "boolean", default: false },
|
|
507
575
|
"auto-synthetic": { type: "boolean", default: false },
|
|
576
|
+
blend: { type: "boolean", default: false },
|
|
508
577
|
"skill-path": { type: "string" },
|
|
509
578
|
model: { type: "string" },
|
|
579
|
+
help: { type: "boolean", default: false },
|
|
510
580
|
},
|
|
511
581
|
strict: true,
|
|
512
582
|
});
|
|
513
583
|
|
|
584
|
+
if (values.help) {
|
|
585
|
+
console.log(renderCommandHelp(PUBLIC_COMMAND_SURFACES.evalGenerate));
|
|
586
|
+
process.exit(0);
|
|
587
|
+
}
|
|
588
|
+
|
|
514
589
|
// --- Synthetic mode: generate evals from SKILL.md via LLM ---
|
|
515
590
|
if (values.synthetic) {
|
|
516
591
|
if (!values.skill) {
|
|
517
592
|
throw new CLIError(
|
|
518
593
|
"--skill required with --synthetic",
|
|
519
594
|
"MISSING_FLAG",
|
|
520
|
-
"selftune
|
|
595
|
+
"selftune eval generate --synthetic --skill <name> --skill-path <path>",
|
|
521
596
|
);
|
|
522
597
|
}
|
|
523
598
|
if (!values["skill-path"]) {
|
|
524
599
|
throw new CLIError(
|
|
525
600
|
"--skill-path required with --synthetic",
|
|
526
601
|
"MISSING_FLAG",
|
|
527
|
-
"selftune
|
|
602
|
+
"selftune eval generate --synthetic --skill <name> --skill-path <path>",
|
|
528
603
|
);
|
|
529
604
|
}
|
|
530
605
|
|
|
531
|
-
const agent =
|
|
606
|
+
const agent = detectLlmAgent();
|
|
532
607
|
if (!agent) {
|
|
533
608
|
throw new CLIError(
|
|
534
|
-
"No agent CLI found (claude/codex/opencode)",
|
|
609
|
+
"No agent CLI found (claude/codex/opencode/pi)",
|
|
535
610
|
"AGENT_NOT_FOUND",
|
|
536
611
|
"Install one of the supported agent CLIs",
|
|
537
612
|
);
|
|
@@ -549,11 +624,13 @@ export async function cliMain(): Promise<void> {
|
|
|
549
624
|
|
|
550
625
|
const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`;
|
|
551
626
|
writeFileSync(outputPath, JSON.stringify(evalSet, null, 2), "utf-8");
|
|
627
|
+
const canonicalPath = writeCanonicalEvalSet(values.skill, evalSet);
|
|
552
628
|
|
|
553
629
|
const pos = evalSet.filter((e) => e.should_trigger);
|
|
554
630
|
const neg = evalSet.filter((e) => !e.should_trigger);
|
|
555
631
|
|
|
556
632
|
console.log(`Wrote ${evalSet.length} synthetic eval entries to ${outputPath}`);
|
|
633
|
+
console.log(`Canonical eval copy: ${canonicalPath}`);
|
|
557
634
|
console.log(` Positives (should_trigger=true) : ${pos.length}`);
|
|
558
635
|
console.log(` Negatives (should_trigger=false): ${neg.length}`);
|
|
559
636
|
|
|
@@ -582,10 +659,23 @@ export async function cliMain(): Promise<void> {
|
|
|
582
659
|
let queryRecords: QueryLogRecord[];
|
|
583
660
|
let telemetryRecords: SessionTelemetryRecord[];
|
|
584
661
|
|
|
585
|
-
const
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
662
|
+
const skillLogPath = values["skill-log"] ?? SKILL_LOG;
|
|
663
|
+
const queryLogPath = values["query-log"] ?? QUERY_LOG;
|
|
664
|
+
const telemetryLogPath = values["telemetry-log"] ?? TELEMETRY_LOG;
|
|
665
|
+
const hasCustomSkillLog = skillLogPath !== SKILL_LOG;
|
|
666
|
+
const hasCustomQueryLog = queryLogPath !== QUERY_LOG;
|
|
667
|
+
const hasCustomTelemetryLog = telemetryLogPath !== TELEMETRY_LOG;
|
|
668
|
+
|
|
669
|
+
const db = hasCustomSkillLog && hasCustomQueryLog && hasCustomTelemetryLog ? undefined : getDb();
|
|
670
|
+
skillRecords = hasCustomSkillLog
|
|
671
|
+
? readJsonl<SkillUsageRecord>(skillLogPath)
|
|
672
|
+
: (querySkillUsageRecords(db!) as SkillUsageRecord[]);
|
|
673
|
+
queryRecords = hasCustomQueryLog
|
|
674
|
+
? readJsonl<QueryLogRecord>(queryLogPath)
|
|
675
|
+
: (queryQueryLog(db!) as QueryLogRecord[]);
|
|
676
|
+
telemetryRecords = hasCustomTelemetryLog
|
|
677
|
+
? readJsonl<SessionTelemetryRecord>(telemetryLogPath)
|
|
678
|
+
: (querySessionTelemetry(db!) as SessionTelemetryRecord[]);
|
|
589
679
|
|
|
590
680
|
if (values["list-skills"]) {
|
|
591
681
|
listSkills(skillRecords, queryRecords, telemetryRecords);
|
|
@@ -596,7 +686,7 @@ export async function cliMain(): Promise<void> {
|
|
|
596
686
|
throw new CLIError(
|
|
597
687
|
"--skill required (or use --list-skills)",
|
|
598
688
|
"MISSING_FLAG",
|
|
599
|
-
"selftune
|
|
689
|
+
"selftune eval generate --skill <name> or selftune eval generate --list-skills",
|
|
600
690
|
);
|
|
601
691
|
}
|
|
602
692
|
|
|
@@ -632,10 +722,10 @@ export async function cliMain(): Promise<void> {
|
|
|
632
722
|
);
|
|
633
723
|
}
|
|
634
724
|
|
|
635
|
-
const agent =
|
|
725
|
+
const agent = detectLlmAgent();
|
|
636
726
|
if (!agent) {
|
|
637
727
|
throw new CLIError(
|
|
638
|
-
"No agent CLI found (claude/codex/opencode)",
|
|
728
|
+
"No agent CLI found (claude/codex/opencode/pi)",
|
|
639
729
|
"AGENT_NOT_FOUND",
|
|
640
730
|
"Install one of the supported agent CLIs",
|
|
641
731
|
);
|
|
@@ -652,10 +742,12 @@ export async function cliMain(): Promise<void> {
|
|
|
652
742
|
});
|
|
653
743
|
const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`;
|
|
654
744
|
writeFileSync(outputPath, JSON.stringify(syntheticEvalSet, null, 2), "utf-8");
|
|
745
|
+
const canonicalPath = writeCanonicalEvalSet(values.skill, syntheticEvalSet);
|
|
655
746
|
const pos = syntheticEvalSet.filter((e) => e.should_trigger);
|
|
656
747
|
const neg = syntheticEvalSet.filter((e) => !e.should_trigger);
|
|
657
748
|
|
|
658
749
|
console.log(`Wrote ${syntheticEvalSet.length} synthetic eval entries to ${outputPath}`);
|
|
750
|
+
console.log(`Canonical eval copy: ${canonicalPath}`);
|
|
659
751
|
console.log(` Positives (should_trigger=true) : ${pos.length}`);
|
|
660
752
|
console.log(` Negatives (should_trigger=false): ${neg.length}`);
|
|
661
753
|
console.log("\nNext steps:");
|
|
@@ -666,9 +758,63 @@ export async function cliMain(): Promise<void> {
|
|
|
666
758
|
return;
|
|
667
759
|
}
|
|
668
760
|
|
|
761
|
+
// --- Blend mode: merge log-based evals with synthetic gap-fillers ---
|
|
762
|
+
let finalEvalSet = evalSet;
|
|
763
|
+
if (values.blend) {
|
|
764
|
+
const skillPath = values["skill-path"] ?? detectedSkillPath;
|
|
765
|
+
if (!skillPath) {
|
|
766
|
+
throw new CLIError(
|
|
767
|
+
`--blend requires a resolvable SKILL.md path. Use --skill-path or install the skill locally.`,
|
|
768
|
+
"MISSING_FLAG",
|
|
769
|
+
`selftune eval generate --skill ${values.skill} --blend --skill-path /path/to/SKILL.md`,
|
|
770
|
+
);
|
|
771
|
+
}
|
|
772
|
+
|
|
773
|
+
const agent = detectLlmAgent();
|
|
774
|
+
if (!agent) {
|
|
775
|
+
throw new CLIError(
|
|
776
|
+
"No agent CLI found (claude/codex/opencode/pi)",
|
|
777
|
+
"AGENT_NOT_FOUND",
|
|
778
|
+
"Install one of the supported agent CLIs",
|
|
779
|
+
);
|
|
780
|
+
}
|
|
781
|
+
|
|
782
|
+
// Fail fast before expensive LLM calls — blending with zero logs always produces []
|
|
783
|
+
if (evalSet.length === 0) {
|
|
784
|
+
throw new CLIError(
|
|
785
|
+
`--blend requires log-based eval entries to blend with synthetic entries. No log data found for skill "${values.skill}".`,
|
|
786
|
+
"BLEND_NO_LOGS",
|
|
787
|
+
`Use --synthetic instead for cold-start skills, or run selftune sync first to ingest session data.`,
|
|
788
|
+
);
|
|
789
|
+
}
|
|
790
|
+
|
|
791
|
+
const effectiveMax = Number.isNaN(maxPerSide) || maxPerSide <= 0 ? 50 : maxPerSide;
|
|
792
|
+
console.log(`Generating synthetic evals for blending with '${values.skill}'...`);
|
|
793
|
+
const syntheticEvalSet = await generateSyntheticEvals(skillPath, values.skill, agent, {
|
|
794
|
+
maxPositives: effectiveMax,
|
|
795
|
+
maxNegatives: effectiveMax,
|
|
796
|
+
modelFlag: values.model,
|
|
797
|
+
});
|
|
798
|
+
|
|
799
|
+
finalEvalSet = blendEvalSets(evalSet, syntheticEvalSet);
|
|
800
|
+
const stats = computeEvalSourceStats(finalEvalSet);
|
|
801
|
+
console.log(
|
|
802
|
+
`Blended: ${stats.log} log + ${stats.blended} synthetic gap-fillers = ${stats.total} total`,
|
|
803
|
+
);
|
|
804
|
+
}
|
|
805
|
+
|
|
669
806
|
const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`;
|
|
670
|
-
writeFileSync(outputPath, JSON.stringify(
|
|
671
|
-
|
|
807
|
+
writeFileSync(outputPath, JSON.stringify(finalEvalSet, null, 2), "utf-8");
|
|
808
|
+
const canonicalPath = writeCanonicalEvalSet(values.skill, finalEvalSet);
|
|
809
|
+
printEvalStats(
|
|
810
|
+
finalEvalSet,
|
|
811
|
+
values.skill,
|
|
812
|
+
outputPath,
|
|
813
|
+
skillRecords,
|
|
814
|
+
queryRecords,
|
|
815
|
+
annotateTaxonomy,
|
|
816
|
+
);
|
|
817
|
+
console.log(`Canonical eval copy: ${canonicalPath}`);
|
|
672
818
|
if (positiveCount === 0 && detectedSkillPath) {
|
|
673
819
|
printSyntheticFallbackHint(values.skill, detectedSkillPath);
|
|
674
820
|
}
|
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
* <dir>/tasks/<task-id>/task.toml — metadata (difficulty, category, tags, etc.)
|
|
11
11
|
*/
|
|
12
12
|
|
|
13
|
-
import { existsSync, readdirSync, readFileSync, writeFileSync } from "node:fs";
|
|
13
|
+
import { existsSync, readdirSync, readFileSync, type Dirent, writeFileSync } from "node:fs";
|
|
14
14
|
import { join } from "node:path";
|
|
15
15
|
import { parseArgs } from "node:util";
|
|
16
16
|
|
|
@@ -72,7 +72,7 @@ export function parseSkillsBenchDir(dirPath: string): SkillsBenchTask[] {
|
|
|
72
72
|
|
|
73
73
|
const tasks: SkillsBenchTask[] = [];
|
|
74
74
|
|
|
75
|
-
let entries:
|
|
75
|
+
let entries: Dirent[];
|
|
76
76
|
try {
|
|
77
77
|
entries = readdirSync(tasksDir, { withFileTypes: true });
|
|
78
78
|
} catch {
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import type { InvocationType } from "../types.js";
|
|
2
|
+
import { escapeRegExp } from "../utils/skill-discovery.js";
|
|
3
|
+
|
|
4
|
+
/**
 * Classify how directly a user query invokes a skill.
 *
 * Kept separate from eval generation so synthetic evals can reuse the
 * classifier without creating an import cycle with hooks-to-evals.
 *
 * Classification order:
 *  1. "explicit"   - the query names the skill: verbatim (case-insensitive,
 *                    which also covers `$skill` syntax), as all hyphen-separated
 *                    parts appearing as whole words, or with hyphens collapsed
 *                    (the lower-cased camelCase form).
 *  2. "contextual" - the query is long (> 15 words) or carries concrete context:
 *                    a capitalised word, a temporal reference, a filename-like
 *                    token, or an email-like token; or it has >= 10 words plus a
 *                    multi-digit number or an upper-case run.
 *  3. "implicit"   - everything else.
 *
 * @param query - Raw user query text.
 * @param skillName - Skill identifier, possibly hyphenated (e.g. "pdf-export").
 * @returns The invocation type for the query/skill pair.
 */
export function classifyInvocation(query: string, skillName: string): InvocationType {
  const qLower = query.toLowerCase();
  const skillLower = skillName.toLowerCase();

  // An empty skill name can never be "mentioned". Without this guard
  // `includes("")` is true for every query and everything becomes "explicit".
  if (skillLower.length > 0) {
    // Verbatim mention. `$skill` syntax contains the bare name, so a single
    // case-insensitive substring test covers it as well.
    if (qLower.includes(skillLower)) {
      return "explicit";
    }

    if (skillLower.includes("-")) {
      // Hyphenated skill names: every non-empty part must appear as a whole word.
      // Empty parts (from leading/trailing/double hyphens) are dropped so a name
      // made only of hyphens cannot match arbitrary queries.
      const parts = skillLower.split("-").filter((part) => part.length > 0);
      if (
        parts.length > 0 &&
        parts.every((part) => new RegExp(`\\b${escapeRegExp(part)}\\b`, "i").test(query))
      ) {
        return "explicit";
      }

      // camelCase form of the name, compared case-insensitively: this is the
      // lower-cased name with each "-<letter>" collapsed to "<letter>".
      const collapsed = skillLower.replace(/-([a-z])/g, "$1");
      if (collapsed !== skillLower && qLower.includes(collapsed)) {
        return "explicit";
      }
    }
  }

  // Count real words only: trimming first avoids phantom empty tokens from
  // leading/trailing whitespace, and an empty query counts as zero words.
  const trimmed = query.trim();
  const wordCount = trimmed === "" ? 0 : trimmed.split(/\s+/).length;

  const hasProperNoun = /\b[A-Z][a-z]{2,}\b/.test(query);
  // NOTE(review): "may" and "march" are matched case-insensitively, so the
  // ordinary words also count as temporal references - confirm this is intended.
  const hasTemporalRef =
    /\b(next week|last week|tomorrow|yesterday|Q[1-4]|monday|tuesday|wednesday|thursday|friday|saturday|sunday|january|february|march|april|may|june|july|august|september|october|november|december)\b/i.test(
      query,
    );
  const hasFilename = /\b\w+\.\w{2,4}\b/.test(query);
  const hasEmail = /\b\S+@\S+\.\S+\b/.test(query);

  if (wordCount > 15 || hasProperNoun || hasTemporalRef || hasFilename || hasEmail) {
    return "contextual";
  }

  // Medium-length queries count as contextual only when they also carry a
  // domain signal: a number with 2+ digits or a run of 2+ capital letters.
  const hasDomainSignal = /\b\d{2,}\b/.test(query) || /[A-Z]{2,}/.test(query);
  if (wordCount >= 10 && hasDomainSignal) {
    return "contextual";
  }

  return "implicit";
}
|
|
@@ -8,10 +8,10 @@
|
|
|
8
8
|
|
|
9
9
|
import { readFileSync } from "node:fs";
|
|
10
10
|
|
|
11
|
-
import type { EvalEntry, InvocationType } from "../types.js";
|
|
11
|
+
import type { EvalEntry, InvocationType, SkillUsageRecord } from "../types.js";
|
|
12
12
|
import { callLlm, stripMarkdownFences } from "../utils/llm-call.js";
|
|
13
13
|
import { findInstalledSkillNames } from "../utils/skill-discovery.js";
|
|
14
|
-
import { classifyInvocation } from "./
|
|
14
|
+
import { classifyInvocation } from "./invocation-classifier.js";
|
|
15
15
|
|
|
16
16
|
// ---------------------------------------------------------------------------
|
|
17
17
|
// Types
|
|
@@ -414,6 +414,8 @@ export function parseSyntheticResponse(raw: string, skillName: string): EvalEntr
|
|
|
414
414
|
query,
|
|
415
415
|
should_trigger: entry.should_trigger,
|
|
416
416
|
invocation_type: invocationType,
|
|
417
|
+
source: "synthetic",
|
|
418
|
+
created_at: new Date().toISOString(),
|
|
417
419
|
});
|
|
418
420
|
}
|
|
419
421
|
|
|
@@ -449,7 +451,7 @@ export async function generateSyntheticEvals(
|
|
|
449
451
|
const db = getDb();
|
|
450
452
|
|
|
451
453
|
// Positives: high-confidence triggered records for this skill
|
|
452
|
-
const skillRecords = querySkillUsageRecords(db);
|
|
454
|
+
const skillRecords = querySkillUsageRecords(db) as SkillUsageRecord[];
|
|
453
455
|
const positive = skillRecords
|
|
454
456
|
.filter((r) => isHighConfidencePositiveSkillRecord(r, skillName))
|
|
455
457
|
.map((r) => r.query)
|
|
@@ -19,8 +19,9 @@ import { parseArgs } from "node:util";
|
|
|
19
19
|
|
|
20
20
|
import { SELFTUNE_CONFIG_DIR } from "../constants.js";
|
|
21
21
|
import type { EvalEntry } from "../types.js";
|
|
22
|
+
import { writeUnitTestRunResult } from "../testing-readiness.js";
|
|
22
23
|
import { CLIError } from "../utils/cli-error.js";
|
|
23
|
-
import { callLlm,
|
|
24
|
+
import { callLlm, detectLlmAgent } from "../utils/llm-call.js";
|
|
24
25
|
import { generateUnitTests } from "./generate-unit-tests.js";
|
|
25
26
|
import type { AgentRunner } from "./unit-test.js";
|
|
26
27
|
import { loadUnitTests, runUnitTestSuite } from "./unit-test.js";
|
|
@@ -58,10 +59,10 @@ export async function cliMain(): Promise<void> {
|
|
|
58
59
|
|
|
59
60
|
// --generate: create tests from skill content
|
|
60
61
|
if (values.generate) {
|
|
61
|
-
const agent =
|
|
62
|
+
const agent = detectLlmAgent();
|
|
62
63
|
if (!agent) {
|
|
63
64
|
throw new CLIError(
|
|
64
|
-
"No agent CLI found (claude/codex/opencode). Cannot generate tests",
|
|
65
|
+
"No agent CLI found (claude/codex/opencode/pi). Cannot generate tests",
|
|
65
66
|
"AGENT_NOT_FOUND",
|
|
66
67
|
"Install one of the supported agent CLIs",
|
|
67
68
|
);
|
|
@@ -118,7 +119,7 @@ export async function cliMain(): Promise<void> {
|
|
|
118
119
|
let agentRunner: AgentRunner;
|
|
119
120
|
|
|
120
121
|
if (values["run-agent"]) {
|
|
121
|
-
const agent =
|
|
122
|
+
const agent = detectLlmAgent();
|
|
122
123
|
if (!agent) {
|
|
123
124
|
throw new CLIError(
|
|
124
125
|
"No agent CLI found. Cannot run agent-based tests",
|
|
@@ -137,11 +138,13 @@ export async function cliMain(): Promise<void> {
|
|
|
137
138
|
}
|
|
138
139
|
|
|
139
140
|
const suite = await runUnitTestSuite(tests, skillName, agentRunner);
|
|
141
|
+
const resultPath = writeUnitTestRunResult(skillName, suite);
|
|
140
142
|
|
|
141
143
|
// Print results
|
|
142
144
|
console.log(`\nResults for '${suite.skill_name}':`);
|
|
143
145
|
console.log(` Total: ${suite.total} Passed: ${suite.passed} Failed: ${suite.failed}`);
|
|
144
146
|
console.log(` Pass rate: ${(suite.pass_rate * 100).toFixed(1)}%`);
|
|
147
|
+
console.log(` Stored: ${resultPath}`);
|
|
145
148
|
|
|
146
149
|
if (suite.failed > 0) {
|
|
147
150
|
console.log("\nFailed tests:");
|