selftune 0.2.30 → 0.2.32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +83 -56
- package/apps/local-dashboard/dist/assets/index-B-ut4w0B.js +15 -0
- package/apps/local-dashboard/dist/assets/index-BFGfCVrL.css +1 -0
- package/apps/local-dashboard/dist/assets/vendor-ui-DfowE3Hu.js +1 -0
- package/apps/local-dashboard/dist/index.html +3 -3
- package/cli/selftune/command-surface.ts +613 -2
- package/cli/selftune/create/baseline.ts +429 -0
- package/cli/selftune/create/check.ts +35 -0
- package/cli/selftune/create/init.ts +115 -0
- package/cli/selftune/create/package-candidate-state.ts +771 -0
- package/cli/selftune/create/package-evaluator.ts +710 -0
- package/cli/selftune/create/package-fingerprint.ts +142 -0
- package/cli/selftune/create/package-search.ts +377 -0
- package/cli/selftune/create/publish.ts +431 -0
- package/cli/selftune/create/readiness.ts +495 -0
- package/cli/selftune/create/replay.ts +330 -0
- package/cli/selftune/create/report.ts +74 -0
- package/cli/selftune/create/scaffold.ts +121 -0
- package/cli/selftune/create/skills-ref-adapter.ts +177 -0
- package/cli/selftune/create/status.ts +33 -0
- package/cli/selftune/create/templates.ts +249 -0
- package/cli/selftune/cron/setup.ts +1 -1
- package/cli/selftune/dashboard-action-events.ts +4 -1
- package/cli/selftune/dashboard-action-result.ts +789 -24
- package/cli/selftune/dashboard-action-stream.ts +80 -0
- package/cli/selftune/dashboard-contract.ts +146 -3
- package/cli/selftune/dashboard-server.ts +5 -4
- package/cli/selftune/eval/hooks-to-evals.ts +58 -35
- package/cli/selftune/eval/synthetic-evals.ts +145 -17
- package/cli/selftune/evolution/bounded-mutations.ts +1045 -0
- package/cli/selftune/evolution/evolve-body.ts +9 -36
- package/cli/selftune/evolution/evolve.ts +8 -72
- package/cli/selftune/evolution/stopping-criteria.ts +5 -13
- package/cli/selftune/evolution/unblock-suggestions.ts +0 -16
- package/cli/selftune/evolution/validate-host-replay.ts +115 -15
- package/cli/selftune/improve.ts +206 -0
- package/cli/selftune/index.ts +123 -6
- package/cli/selftune/init.ts +1 -1
- package/cli/selftune/localdb/queries/dashboard.ts +30 -0
- package/cli/selftune/localdb/schema.ts +52 -0
- package/cli/selftune/monitoring/watch.ts +257 -23
- package/cli/selftune/orchestrate/execute.ts +300 -1
- package/cli/selftune/orchestrate/finalize.ts +14 -0
- package/cli/selftune/orchestrate/plan.ts +22 -5
- package/cli/selftune/orchestrate/prepare.ts +59 -4
- package/cli/selftune/orchestrate/report.ts +1 -1
- package/cli/selftune/orchestrate.ts +34 -1
- package/cli/selftune/publish.ts +35 -0
- package/cli/selftune/registry/github-install.ts +256 -0
- package/cli/selftune/registry/index.ts +1 -1
- package/cli/selftune/registry/install.ts +58 -7
- package/cli/selftune/routes/actions.ts +81 -15
- package/cli/selftune/routes/overview.ts +1 -1
- package/cli/selftune/routes/skill-report.ts +147 -2
- package/cli/selftune/run.ts +18 -0
- package/cli/selftune/schedule.ts +3 -3
- package/cli/selftune/search-run.ts +703 -0
- package/cli/selftune/status.ts +35 -11
- package/cli/selftune/testing-readiness.ts +431 -40
- package/cli/selftune/types.ts +316 -0
- package/cli/selftune/utils/eval-readiness.ts +1 -0
- package/cli/selftune/utils/json-output.ts +11 -0
- package/cli/selftune/utils/lifecycle-surface.ts +48 -0
- package/cli/selftune/utils/query-filter.ts +82 -1
- package/cli/selftune/utils/tui.ts +85 -2
- package/cli/selftune/verify.ts +205 -0
- package/cli/selftune/workflows/proposals.ts +1 -1
- package/cli/selftune/workflows/skill-scaffold.ts +141 -63
- package/cli/selftune/workflows/workflows.ts +4 -4
- package/package.json +1 -1
- package/packages/dashboard-core/src/routes/manifest.ts +2 -2
- package/packages/ui/src/components/SkillReportPanels.tsx +7 -7
- package/packages/ui/src/primitives/button.tsx +5 -0
- package/skill/SKILL.md +148 -85
- package/skill/references/cli-quick-reference.md +16 -1
- package/skill/references/creator-playbook.md +31 -10
- package/skill/workflows/Baseline.md +8 -9
- package/skill/workflows/Contributions.md +4 -4
- package/skill/workflows/Create.md +173 -0
- package/skill/workflows/CreateTestDeploy.md +34 -30
- package/skill/workflows/Cron.md +2 -2
- package/skill/workflows/Dashboard.md +3 -3
- package/skill/workflows/Evals.md +13 -7
- package/skill/workflows/Evolve.md +75 -32
- package/skill/workflows/EvolveBody.md +22 -15
- package/skill/workflows/Hook.md +1 -1
- package/skill/workflows/Improve.md +168 -0
- package/skill/workflows/Initialize.md +3 -3
- package/skill/workflows/Orchestrate.md +49 -12
- package/skill/workflows/Publish.md +100 -0
- package/skill/workflows/Registry.md +19 -13
- package/skill/workflows/Run.md +72 -0
- package/skill/workflows/Schedule.md +2 -2
- package/skill/workflows/SearchRun.md +89 -0
- package/skill/workflows/SignalsDashboard.md +2 -2
- package/skill/workflows/UnitTest.md +13 -4
- package/skill/workflows/Verify.md +136 -0
- package/skill/workflows/Watch.md +114 -47
- package/skill/workflows/Workflows.md +13 -8
- package/apps/local-dashboard/dist/assets/index-BcXquWFB.css +0 -1
- package/apps/local-dashboard/dist/assets/index-Coq42hE4.js +0 -15
- package/apps/local-dashboard/dist/assets/vendor-ui-B0H8s1mP.js +0 -1
|
@@ -21,8 +21,10 @@ import {
|
|
|
21
21
|
querySkillUsageRecords,
|
|
22
22
|
} from "../localdb/queries.js";
|
|
23
23
|
import { updateContextAfterWatch } from "../memory/writer.js";
|
|
24
|
+
import { readCanonicalPackageEvaluationArtifact } from "../testing-readiness.js";
|
|
24
25
|
import type { SyncResult } from "../sync.js";
|
|
25
26
|
import type {
|
|
27
|
+
CreatePackageEvaluationWatchEfficiencyRegressionSummary,
|
|
26
28
|
InvocationType,
|
|
27
29
|
MonitoringSnapshot,
|
|
28
30
|
QueryLogRecord,
|
|
@@ -49,6 +51,10 @@ export interface WatchOptions {
|
|
|
49
51
|
gradeRegressionThreshold?: number;
|
|
50
52
|
/** Enable grade-based regression watch (default true). */
|
|
51
53
|
enableGradeWatch?: boolean;
|
|
54
|
+
/** Relative regression threshold for observed efficiency (default 0.25). */
|
|
55
|
+
efficiencyRegressionThreshold?: number;
|
|
56
|
+
/** Enable efficiency-based regression watch (default true). */
|
|
57
|
+
enableEfficiencyWatch?: boolean;
|
|
52
58
|
/** Injected log paths for testing (override defaults). */
|
|
53
59
|
_telemetryLogPath?: string;
|
|
54
60
|
_skillLogPath?: string;
|
|
@@ -71,9 +77,59 @@ export interface WatchResult {
|
|
|
71
77
|
alert: string | null;
|
|
72
78
|
rolledBack: boolean;
|
|
73
79
|
recommendation: string;
|
|
80
|
+
recommended_command?: string | null;
|
|
74
81
|
sync_result?: SyncResult;
|
|
75
82
|
gradeAlert?: string | null;
|
|
76
83
|
gradeRegression?: { before: number; after: number; delta: number } | null;
|
|
84
|
+
efficiencyAlert?: string | null;
|
|
85
|
+
efficiencyRegression?: CreatePackageEvaluationWatchEfficiencyRegressionSummary | null;
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
// ---------------------------------------------------------------------------
|
|
89
|
+
// Watch trust scoring — aggregates watch signals into a 0-1 trust score
|
|
90
|
+
// ---------------------------------------------------------------------------
|
|
91
|
+
|
|
92
|
+
/**
 * Compute a trust score (0-1) from a WatchResult.
 *
 * Scoring rules:
 *   - Fewer than MIN_MONITORING_SKILL_CHECKS skill checks in the snapshot:
 *     the score is fixed at 0.5 (not enough data to form an opinion).
 *   - Otherwise the score starts at 1.0 and is reduced by:
 *       - 0.5 when `snapshot.regression_detected` is set (trigger regression)
 *       - up to 0.3 when `gradeRegression` is present, scaled by the
 *         magnitude of its delta (|delta| * 2, capped at 0.3)
 *       - 0.2 when an alert is active but neither of the two regressions
 *         above accounts for it (catch-all)
 *       - 0.2 when the watch run rolled the skill back
 *   - The result is clamped to the [0, 1] range.
 *
 * @param watchResult - Result of a watch run.
 * @returns Trust score between 0 and 1 inclusive.
 */
export function computeWatchTrustScore(watchResult: WatchResult): number {
  const { snapshot, alert, gradeRegression } = watchResult;

  // Not enough data to form a trust opinion — cap at 0.5
  if (snapshot.skill_checks < MIN_MONITORING_SKILL_CHECKS) {
    return 0.5;
  }

  let score = 1.0;

  // Trigger pass rate regression: major trust penalty
  if (snapshot.regression_detected) {
    score -= 0.5;
  }

  // Grade regression: penalty scaled by delta magnitude (max 0.3).
  // Use the absolute value so the penalty never depends on the sign
  // convention of `delta` (a negative delta must not *raise* trust), and
  // treat an unusable (NaN) delta as the maximum penalty so the function
  // cannot return NaN.
  if (gradeRegression) {
    const magnitude = Math.abs(gradeRegression.delta);
    const gradePenalty = Number.isNaN(magnitude) ? 0.3 : Math.min(magnitude * 2, 0.3);
    score -= gradePenalty;
  }

  // Any active alert without specific regression (catch-all)
  if (alert && !snapshot.regression_detected && !gradeRegression) {
    score -= 0.2;
  }

  // Rolled back: significant trust hit
  if (watchResult.rolledBack) {
    score -= 0.2;
  }

  return Math.max(0, Math.min(1, score));
}
|
|
78
134
|
|
|
79
135
|
// ---------------------------------------------------------------------------
|
|
@@ -83,8 +139,170 @@ export interface WatchResult {
|
|
|
83
139
|
const DEFAULT_BASELINE_PASS_RATE = 0.5;
|
|
84
140
|
const DEFAULT_REGRESSION_THRESHOLD = 0.1;
|
|
85
141
|
const DEFAULT_GRADE_REGRESSION_THRESHOLD = 0.15;
|
|
142
|
+
const DEFAULT_EFFICIENCY_REGRESSION_THRESHOLD = 0.25;
|
|
86
143
|
export const MIN_MONITORING_SKILL_CHECKS = 3;
|
|
87
144
|
|
|
145
|
+
/** Records selected for one skill's monitoring window. */
interface MonitoringWindow {
  /** Telemetry records for the window (or all telemetry when no overlap exists). */
  telemetry: SessionTelemetryRecord[];
  /** Actionable skill-usage records for the watched skill. */
  skillRecords: SkillUsageRecord[];
  /** Actionable query records. */
  queryRecords: QueryLogRecord[];
}
|
|
150
|
+
|
|
151
|
+
/**
 * Select the telemetry, skill-usage and query records that make up the
 * monitoring window for `skillName`.
 *
 * The window is the last `windowSessions` telemetry records. Skill and query
 * records are first reduced to their actionable subsets (and skill records to
 * the named skill). Session-id windowing is applied only when the windowed
 * telemetry shares at least one session id with those records; otherwise the
 * unwindowed inputs are returned so callers still have data to work with.
 */
function selectMonitoringWindow(
  skillName: string,
  telemetry: SessionTelemetryRecord[],
  skillRecords: SkillUsageRecord[],
  queryRecords: QueryLogRecord[],
  windowSessions: number,
): MonitoringWindow {
  const skillChecks = filterActionableSkillUsageRecords(skillRecords).filter(
    (entry) => entry.skill_name === skillName,
  );
  const queries = filterActionableQueryRecords(queryRecords);

  const recentTelemetry = telemetry.slice(-windowSessions);
  const recentSessionIds = new Set(recentTelemetry.map((entry) => entry.session_id));

  const overlapsWindow =
    recentSessionIds.size > 0 &&
    (skillChecks.some((entry) => recentSessionIds.has(entry.session_id)) ||
      queries.some((entry) => recentSessionIds.has(entry.session_id)));

  // No shared sessions: fall back to the unwindowed records.
  if (!overlapsWindow) {
    return {
      telemetry,
      skillRecords: skillChecks,
      queryRecords: queries,
    };
  }

  return {
    telemetry: recentTelemetry.filter((entry) => recentSessionIds.has(entry.session_id)),
    skillRecords: skillChecks.filter((entry) => recentSessionIds.has(entry.session_id)),
    queryRecords: queries.filter((entry) => recentSessionIds.has(entry.session_id)),
  };
}
|
|
181
|
+
|
|
182
|
+
/**
 * Arithmetic mean of the numeric entries in `values`.
 *
 * Entries that are not of type number (null / undefined) are ignored.
 * Returns null when no numeric entry is present.
 */
function averageNullable(values: Array<number | null | undefined>): number | null {
  let total = 0;
  let count = 0;
  for (const candidate of values) {
    if (typeof candidate !== "number") continue;
    total += candidate;
    count += 1;
  }
  return count === 0 ? null : total / count;
}
|
|
187
|
+
|
|
188
|
+
/**
 * Divide `total` by `count`, tolerating missing inputs.
 *
 * Returns null when either argument is not a number or when `count` is
 * zero or negative; otherwise returns `total / count`.
 */
function divideNullable(
  total: number | null | undefined,
  count: number | null | undefined,
): number | null {
  if (typeof total !== "number") return null;
  if (typeof count !== "number") return null;
  if (count <= 0) return null;
  return total / count;
}
|
|
192
|
+
|
|
193
|
+
/**
 * Relative change of `observed` against `baseline`:
 * (observed - baseline) / baseline.
 *
 * Returns null when either value is missing or when the baseline is zero
 * or negative, since no meaningful ratio exists in those cases.
 */
function computeDeltaRatio(observed: number | null, baseline: number | null): number | null {
  if (observed == null) return null;
  if (baseline == null) return null;
  if (baseline <= 0) return null;

  const change = observed - baseline;
  return change / baseline;
}
|
|
197
|
+
|
|
198
|
+
/**
 * Compare observed efficiency for `skillName` against the `with_skill`
 * efficiency baseline stored in its canonical package evaluation artifact.
 *
 * No signal (both fields null) is returned when:
 *   - no baseline efficiency is available for the skill,
 *   - fewer than MIN_MONITORING_SKILL_CHECKS distinct sessions triggered the skill, or
 *   - fewer than MIN_MONITORING_SKILL_CHECKS telemetry records belong to those sessions.
 *
 * Otherwise a summary is built with baseline/observed averages and their
 * relative deltas for duration, input tokens, output tokens and turns. An
 * alert string is produced when any delta exceeds
 * `efficiencyRegressionThreshold`; the summary is returned either way.
 */
function buildEfficiencyRegression(
  skillName: string,
  telemetry: SessionTelemetryRecord[],
  skillRecords: SkillUsageRecord[],
  efficiencyRegressionThreshold: number,
): {
  efficiencyAlert: string | null;
  efficiencyRegression: CreatePackageEvaluationWatchEfficiencyRegressionSummary | null;
} {
  // Fresh "nothing to report" result for each early exit.
  const noSignal = () => ({ efficiencyAlert: null, efficiencyRegression: null });

  const baseline = readCanonicalPackageEvaluationArtifact(skillName)?.summary.efficiency?.with_skill;
  if (!baseline) return noSignal();

  // Distinct sessions in which the skill actually triggered.
  const triggeredIds = new Set<SkillUsageRecord["session_id"]>();
  for (const usage of skillRecords) {
    if (usage.triggered) triggeredIds.add(usage.session_id);
  }
  if (triggeredIds.size < MIN_MONITORING_SKILL_CHECKS) return noSignal();

  const sessions = telemetry.filter((entry) => triggeredIds.has(entry.session_id));
  if (sessions.length < MIN_MONITORING_SKILL_CHECKS) return noSignal();

  // Baseline totals are converted to per-run averages using eval_runs.
  const perBaselineRun = (total: number | null | undefined) =>
    divideNullable(total, baseline.eval_runs);
  // Observed averages ignore sessions where the metric is missing.
  const observedAverage = (
    pick: (entry: SessionTelemetryRecord) => number | null | undefined,
  ) => averageNullable(sessions.map((entry) => pick(entry) ?? null));

  const summary: CreatePackageEvaluationWatchEfficiencyRegressionSummary = {
    sample_size: sessions.length,
    baseline_avg_duration_ms: baseline.avg_duration_ms,
    observed_avg_duration_ms: observedAverage((entry) => entry.duration_ms),
    duration_delta_ratio: null,
    baseline_avg_input_tokens: perBaselineRun(baseline.total_input_tokens),
    observed_avg_input_tokens: observedAverage((entry) => entry.input_tokens),
    input_tokens_delta_ratio: null,
    baseline_avg_output_tokens: perBaselineRun(baseline.total_output_tokens),
    observed_avg_output_tokens: observedAverage((entry) => entry.output_tokens),
    output_tokens_delta_ratio: null,
    baseline_avg_turns: perBaselineRun(baseline.total_turns),
    observed_avg_turns: observedAverage((entry) => entry.assistant_turns),
    turns_delta_ratio: null,
  };

  // Fill in the relative deltas once both sides of each pair are known.
  summary.duration_delta_ratio = computeDeltaRatio(
    summary.observed_avg_duration_ms,
    summary.baseline_avg_duration_ms,
  );
  summary.input_tokens_delta_ratio = computeDeltaRatio(
    summary.observed_avg_input_tokens,
    summary.baseline_avg_input_tokens,
  );
  summary.output_tokens_delta_ratio = computeDeltaRatio(
    summary.observed_avg_output_tokens,
    summary.baseline_avg_output_tokens,
  );
  summary.turns_delta_ratio = computeDeltaRatio(
    summary.observed_avg_turns,
    summary.baseline_avg_turns,
  );

  // Collect every metric whose relative increase exceeds the threshold.
  const labelledRatios: Array<[string, number | null]> = [
    ["duration", summary.duration_delta_ratio],
    ["input_tokens", summary.input_tokens_delta_ratio],
    ["output_tokens", summary.output_tokens_delta_ratio],
    ["turns", summary.turns_delta_ratio],
  ];
  const regressions: string[] = [];
  for (const [label, ratio] of labelledRatios) {
    if (ratio == null || !(ratio > efficiencyRegressionThreshold)) continue;
    regressions.push(`${label} +${(ratio * 100).toFixed(1)}%`);
  }

  let efficiencyAlert: string | null = null;
  if (regressions.length > 0) {
    efficiencyAlert = `efficiency regression detected for "${skillName}": ${regressions.join(", ")} exceeds threshold=${(efficiencyRegressionThreshold * 100).toFixed(1)}%`;
  }

  return {
    efficiencyAlert,
    efficiencyRegression: summary,
  };
}
|
|
305
|
+
|
|
88
306
|
// ---------------------------------------------------------------------------
|
|
89
307
|
// computeMonitoringSnapshot - pure function
|
|
90
308
|
// ---------------------------------------------------------------------------
|
|
@@ -114,27 +332,8 @@ export function computeMonitoringSnapshot(
|
|
|
114
332
|
baselinePassRate: number,
|
|
115
333
|
regressionThreshold: number = DEFAULT_REGRESSION_THRESHOLD,
|
|
116
334
|
): MonitoringSnapshot {
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
const actionableQueryRecords = filterActionableQueryRecords(queryRecords);
|
|
120
|
-
const windowedTelemetry = telemetry.slice(-windowSessions);
|
|
121
|
-
const windowedSessionIds = new Set(windowedTelemetry.map((t) => t.session_id));
|
|
122
|
-
|
|
123
|
-
// 2. Filter skill records by skill name first
|
|
124
|
-
const skillNameFiltered = actionableSkillRecords.filter((r) => r.skill_name === skillName);
|
|
125
|
-
|
|
126
|
-
// 3. Apply session ID windowing only if telemetry is present and overlaps
|
|
127
|
-
const hasSessionOverlap =
|
|
128
|
-
windowedSessionIds.size > 0 &&
|
|
129
|
-
(skillNameFiltered.some((r) => windowedSessionIds.has(r.session_id)) ||
|
|
130
|
-
actionableQueryRecords.some((r) => windowedSessionIds.has(r.session_id)));
|
|
131
|
-
|
|
132
|
-
const filteredSkillRecords = hasSessionOverlap
|
|
133
|
-
? skillNameFiltered.filter((r) => windowedSessionIds.has(r.session_id))
|
|
134
|
-
: skillNameFiltered;
|
|
135
|
-
const filteredQueryRecords = hasSessionOverlap
|
|
136
|
-
? actionableQueryRecords.filter((r) => windowedSessionIds.has(r.session_id))
|
|
137
|
-
: actionableQueryRecords;
|
|
335
|
+
const { skillRecords: filteredSkillRecords, queryRecords: filteredQueryRecords } =
|
|
336
|
+
selectMonitoringWindow(skillName, telemetry, skillRecords, queryRecords, windowSessions);
|
|
138
337
|
|
|
139
338
|
// 4. Compute pass rate from explicit skill checks, not from all queries.
|
|
140
339
|
const triggeredCount = filteredSkillRecords.filter((r) => r.triggered).length;
|
|
@@ -202,6 +401,8 @@ export async function watch(options: WatchOptions): Promise<WatchResult> {
|
|
|
202
401
|
regressionThreshold = DEFAULT_REGRESSION_THRESHOLD,
|
|
203
402
|
gradeRegressionThreshold = DEFAULT_GRADE_REGRESSION_THRESHOLD,
|
|
204
403
|
enableGradeWatch = true,
|
|
404
|
+
efficiencyRegressionThreshold = DEFAULT_EFFICIENCY_REGRESSION_THRESHOLD,
|
|
405
|
+
enableEfficiencyWatch = true,
|
|
205
406
|
autoRollback = false,
|
|
206
407
|
_telemetryLogPath = TELEMETRY_LOG,
|
|
207
408
|
_skillLogPath = SKILL_LOG,
|
|
@@ -246,6 +447,13 @@ export async function watch(options: WatchOptions): Promise<WatchResult> {
|
|
|
246
447
|
baselinePassRate,
|
|
247
448
|
regressionThreshold,
|
|
248
449
|
);
|
|
450
|
+
const monitoringWindow = selectMonitoringWindow(
|
|
451
|
+
skillName,
|
|
452
|
+
telemetry,
|
|
453
|
+
skillRecords,
|
|
454
|
+
queryRecords,
|
|
455
|
+
windowSessions,
|
|
456
|
+
);
|
|
249
457
|
|
|
250
458
|
// 4. Build trigger alert. Grade alerts are added below before rollback
|
|
251
459
|
// decisions so either signal can drive automated rollback.
|
|
@@ -296,7 +504,22 @@ export async function watch(options: WatchOptions): Promise<WatchResult> {
|
|
|
296
504
|
}
|
|
297
505
|
}
|
|
298
506
|
|
|
299
|
-
|
|
507
|
+
let efficiencyAlert: string | null = null;
|
|
508
|
+
let efficiencyRegression: CreatePackageEvaluationWatchEfficiencyRegressionSummary | null = null;
|
|
509
|
+
if (enableEfficiencyWatch) {
|
|
510
|
+
const efficiencyResult = buildEfficiencyRegression(
|
|
511
|
+
skillName,
|
|
512
|
+
monitoringWindow.telemetry,
|
|
513
|
+
monitoringWindow.skillRecords,
|
|
514
|
+
efficiencyRegressionThreshold,
|
|
515
|
+
);
|
|
516
|
+
efficiencyAlert = efficiencyResult.efficiencyAlert;
|
|
517
|
+
efficiencyRegression = efficiencyResult.efficiencyRegression;
|
|
518
|
+
}
|
|
519
|
+
|
|
520
|
+
const alerts = [triggerAlert, gradeAlert, efficiencyAlert].filter((value): value is string =>
|
|
521
|
+
Boolean(value),
|
|
522
|
+
);
|
|
300
523
|
const alert = alerts.length > 0 ? alerts.join("\n") : null;
|
|
301
524
|
|
|
302
525
|
if (alert && autoRollback) {
|
|
@@ -311,10 +534,14 @@ export async function watch(options: WatchOptions): Promise<WatchResult> {
|
|
|
311
534
|
}
|
|
312
535
|
|
|
313
536
|
let recommendation: string;
|
|
537
|
+
let recommendedCommand: string | null = null;
|
|
314
538
|
if (alert) {
|
|
539
|
+
recommendedCommand = rolledBack
|
|
540
|
+
? null
|
|
541
|
+
: `selftune rollback --skill ${skillName} --skill-path ${skillPath}`;
|
|
315
542
|
recommendation = rolledBack
|
|
316
543
|
? `Rolled back "${skillName}" to previous version. Monitor to confirm recovery.`
|
|
317
|
-
: `Consider running:
|
|
544
|
+
: `Consider running: ${recommendedCommand}`;
|
|
318
545
|
} else if (snapshot.skill_checks < MIN_MONITORING_SKILL_CHECKS) {
|
|
319
546
|
recommendation =
|
|
320
547
|
`Skill "${skillName}" has only ${snapshot.skill_checks} actionable check(s) in the current window. ` +
|
|
@@ -342,8 +569,15 @@ export async function watch(options: WatchOptions): Promise<WatchResult> {
|
|
|
342
569
|
alert,
|
|
343
570
|
rolledBack,
|
|
344
571
|
recommendation,
|
|
572
|
+
recommended_command: recommendedCommand,
|
|
345
573
|
gradeAlert,
|
|
346
574
|
gradeRegression,
|
|
575
|
+
...(efficiencyAlert || efficiencyRegression
|
|
576
|
+
? {
|
|
577
|
+
efficiencyAlert,
|
|
578
|
+
efficiencyRegression,
|
|
579
|
+
}
|
|
580
|
+
: {}),
|
|
347
581
|
...(syncResult ? { sync_result: syncResult } : {}),
|
|
348
582
|
};
|
|
349
583
|
}
|
|
@@ -10,7 +10,7 @@ import type { EvolveOptions, evolve as evolveSkill } from "../evolution/evolve.j
|
|
|
10
10
|
import type { ReplayValidationOptions } from "../evolution/engines/replay-engine.js";
|
|
11
11
|
import { buildRuntimeReplayValidationOptions } from "../evolution/validate-host-replay.js";
|
|
12
12
|
import { findRecentlyDeployedSkills } from "./plan.js";
|
|
13
|
-
import type { OrchestrateOptions, SkillAction } from "../orchestrate.js";
|
|
13
|
+
import type { OrchestrateOptions, PackageSearchResult, SkillAction } from "../orchestrate.js";
|
|
14
14
|
import type { EvolutionAuditEntry, SessionTelemetryRecord, SkillUsageRecord } from "../types.js";
|
|
15
15
|
import { readExcerpt } from "../utils/transcript.js";
|
|
16
16
|
|
|
@@ -293,3 +293,302 @@ export async function watchRecentDeploys(
|
|
|
293
293
|
|
|
294
294
|
return { freshAuditEntries, freshlyWatchedSkills };
|
|
295
295
|
}
|
|
296
|
+
|
|
297
|
+
// ---------------------------------------------------------------------------
|
|
298
|
+
// Package Search Phase
|
|
299
|
+
// ---------------------------------------------------------------------------
|
|
300
|
+
|
|
301
|
+
/** Input for {@link runPackageSearchPhase}. */
export interface RunPackageSearchPhaseInput {
  /** Skill actions to run bounded package search for. */
  packageSearchCandidates: SkillAction[];
  /**
   * When true, candidates with a resolvable skill path are marked as not
   * searched and no search is run.
   */
  dryRun: boolean;
  /**
   * Agent passed to reflective mutation generation; when null, reflective
   * routing mutations are not generated.
   */
  agent: string | null;
  /**
   * Resolves a skill name to its skill path. Returning undefined causes the
   * candidate to be skipped with a "SKILL.md not found" reason.
   */
  resolveSkillPath: (skillName: string) => string | undefined;
  /**
   * Optional dependency overrides. They are used only when every member is
   * provided; otherwise the real modules are loaded dynamically.
   */
  deps?: RunPackageSearchPhaseDeps;
}
|
|
308
|
+
|
|
309
|
+
/**
 * Injectable dependencies for {@link runPackageSearchPhase}.
 *
 * Each member mirrors the export of the same name from the module it is
 * typed against. All members are optional individually, but the overrides
 * take effect only when the full set is supplied.
 */
export interface RunPackageSearchPhaseDeps {
  // ../evolution/bounded-mutations.js
  generateReflectiveRoutingMutations?: (typeof import("../evolution/bounded-mutations.js"))["generateReflectiveRoutingMutations"];
  generateReflectiveBodyMutations?: (typeof import("../evolution/bounded-mutations.js"))["generateReflectiveBodyMutations"];
  generateRoutingMutations?: (typeof import("../evolution/bounded-mutations.js"))["generateRoutingMutations"];
  generateBodyMutations?: (typeof import("../evolution/bounded-mutations.js"))["generateBodyMutations"];
  generateTargetedRoutingMutations?: (typeof import("../evolution/bounded-mutations.js"))["generateTargetedRoutingMutations"];
  generateTargetedBodyMutations?: (typeof import("../evolution/bounded-mutations.js"))["generateTargetedBodyMutations"];
  extractMutationWeaknesses?: (typeof import("../evolution/bounded-mutations.js"))["extractMutationWeaknesses"];
  cleanupVariants?: (typeof import("../evolution/bounded-mutations.js"))["cleanupVariants"];
  // ../create/package-fingerprint.js
  computeCreatePackageFingerprint?: (typeof import("../create/package-fingerprint.js"))["computeCreatePackageFingerprint"];
  // ../create/package-search.js
  runPackageSearch?: (typeof import("../create/package-search.js"))["runPackageSearch"];
  // ../search-run.js
  applySearchRunWinner?: (typeof import("../search-run.js"))["applySearchRunWinner"];
  // ../localdb/db.js
  getDb?: (typeof import("../localdb/db.js"))["getDb"];
}
|
|
323
|
+
|
|
324
|
+
/**
|
|
325
|
+
* Runs bounded package search for candidates tagged with action "package-search".
|
|
326
|
+
*
|
|
327
|
+
* For each candidate:
|
|
328
|
+
* 1. Resolves skill path
|
|
329
|
+
* 2. Generates routing + body mutations (bounded variants)
|
|
330
|
+
* 3. Fingerprints each variant
|
|
331
|
+
* 4. Runs package search evaluation across variants
|
|
332
|
+
* 5. Applies the winning candidate if found
|
|
333
|
+
* 6. Cleans up temporary variant files
|
|
334
|
+
*
|
|
335
|
+
* Returns candidates where a winner was found and applied.
|
|
336
|
+
*/
|
|
337
|
+
export async function runPackageSearchPhase(
|
|
338
|
+
input: RunPackageSearchPhaseInput,
|
|
339
|
+
): Promise<SkillAction[]> {
|
|
340
|
+
const { packageSearchCandidates, dryRun, agent, resolveSkillPath, deps = {} } = input;
|
|
341
|
+
|
|
342
|
+
if (packageSearchCandidates.length === 0) return [];
|
|
343
|
+
|
|
344
|
+
console.error(
|
|
345
|
+
`[orchestrate] Package search: ${packageSearchCandidates.length} candidate(s)${dryRun ? " (dry-run)" : ""}`,
|
|
346
|
+
);
|
|
347
|
+
|
|
348
|
+
// Pre-resolve skill paths and handle dry-run before loading optional modules
|
|
349
|
+
const resolved: Array<{ candidate: SkillAction; skillPath: string }> = [];
|
|
350
|
+
for (const candidate of packageSearchCandidates) {
|
|
351
|
+
const skillPath = resolveSkillPath(candidate.skill);
|
|
352
|
+
if (!skillPath) {
|
|
353
|
+
candidate.action = "skip";
|
|
354
|
+
candidate.reason = `SKILL.md not found for "${candidate.skill}"`;
|
|
355
|
+
console.error(` [pkg-search] ${candidate.skill}: ${candidate.reason}`);
|
|
356
|
+
continue;
|
|
357
|
+
}
|
|
358
|
+
|
|
359
|
+
if (dryRun) {
|
|
360
|
+
candidate.packageSearchResult = {
|
|
361
|
+
searched: false,
|
|
362
|
+
winnerApplied: false,
|
|
363
|
+
candidateCount: 0,
|
|
364
|
+
};
|
|
365
|
+
console.error(` [pkg-search] ${candidate.skill}: skipped (dry-run)`);
|
|
366
|
+
continue;
|
|
367
|
+
}
|
|
368
|
+
|
|
369
|
+
resolved.push({ candidate, skillPath });
|
|
370
|
+
}
|
|
371
|
+
|
|
372
|
+
// Nothing left to process after path resolution and dry-run filtering
|
|
373
|
+
if (resolved.length === 0) return [];
|
|
374
|
+
|
|
375
|
+
// Lazy-load package search dependencies. These modules are optional and may
|
|
376
|
+
// not exist yet if the package-search feature is still being built.
|
|
377
|
+
let generateRoutingMutations: typeof import("../evolution/bounded-mutations.js").generateRoutingMutations;
|
|
378
|
+
let generateBodyMutations: typeof import("../evolution/bounded-mutations.js").generateBodyMutations;
|
|
379
|
+
let generateReflectiveRoutingMutations: typeof import("../evolution/bounded-mutations.js").generateReflectiveRoutingMutations;
|
|
380
|
+
let generateReflectiveBodyMutations: typeof import("../evolution/bounded-mutations.js").generateReflectiveBodyMutations;
|
|
381
|
+
let generateTargetedRoutingMutations: typeof import("../evolution/bounded-mutations.js").generateTargetedRoutingMutations;
|
|
382
|
+
let generateTargetedBodyMutations: typeof import("../evolution/bounded-mutations.js").generateTargetedBodyMutations;
|
|
383
|
+
let extractMutationWeaknesses: typeof import("../evolution/bounded-mutations.js").extractMutationWeaknesses;
|
|
384
|
+
let cleanupVariants: typeof import("../evolution/bounded-mutations.js").cleanupVariants;
|
|
385
|
+
let computeCreatePackageFingerprint: typeof import("../create/package-fingerprint.js").computeCreatePackageFingerprint;
|
|
386
|
+
let runPackageSearch: typeof import("../create/package-search.js").runPackageSearch;
|
|
387
|
+
let applySearchRunWinner: typeof import("../search-run.js").applySearchRunWinner;
|
|
388
|
+
let getDb: typeof import("../localdb/db.js").getDb;
|
|
389
|
+
|
|
390
|
+
try {
|
|
391
|
+
if (
|
|
392
|
+
deps.generateReflectiveRoutingMutations &&
|
|
393
|
+
deps.generateReflectiveBodyMutations &&
|
|
394
|
+
deps.generateRoutingMutations &&
|
|
395
|
+
deps.generateBodyMutations &&
|
|
396
|
+
deps.generateTargetedRoutingMutations &&
|
|
397
|
+
deps.generateTargetedBodyMutations &&
|
|
398
|
+
deps.extractMutationWeaknesses &&
|
|
399
|
+
deps.cleanupVariants &&
|
|
400
|
+
deps.computeCreatePackageFingerprint &&
|
|
401
|
+
deps.runPackageSearch &&
|
|
402
|
+
deps.applySearchRunWinner &&
|
|
403
|
+
deps.getDb
|
|
404
|
+
) {
|
|
405
|
+
generateReflectiveRoutingMutations = deps.generateReflectiveRoutingMutations;
|
|
406
|
+
generateReflectiveBodyMutations = deps.generateReflectiveBodyMutations;
|
|
407
|
+
generateRoutingMutations = deps.generateRoutingMutations;
|
|
408
|
+
generateBodyMutations = deps.generateBodyMutations;
|
|
409
|
+
generateTargetedRoutingMutations = deps.generateTargetedRoutingMutations;
|
|
410
|
+
generateTargetedBodyMutations = deps.generateTargetedBodyMutations;
|
|
411
|
+
extractMutationWeaknesses = deps.extractMutationWeaknesses;
|
|
412
|
+
cleanupVariants = deps.cleanupVariants;
|
|
413
|
+
computeCreatePackageFingerprint = deps.computeCreatePackageFingerprint;
|
|
414
|
+
runPackageSearch = deps.runPackageSearch;
|
|
415
|
+
applySearchRunWinner = deps.applySearchRunWinner;
|
|
416
|
+
getDb = deps.getDb;
|
|
417
|
+
} else {
|
|
418
|
+
const boundedMutations = await import("../evolution/bounded-mutations.js");
|
|
419
|
+
generateReflectiveRoutingMutations = boundedMutations.generateReflectiveRoutingMutations;
|
|
420
|
+
generateReflectiveBodyMutations = boundedMutations.generateReflectiveBodyMutations;
|
|
421
|
+
generateRoutingMutations = boundedMutations.generateRoutingMutations;
|
|
422
|
+
generateBodyMutations = boundedMutations.generateBodyMutations;
|
|
423
|
+
generateTargetedRoutingMutations = boundedMutations.generateTargetedRoutingMutations;
|
|
424
|
+
generateTargetedBodyMutations = boundedMutations.generateTargetedBodyMutations;
|
|
425
|
+
extractMutationWeaknesses = boundedMutations.extractMutationWeaknesses;
|
|
426
|
+
cleanupVariants = boundedMutations.cleanupVariants;
|
|
427
|
+
|
|
428
|
+
const fingerprint = await import("../create/package-fingerprint.js");
|
|
429
|
+
computeCreatePackageFingerprint = fingerprint.computeCreatePackageFingerprint;
|
|
430
|
+
|
|
431
|
+
const packageSearch = await import("../create/package-search.js");
|
|
432
|
+
runPackageSearch = packageSearch.runPackageSearch;
|
|
433
|
+
|
|
434
|
+
const searchRun = await import("../search-run.js");
|
|
435
|
+
applySearchRunWinner = searchRun.applySearchRunWinner;
|
|
436
|
+
|
|
437
|
+
const localdb = await import("../localdb/db.js");
|
|
438
|
+
getDb = localdb.getDb;
|
|
439
|
+
}
|
|
440
|
+
} catch (err) {
|
|
441
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
442
|
+
console.error(`[orchestrate] Package search modules not available — skipping. ${msg}`);
|
|
443
|
+
for (const { candidate } of resolved) {
|
|
444
|
+
candidate.action = "skip";
|
|
445
|
+
candidate.reason = `package-search modules unavailable: ${msg}`;
|
|
446
|
+
}
|
|
447
|
+
return [];
|
|
448
|
+
}
|
|
449
|
+
|
|
450
|
+
const improved: SkillAction[] = [];
|
|
451
|
+
|
|
452
|
+
for (const { candidate, skillPath } of resolved) {
|
|
453
|
+
let allMutations: Array<{
|
|
454
|
+
variantSkillPath: string;
|
|
455
|
+
mutationSurface: "routing" | "body";
|
|
456
|
+
mutationDescription: string;
|
|
457
|
+
parentFingerprint: string;
|
|
458
|
+
}> = [];
|
|
459
|
+
try {
|
|
460
|
+
console.error(` [pkg-search] ${candidate.skill}: generating bounded mutations...`);
|
|
461
|
+
const db = getDb();
|
|
462
|
+
const weaknesses = extractMutationWeaknesses(candidate.skill, db);
|
|
463
|
+
|
|
464
|
+
// Generate reflective, targeted, and deterministic mutations in
|
|
465
|
+
// priority order. Reflective variants consume measured evaluator evidence
|
|
466
|
+
// first, then bounded heuristics fill the remaining space.
|
|
467
|
+
const [
|
|
468
|
+
routingMutations,
|
|
469
|
+
bodyMutations,
|
|
470
|
+
reflectiveRoutingMutations,
|
|
471
|
+
reflectiveBodyMutations,
|
|
472
|
+
targetedRoutingMutations,
|
|
473
|
+
targetedBodyMutations,
|
|
474
|
+
] = await Promise.all([
|
|
475
|
+
generateRoutingMutations(skillPath),
|
|
476
|
+
generateBodyMutations(skillPath),
|
|
477
|
+
agent
|
|
478
|
+
? Promise.resolve(
|
|
479
|
+
generateReflectiveRoutingMutations(skillPath, weaknesses, {
|
|
480
|
+
maxVariants: 1,
|
|
481
|
+
skillName: candidate.skill,
|
|
482
|
+
agent,
|
|
483
|
+
}).catch(() => []),
|
|
484
|
+
)
|
|
485
|
+
: Promise.resolve([]),
|
|
486
|
+
agent
|
|
487
|
+
? Promise.resolve(
|
|
488
|
+
generateReflectiveBodyMutations(skillPath, weaknesses, {
|
|
489
|
+
maxVariants: 1,
|
|
490
|
+
skillName: candidate.skill,
|
|
491
|
+
agent,
|
|
492
|
+
}).catch(() => []),
|
|
493
|
+
)
|
|
494
|
+
: Promise.resolve([]),
|
|
495
|
+
Promise.resolve(generateTargetedRoutingMutations(skillPath, weaknesses)),
|
|
496
|
+
Promise.resolve(generateTargetedBodyMutations(skillPath, weaknesses)),
|
|
497
|
+
]);
|
|
498
|
+
|
|
499
|
+
allMutations = [
|
|
500
|
+
...reflectiveRoutingMutations,
|
|
501
|
+
...reflectiveBodyMutations,
|
|
502
|
+
...targetedRoutingMutations,
|
|
503
|
+
...targetedBodyMutations,
|
|
504
|
+
...routingMutations,
|
|
505
|
+
...bodyMutations,
|
|
506
|
+
];
|
|
507
|
+
if (allMutations.length === 0) {
|
|
508
|
+
candidate.packageSearchResult = {
|
|
509
|
+
searched: false,
|
|
510
|
+
winnerApplied: false,
|
|
511
|
+
candidateCount: 0,
|
|
512
|
+
};
|
|
513
|
+
candidate.reason = "no mutations generated";
|
|
514
|
+
console.error(` [pkg-search] ${candidate.skill}: no mutations generated`);
|
|
515
|
+
continue;
|
|
516
|
+
}
|
|
517
|
+
|
|
518
|
+
// Fingerprint and deduplicate each variant.
|
|
519
|
+
const candidatePaths: Array<{ skill_path: string; fingerprint: string }> = [];
|
|
520
|
+
const seenFingerprints = new Set<string>();
|
|
521
|
+
for (const mutation of allMutations) {
|
|
522
|
+
const fp = computeCreatePackageFingerprint(mutation.variantSkillPath);
|
|
523
|
+
if (fp && !seenFingerprints.has(fp)) {
|
|
524
|
+
seenFingerprints.add(fp);
|
|
525
|
+
candidatePaths.push({ skill_path: mutation.variantSkillPath, fingerprint: fp });
|
|
526
|
+
}
|
|
527
|
+
}
|
|
528
|
+
|
|
529
|
+
if (candidatePaths.length === 0) {
|
|
530
|
+
cleanupVariants(allMutations);
|
|
531
|
+
candidate.packageSearchResult = {
|
|
532
|
+
searched: false,
|
|
533
|
+
winnerApplied: false,
|
|
534
|
+
candidateCount: 0,
|
|
535
|
+
};
|
|
536
|
+
candidate.reason = "no fingerprints computed";
|
|
537
|
+
console.error(` [pkg-search] ${candidate.skill}: no fingerprints computed`);
|
|
538
|
+
continue;
|
|
539
|
+
}
|
|
540
|
+
|
|
541
|
+
console.error(
|
|
542
|
+
` [pkg-search] ${candidate.skill}: searching ${candidatePaths.length} variant(s)...`,
|
|
543
|
+
);
|
|
544
|
+
|
|
545
|
+
// Run the package search
|
|
546
|
+
const searchResult = await runPackageSearch({
|
|
547
|
+
skill_name: candidate.skill,
|
|
548
|
+
candidate_paths: candidatePaths,
|
|
549
|
+
agent: agent ?? undefined,
|
|
550
|
+
db: getDb(),
|
|
551
|
+
});
|
|
552
|
+
|
|
553
|
+
const searchedResult: PackageSearchResult = {
|
|
554
|
+
searched: true,
|
|
555
|
+
winnerApplied: false,
|
|
556
|
+
candidateCount: candidatePaths.length,
|
|
557
|
+
};
|
|
558
|
+
|
|
559
|
+
// Apply winner if found
|
|
560
|
+
if (searchResult.winner_candidate_id) {
|
|
561
|
+
console.error(` [pkg-search] ${candidate.skill}: winner found, applying...`);
|
|
562
|
+
const applyResult = applySearchRunWinner(
|
|
563
|
+
candidate.skill,
|
|
564
|
+
skillPath,
|
|
565
|
+
searchResult.winner_candidate_id,
|
|
566
|
+
);
|
|
567
|
+
searchedResult.winnerApplied = applyResult.applied_winner;
|
|
568
|
+
searchedResult.winnerCandidateId = searchResult.winner_candidate_id;
|
|
569
|
+
|
|
570
|
+
if (applyResult.applied_winner) {
|
|
571
|
+
console.error(` [pkg-search] ${candidate.skill}: winner applied successfully`);
|
|
572
|
+
improved.push(candidate);
|
|
573
|
+
} else {
|
|
574
|
+
console.error(` [pkg-search] ${candidate.skill}: winner could not be applied`);
|
|
575
|
+
}
|
|
576
|
+
} else {
|
|
577
|
+
console.error(` [pkg-search] ${candidate.skill}: no winner found`);
|
|
578
|
+
}
|
|
579
|
+
|
|
580
|
+
candidate.packageSearchResult = searchedResult;
|
|
581
|
+
} catch (err) {
|
|
582
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
583
|
+
candidate.action = "skip";
|
|
584
|
+
candidate.reason = `package-search error: ${msg}`;
|
|
585
|
+
console.error(` [pkg-search] ${candidate.skill}: error — ${msg}`);
|
|
586
|
+
} finally {
|
|
587
|
+
if (allMutations.length > 0) {
|
|
588
|
+
cleanupVariants(allMutations);
|
|
589
|
+
}
|
|
590
|
+
}
|
|
591
|
+
}
|
|
592
|
+
|
|
593
|
+
return improved;
|
|
594
|
+
}
|