selftune 0.2.31 → 0.2.32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95):
  1. package/README.md +83 -56
  2. package/apps/local-dashboard/dist/assets/index-B-ut4w0B.js +15 -0
  3. package/apps/local-dashboard/dist/assets/index-BFGfCVrL.css +1 -0
  4. package/apps/local-dashboard/dist/assets/vendor-ui-DfowE3Hu.js +1 -0
  5. package/apps/local-dashboard/dist/index.html +3 -3
  6. package/cli/selftune/command-surface.ts +613 -2
  7. package/cli/selftune/create/baseline.ts +429 -0
  8. package/cli/selftune/create/check.ts +35 -0
  9. package/cli/selftune/create/init.ts +115 -0
  10. package/cli/selftune/create/package-candidate-state.ts +771 -0
  11. package/cli/selftune/create/package-evaluator.ts +710 -0
  12. package/cli/selftune/create/package-fingerprint.ts +142 -0
  13. package/cli/selftune/create/package-search.ts +377 -0
  14. package/cli/selftune/create/publish.ts +431 -0
  15. package/cli/selftune/create/readiness.ts +495 -0
  16. package/cli/selftune/create/replay.ts +330 -0
  17. package/cli/selftune/create/report.ts +74 -0
  18. package/cli/selftune/create/scaffold.ts +121 -0
  19. package/cli/selftune/create/skills-ref-adapter.ts +177 -0
  20. package/cli/selftune/create/status.ts +33 -0
  21. package/cli/selftune/create/templates.ts +249 -0
  22. package/cli/selftune/cron/setup.ts +1 -1
  23. package/cli/selftune/dashboard-action-events.ts +4 -1
  24. package/cli/selftune/dashboard-action-result.ts +789 -24
  25. package/cli/selftune/dashboard-action-stream.ts +80 -0
  26. package/cli/selftune/dashboard-contract.ts +146 -3
  27. package/cli/selftune/dashboard-server.ts +5 -4
  28. package/cli/selftune/eval/hooks-to-evals.ts +58 -35
  29. package/cli/selftune/eval/synthetic-evals.ts +145 -17
  30. package/cli/selftune/evolution/bounded-mutations.ts +1045 -0
  31. package/cli/selftune/evolution/evolve-body.ts +9 -36
  32. package/cli/selftune/evolution/evolve.ts +8 -72
  33. package/cli/selftune/evolution/stopping-criteria.ts +5 -13
  34. package/cli/selftune/evolution/unblock-suggestions.ts +0 -16
  35. package/cli/selftune/evolution/validate-host-replay.ts +115 -15
  36. package/cli/selftune/improve.ts +206 -0
  37. package/cli/selftune/index.ts +123 -6
  38. package/cli/selftune/init.ts +1 -1
  39. package/cli/selftune/localdb/queries/dashboard.ts +30 -0
  40. package/cli/selftune/localdb/schema.ts +52 -0
  41. package/cli/selftune/monitoring/watch.ts +257 -23
  42. package/cli/selftune/orchestrate/execute.ts +300 -1
  43. package/cli/selftune/orchestrate/finalize.ts +14 -0
  44. package/cli/selftune/orchestrate/plan.ts +22 -5
  45. package/cli/selftune/orchestrate/prepare.ts +59 -4
  46. package/cli/selftune/orchestrate/report.ts +1 -1
  47. package/cli/selftune/orchestrate.ts +34 -1
  48. package/cli/selftune/publish.ts +35 -0
  49. package/cli/selftune/routes/actions.ts +81 -15
  50. package/cli/selftune/routes/overview.ts +1 -1
  51. package/cli/selftune/routes/skill-report.ts +147 -2
  52. package/cli/selftune/run.ts +18 -0
  53. package/cli/selftune/schedule.ts +3 -3
  54. package/cli/selftune/search-run.ts +703 -0
  55. package/cli/selftune/status.ts +35 -11
  56. package/cli/selftune/testing-readiness.ts +431 -40
  57. package/cli/selftune/types.ts +316 -0
  58. package/cli/selftune/utils/eval-readiness.ts +1 -0
  59. package/cli/selftune/utils/json-output.ts +11 -0
  60. package/cli/selftune/utils/lifecycle-surface.ts +48 -0
  61. package/cli/selftune/utils/query-filter.ts +82 -1
  62. package/cli/selftune/utils/tui.ts +85 -2
  63. package/cli/selftune/verify.ts +205 -0
  64. package/cli/selftune/workflows/proposals.ts +1 -1
  65. package/cli/selftune/workflows/skill-scaffold.ts +141 -63
  66. package/cli/selftune/workflows/workflows.ts +4 -4
  67. package/package.json +1 -1
  68. package/skill/SKILL.md +148 -85
  69. package/skill/references/cli-quick-reference.md +16 -1
  70. package/skill/references/creator-playbook.md +31 -10
  71. package/skill/workflows/Baseline.md +8 -9
  72. package/skill/workflows/Contributions.md +4 -4
  73. package/skill/workflows/Create.md +173 -0
  74. package/skill/workflows/CreateTestDeploy.md +34 -30
  75. package/skill/workflows/Cron.md +2 -2
  76. package/skill/workflows/Dashboard.md +3 -3
  77. package/skill/workflows/Evals.md +13 -7
  78. package/skill/workflows/Evolve.md +75 -32
  79. package/skill/workflows/EvolveBody.md +22 -15
  80. package/skill/workflows/Hook.md +1 -1
  81. package/skill/workflows/Improve.md +168 -0
  82. package/skill/workflows/Initialize.md +3 -3
  83. package/skill/workflows/Orchestrate.md +49 -12
  84. package/skill/workflows/Publish.md +100 -0
  85. package/skill/workflows/Run.md +72 -0
  86. package/skill/workflows/Schedule.md +2 -2
  87. package/skill/workflows/SearchRun.md +89 -0
  88. package/skill/workflows/SignalsDashboard.md +2 -2
  89. package/skill/workflows/UnitTest.md +13 -4
  90. package/skill/workflows/Verify.md +136 -0
  91. package/skill/workflows/Watch.md +114 -47
  92. package/skill/workflows/Workflows.md +13 -8
  93. package/apps/local-dashboard/dist/assets/index-B7v_o1WC.js +0 -15
  94. package/apps/local-dashboard/dist/assets/index-CrO77SVi.css +0 -1
  95. package/apps/local-dashboard/dist/assets/vendor-ui-B0H8s1mP.js +0 -1
@@ -21,8 +21,10 @@ import {
21
21
  querySkillUsageRecords,
22
22
  } from "../localdb/queries.js";
23
23
  import { updateContextAfterWatch } from "../memory/writer.js";
24
+ import { readCanonicalPackageEvaluationArtifact } from "../testing-readiness.js";
24
25
  import type { SyncResult } from "../sync.js";
25
26
  import type {
27
+ CreatePackageEvaluationWatchEfficiencyRegressionSummary,
26
28
  InvocationType,
27
29
  MonitoringSnapshot,
28
30
  QueryLogRecord,
@@ -49,6 +51,10 @@ export interface WatchOptions {
49
51
  gradeRegressionThreshold?: number;
50
52
  /** Enable grade-based regression watch (default true). */
51
53
  enableGradeWatch?: boolean;
54
+ /** Relative regression threshold for observed efficiency (default 0.25). */
55
+ efficiencyRegressionThreshold?: number;
56
+ /** Enable efficiency-based regression watch (default true). */
57
+ enableEfficiencyWatch?: boolean;
52
58
  /** Injected log paths for testing (override defaults). */
53
59
  _telemetryLogPath?: string;
54
60
  _skillLogPath?: string;
@@ -71,9 +77,59 @@ export interface WatchResult {
71
77
  alert: string | null;
72
78
  rolledBack: boolean;
73
79
  recommendation: string;
80
+ recommended_command?: string | null;
74
81
  sync_result?: SyncResult;
75
82
  gradeAlert?: string | null;
76
83
  gradeRegression?: { before: number; after: number; delta: number } | null;
84
+ efficiencyAlert?: string | null;
85
+ efficiencyRegression?: CreatePackageEvaluationWatchEfficiencyRegressionSummary | null;
86
+ }
87
+
88
+ // ---------------------------------------------------------------------------
89
+ // Watch trust scoring — aggregates watch signals into a 0-1 trust score
90
+ // ---------------------------------------------------------------------------
91
+
92
/**
 * Compute a trust score in the range [0, 1] from a WatchResult.
 *
 * Scoring rules, applied in order:
 * - Fewer than MIN_MONITORING_SKILL_CHECKS skill checks: returns exactly 0.5
 *   (not enough data to form an opinion; no other signal is consulted).
 * - Start from 1.0.
 * - Trigger pass-rate regression (`snapshot.regression_detected`): -0.5.
 * - Grade regression: -min(|delta| * 2, 0.3). The magnitude of the delta is
 *   used so the penalty never turns into a bonus regardless of the sign
 *   convention of `delta`; a non-finite delta takes the full 0.3 penalty.
 * - Any other active alert (no trigger or grade regression): -0.2.
 * - Rolled back: -0.2.
 *
 * @param watchResult - Result of a watch run.
 * @returns The trust score clamped to [0, 1].
 */
export function computeWatchTrustScore(watchResult: WatchResult): number {
  const { snapshot, alert, gradeRegression, rolledBack } = watchResult;

  // Not enough data to form a trust opinion.
  if (snapshot.skill_checks < MIN_MONITORING_SKILL_CHECKS) {
    return 0.5;
  }

  const MAX_GRADE_PENALTY = 0.3;
  let score = 1.0;

  // Trigger pass rate regression: major trust penalty.
  if (snapshot.regression_detected) {
    score -= 0.5;
  }

  // Grade regression: penalty scaled by the size of the drop (max 0.3).
  if (gradeRegression) {
    const magnitude = Math.abs(gradeRegression.delta);
    score -= Number.isFinite(magnitude)
      ? Math.min(magnitude * 2, MAX_GRADE_PENALTY)
      : MAX_GRADE_PENALTY;
  }

  // Any active alert that is not explained by the two regressions above.
  if (alert && !snapshot.regression_detected && !gradeRegression) {
    score -= 0.2;
  }

  // A rollback happened: significant trust hit.
  if (rolledBack) {
    score -= 0.2;
  }

  return Math.max(0, Math.min(1, score));
}
78
134
 
79
135
  // ---------------------------------------------------------------------------
@@ -83,8 +139,170 @@ export interface WatchResult {
83
139
  const DEFAULT_BASELINE_PASS_RATE = 0.5;
84
140
  const DEFAULT_REGRESSION_THRESHOLD = 0.1;
85
141
  const DEFAULT_GRADE_REGRESSION_THRESHOLD = 0.15;
142
+ const DEFAULT_EFFICIENCY_REGRESSION_THRESHOLD = 0.25;
86
143
  export const MIN_MONITORING_SKILL_CHECKS = 3;
87
144
 
145
/** The telemetry, skill-check and query records selected for one monitoring pass. */
type MonitoringWindow = {
  telemetry: SessionTelemetryRecord[];
  skillRecords: SkillUsageRecord[];
  queryRecords: QueryLogRecord[];
};

/**
 * Select the records that belong to the current monitoring window.
 *
 * Skill and query records are first reduced to their actionable subsets, and
 * skill records are further restricted to `skillName`. The window is the last
 * `windowSessions` telemetry entries (by array order). Session-based windowing
 * is applied only when the window is non-empty and at least one skill or query
 * record belongs to a windowed session; otherwise the unwindowed telemetry and
 * the name-filtered/actionable records are returned as-is.
 */
function selectMonitoringWindow(
  skillName: string,
  telemetry: SessionTelemetryRecord[],
  skillRecords: SkillUsageRecord[],
  queryRecords: QueryLogRecord[],
  windowSessions: number,
): MonitoringWindow {
  const skillChecks = filterActionableSkillUsageRecords(skillRecords).filter(
    (record) => record.skill_name === skillName,
  );
  const queries = filterActionableQueryRecords(queryRecords);

  const recentTelemetry = telemetry.slice(-windowSessions);
  const recentSessionIds = new Set(recentTelemetry.map((record) => record.session_id));
  const isRecent = (record: { session_id: SessionTelemetryRecord["session_id"] }): boolean =>
    recentSessionIds.has(record.session_id);

  const overlapsWindow =
    recentSessionIds.size > 0 && (skillChecks.some(isRecent) || queries.some(isRecent));

  // No usable overlap between telemetry sessions and records: skip windowing.
  if (!overlapsWindow) {
    return { telemetry, skillRecords: skillChecks, queryRecords: queries };
  }

  return {
    telemetry: recentTelemetry.filter(isRecent),
    skillRecords: skillChecks.filter(isRecent),
    queryRecords: queries.filter(isRecent),
  };
}
181
+
182
/**
 * Arithmetic mean of the usable numeric entries in `values`.
 *
 * `null`, `undefined`, `NaN` and `±Infinity` entries are skipped so a single
 * bad sample cannot poison the average.
 *
 * @param values - Samples, possibly containing gaps.
 * @returns The mean of the finite numbers, or `null` when there are none.
 */
function averageNullable(values: Array<number | null | undefined>): number | null {
  let total = 0;
  let count = 0;
  for (const value of values) {
    if (typeof value === "number" && Number.isFinite(value)) {
      total += value;
      count += 1;
    }
  }
  return count === 0 ? null : total / count;
}
187
+
188
/**
 * Divide `total` by `count`, tolerating missing operands.
 *
 * @returns `total / count`, or `null` when either operand is not a number or
 *   `count` is zero or negative.
 */
function divideNullable(
  total: number | null | undefined,
  count: number | null | undefined,
): number | null {
  if (typeof total === "number" && typeof count === "number" && !(count <= 0)) {
    return total / count;
  }
  return null;
}
192
+
193
/**
 * Relative change of `observed` against `baseline`.
 *
 * @returns `(observed - baseline) / baseline`, or `null` when either value is
 *   missing or the baseline is zero or negative.
 */
function computeDeltaRatio(observed: number | null, baseline: number | null): number | null {
  if (observed == null) return null;
  if (baseline == null) return null;

  const baselineIsUsable = !(baseline <= 0);
  return baselineIsUsable ? (observed - baseline) / baseline : null;
}
197
+
198
/**
 * Compare observed efficiency for `skillName` against the `with_skill`
 * efficiency stored in its canonical package evaluation artifact.
 *
 * No signal (both fields `null`) is returned when there is no baseline, when
 * fewer than MIN_MONITORING_SKILL_CHECKS distinct sessions triggered the skill,
 * or when fewer than MIN_MONITORING_SKILL_CHECKS telemetry records exist for
 * those sessions. Otherwise a summary of baseline/observed averages and their
 * relative deltas is returned, together with an alert string when any delta
 * exceeds `efficiencyRegressionThreshold`.
 */
function buildEfficiencyRegression(
  skillName: string,
  telemetry: SessionTelemetryRecord[],
  skillRecords: SkillUsageRecord[],
  efficiencyRegressionThreshold: number,
): {
  efficiencyAlert: string | null;
  efficiencyRegression: CreatePackageEvaluationWatchEfficiencyRegressionSummary | null;
} {
  const noSignal = { efficiencyAlert: null, efficiencyRegression: null };

  const baseline = readCanonicalPackageEvaluationArtifact(skillName)?.summary.efficiency
    ?.with_skill;
  if (!baseline) return noSignal;

  // Sessions in which the skill actually triggered.
  const triggeredSessionIds = new Set<SkillUsageRecord["session_id"]>();
  for (const record of skillRecords) {
    if (record.triggered) triggeredSessionIds.add(record.session_id);
  }
  if (triggeredSessionIds.size < MIN_MONITORING_SKILL_CHECKS) return noSignal;

  const observedTelemetry = telemetry.filter((record) =>
    triggeredSessionIds.has(record.session_id),
  );
  if (observedTelemetry.length < MIN_MONITORING_SKILL_CHECKS) return noSignal;

  // Average one telemetry field over the observed sessions, ignoring gaps.
  const observedAverage = (
    pick: (record: SessionTelemetryRecord) => number | null | undefined,
  ): number | null => averageNullable(observedTelemetry.map((record) => pick(record) ?? null));

  const baselineDuration = baseline.avg_duration_ms;
  const baselineInputTokens = divideNullable(baseline.total_input_tokens, baseline.eval_runs);
  const baselineOutputTokens = divideNullable(baseline.total_output_tokens, baseline.eval_runs);
  const baselineTurns = divideNullable(baseline.total_turns, baseline.eval_runs);

  const observedDuration = observedAverage((record) => record.duration_ms);
  const observedInputTokens = observedAverage((record) => record.input_tokens);
  const observedOutputTokens = observedAverage((record) => record.output_tokens);
  const observedTurns = observedAverage((record) => record.assistant_turns);

  const efficiencyRegression: CreatePackageEvaluationWatchEfficiencyRegressionSummary = {
    sample_size: observedTelemetry.length,
    baseline_avg_duration_ms: baselineDuration,
    observed_avg_duration_ms: observedDuration,
    duration_delta_ratio: computeDeltaRatio(observedDuration, baselineDuration),
    baseline_avg_input_tokens: baselineInputTokens,
    observed_avg_input_tokens: observedInputTokens,
    input_tokens_delta_ratio: computeDeltaRatio(observedInputTokens, baselineInputTokens),
    baseline_avg_output_tokens: baselineOutputTokens,
    observed_avg_output_tokens: observedOutputTokens,
    output_tokens_delta_ratio: computeDeltaRatio(observedOutputTokens, baselineOutputTokens),
    baseline_avg_turns: baselineTurns,
    observed_avg_turns: observedTurns,
    turns_delta_ratio: computeDeltaRatio(observedTurns, baselineTurns),
  };

  // Metrics are reported in this fixed order inside the alert message.
  const metricRatios: Array<[string, number | null]> = [
    ["duration", efficiencyRegression.duration_delta_ratio],
    ["input_tokens", efficiencyRegression.input_tokens_delta_ratio],
    ["output_tokens", efficiencyRegression.output_tokens_delta_ratio],
    ["turns", efficiencyRegression.turns_delta_ratio],
  ];
  const regressions: string[] = [];
  for (const [label, ratio] of metricRatios) {
    if (ratio != null && ratio > efficiencyRegressionThreshold) {
      regressions.push(`${label} +${(ratio * 100).toFixed(1)}%`);
    }
  }

  let efficiencyAlert: string | null = null;
  if (regressions.length > 0) {
    efficiencyAlert = `efficiency regression detected for "${skillName}": ${regressions.join(", ")} exceeds threshold=${(efficiencyRegressionThreshold * 100).toFixed(1)}%`;
  }

  return { efficiencyAlert, efficiencyRegression };
}
305
+
88
306
  // ---------------------------------------------------------------------------
89
307
  // computeMonitoringSnapshot - pure function
90
308
  // ---------------------------------------------------------------------------
@@ -114,27 +332,8 @@ export function computeMonitoringSnapshot(
114
332
  baselinePassRate: number,
115
333
  regressionThreshold: number = DEFAULT_REGRESSION_THRESHOLD,
116
334
  ): MonitoringSnapshot {
117
- // 1. Window the telemetry to the last N sessions (by array order, assumed chronological)
118
- const actionableSkillRecords = filterActionableSkillUsageRecords(skillRecords);
119
- const actionableQueryRecords = filterActionableQueryRecords(queryRecords);
120
- const windowedTelemetry = telemetry.slice(-windowSessions);
121
- const windowedSessionIds = new Set(windowedTelemetry.map((t) => t.session_id));
122
-
123
- // 2. Filter skill records by skill name first
124
- const skillNameFiltered = actionableSkillRecords.filter((r) => r.skill_name === skillName);
125
-
126
- // 3. Apply session ID windowing only if telemetry is present and overlaps
127
- const hasSessionOverlap =
128
- windowedSessionIds.size > 0 &&
129
- (skillNameFiltered.some((r) => windowedSessionIds.has(r.session_id)) ||
130
- actionableQueryRecords.some((r) => windowedSessionIds.has(r.session_id)));
131
-
132
- const filteredSkillRecords = hasSessionOverlap
133
- ? skillNameFiltered.filter((r) => windowedSessionIds.has(r.session_id))
134
- : skillNameFiltered;
135
- const filteredQueryRecords = hasSessionOverlap
136
- ? actionableQueryRecords.filter((r) => windowedSessionIds.has(r.session_id))
137
- : actionableQueryRecords;
335
+ const { skillRecords: filteredSkillRecords, queryRecords: filteredQueryRecords } =
336
+ selectMonitoringWindow(skillName, telemetry, skillRecords, queryRecords, windowSessions);
138
337
 
139
338
  // 4. Compute pass rate from explicit skill checks, not from all queries.
140
339
  const triggeredCount = filteredSkillRecords.filter((r) => r.triggered).length;
@@ -202,6 +401,8 @@ export async function watch(options: WatchOptions): Promise<WatchResult> {
202
401
  regressionThreshold = DEFAULT_REGRESSION_THRESHOLD,
203
402
  gradeRegressionThreshold = DEFAULT_GRADE_REGRESSION_THRESHOLD,
204
403
  enableGradeWatch = true,
404
+ efficiencyRegressionThreshold = DEFAULT_EFFICIENCY_REGRESSION_THRESHOLD,
405
+ enableEfficiencyWatch = true,
205
406
  autoRollback = false,
206
407
  _telemetryLogPath = TELEMETRY_LOG,
207
408
  _skillLogPath = SKILL_LOG,
@@ -246,6 +447,13 @@ export async function watch(options: WatchOptions): Promise<WatchResult> {
246
447
  baselinePassRate,
247
448
  regressionThreshold,
248
449
  );
450
+ const monitoringWindow = selectMonitoringWindow(
451
+ skillName,
452
+ telemetry,
453
+ skillRecords,
454
+ queryRecords,
455
+ windowSessions,
456
+ );
249
457
 
250
458
  // 4. Build trigger alert. Grade alerts are added below before rollback
251
459
  // decisions so either signal can drive automated rollback.
@@ -296,7 +504,22 @@ export async function watch(options: WatchOptions): Promise<WatchResult> {
296
504
  }
297
505
  }
298
506
 
299
- const alerts = [triggerAlert, gradeAlert].filter((value): value is string => Boolean(value));
507
+ let efficiencyAlert: string | null = null;
508
+ let efficiencyRegression: CreatePackageEvaluationWatchEfficiencyRegressionSummary | null = null;
509
+ if (enableEfficiencyWatch) {
510
+ const efficiencyResult = buildEfficiencyRegression(
511
+ skillName,
512
+ monitoringWindow.telemetry,
513
+ monitoringWindow.skillRecords,
514
+ efficiencyRegressionThreshold,
515
+ );
516
+ efficiencyAlert = efficiencyResult.efficiencyAlert;
517
+ efficiencyRegression = efficiencyResult.efficiencyRegression;
518
+ }
519
+
520
+ const alerts = [triggerAlert, gradeAlert, efficiencyAlert].filter((value): value is string =>
521
+ Boolean(value),
522
+ );
300
523
  const alert = alerts.length > 0 ? alerts.join("\n") : null;
301
524
 
302
525
  if (alert && autoRollback) {
@@ -311,10 +534,14 @@ export async function watch(options: WatchOptions): Promise<WatchResult> {
311
534
  }
312
535
 
313
536
  let recommendation: string;
537
+ let recommendedCommand: string | null = null;
314
538
  if (alert) {
539
+ recommendedCommand = rolledBack
540
+ ? null
541
+ : `selftune rollback --skill ${skillName} --skill-path ${skillPath}`;
315
542
  recommendation = rolledBack
316
543
  ? `Rolled back "${skillName}" to previous version. Monitor to confirm recovery.`
317
- : `Consider running: selftune rollback --skill "${skillName}" --skill-path "${skillPath}"`;
544
+ : `Consider running: ${recommendedCommand}`;
318
545
  } else if (snapshot.skill_checks < MIN_MONITORING_SKILL_CHECKS) {
319
546
  recommendation =
320
547
  `Skill "${skillName}" has only ${snapshot.skill_checks} actionable check(s) in the current window. ` +
@@ -342,8 +569,15 @@ export async function watch(options: WatchOptions): Promise<WatchResult> {
342
569
  alert,
343
570
  rolledBack,
344
571
  recommendation,
572
+ recommended_command: recommendedCommand,
345
573
  gradeAlert,
346
574
  gradeRegression,
575
+ ...(efficiencyAlert || efficiencyRegression
576
+ ? {
577
+ efficiencyAlert,
578
+ efficiencyRegression,
579
+ }
580
+ : {}),
347
581
  ...(syncResult ? { sync_result: syncResult } : {}),
348
582
  };
349
583
  }
@@ -10,7 +10,7 @@ import type { EvolveOptions, evolve as evolveSkill } from "../evolution/evolve.j
10
10
  import type { ReplayValidationOptions } from "../evolution/engines/replay-engine.js";
11
11
  import { buildRuntimeReplayValidationOptions } from "../evolution/validate-host-replay.js";
12
12
  import { findRecentlyDeployedSkills } from "./plan.js";
13
- import type { OrchestrateOptions, SkillAction } from "../orchestrate.js";
13
+ import type { OrchestrateOptions, PackageSearchResult, SkillAction } from "../orchestrate.js";
14
14
  import type { EvolutionAuditEntry, SessionTelemetryRecord, SkillUsageRecord } from "../types.js";
15
15
  import { readExcerpt } from "../utils/transcript.js";
16
16
 
@@ -293,3 +293,302 @@ export async function watchRecentDeploys(
293
293
 
294
294
  return { freshAuditEntries, freshlyWatchedSkills };
295
295
  }
296
+
297
+ // ---------------------------------------------------------------------------
298
+ // Package Search Phase
299
+ // ---------------------------------------------------------------------------
300
+
301
/** Input for {@link runPackageSearchPhase}. */
export interface RunPackageSearchPhaseInput {
  /** Candidates to search; entries are annotated in place with the outcome. */
  packageSearchCandidates: SkillAction[];
  /** When true, candidates are marked as not searched and nothing is evaluated. */
  dryRun: boolean;
  /** Agent forwarded to reflective mutation generation and package search; `null` disables reflective mutations. */
  agent: string | null;
  /** Maps a skill name to its skill path, or `undefined` when it cannot be found. */
  resolveSkillPath: (skillName: string) => string | undefined;
  /** Optional dependency overrides; used only when every entry is provided. */
  deps?: RunPackageSearchPhaseDeps;
}
308
+
309
type BoundedMutationsModule = typeof import("../evolution/bounded-mutations.js");
type PackageFingerprintModule = typeof import("../create/package-fingerprint.js");
type PackageSearchModule = typeof import("../create/package-search.js");
type SearchRunModule = typeof import("../search-run.js");
type LocalDbModule = typeof import("../localdb/db.js");

/**
 * Injectable collaborators for the package search phase. Each entry has the
 * exact type of the corresponding export of the module it replaces.
 */
export interface RunPackageSearchPhaseDeps {
  // evolution/bounded-mutations
  generateReflectiveRoutingMutations?: BoundedMutationsModule["generateReflectiveRoutingMutations"];
  generateReflectiveBodyMutations?: BoundedMutationsModule["generateReflectiveBodyMutations"];
  generateRoutingMutations?: BoundedMutationsModule["generateRoutingMutations"];
  generateBodyMutations?: BoundedMutationsModule["generateBodyMutations"];
  generateTargetedRoutingMutations?: BoundedMutationsModule["generateTargetedRoutingMutations"];
  generateTargetedBodyMutations?: BoundedMutationsModule["generateTargetedBodyMutations"];
  extractMutationWeaknesses?: BoundedMutationsModule["extractMutationWeaknesses"];
  cleanupVariants?: BoundedMutationsModule["cleanupVariants"];
  // create/package-fingerprint
  computeCreatePackageFingerprint?: PackageFingerprintModule["computeCreatePackageFingerprint"];
  // create/package-search
  runPackageSearch?: PackageSearchModule["runPackageSearch"];
  // search-run
  applySearchRunWinner?: SearchRunModule["applySearchRunWinner"];
  // localdb/db
  getDb?: LocalDbModule["getDb"];
}
323
+
324
/**
 * Resolve the collaborators used by the package search phase.
 *
 * Injected `deps` are used only when every entry is provided; otherwise all
 * collaborators are loaded lazily from their modules. Rejects when a module
 * cannot be imported.
 */
async function loadPackageSearchModules(
  deps: RunPackageSearchPhaseDeps,
): Promise<Required<RunPackageSearchPhaseDeps>> {
  const {
    generateReflectiveRoutingMutations,
    generateReflectiveBodyMutations,
    generateRoutingMutations,
    generateBodyMutations,
    generateTargetedRoutingMutations,
    generateTargetedBodyMutations,
    extractMutationWeaknesses,
    cleanupVariants,
    computeCreatePackageFingerprint,
    runPackageSearch,
    applySearchRunWinner,
    getDb,
  } = deps;

  if (
    generateReflectiveRoutingMutations &&
    generateReflectiveBodyMutations &&
    generateRoutingMutations &&
    generateBodyMutations &&
    generateTargetedRoutingMutations &&
    generateTargetedBodyMutations &&
    extractMutationWeaknesses &&
    cleanupVariants &&
    computeCreatePackageFingerprint &&
    runPackageSearch &&
    applySearchRunWinner &&
    getDb
  ) {
    return {
      generateReflectiveRoutingMutations,
      generateReflectiveBodyMutations,
      generateRoutingMutations,
      generateBodyMutations,
      generateTargetedRoutingMutations,
      generateTargetedBodyMutations,
      extractMutationWeaknesses,
      cleanupVariants,
      computeCreatePackageFingerprint,
      runPackageSearch,
      applySearchRunWinner,
      getDb,
    };
  }

  const boundedMutations = await import("../evolution/bounded-mutations.js");
  const fingerprint = await import("../create/package-fingerprint.js");
  const packageSearch = await import("../create/package-search.js");
  const searchRun = await import("../search-run.js");
  const localdb = await import("../localdb/db.js");

  return {
    generateReflectiveRoutingMutations: boundedMutations.generateReflectiveRoutingMutations,
    generateReflectiveBodyMutations: boundedMutations.generateReflectiveBodyMutations,
    generateRoutingMutations: boundedMutations.generateRoutingMutations,
    generateBodyMutations: boundedMutations.generateBodyMutations,
    generateTargetedRoutingMutations: boundedMutations.generateTargetedRoutingMutations,
    generateTargetedBodyMutations: boundedMutations.generateTargetedBodyMutations,
    extractMutationWeaknesses: boundedMutations.extractMutationWeaknesses,
    cleanupVariants: boundedMutations.cleanupVariants,
    computeCreatePackageFingerprint: fingerprint.computeCreatePackageFingerprint,
    runPackageSearch: packageSearch.runPackageSearch,
    applySearchRunWinner: searchRun.applySearchRunWinner,
    getDb: localdb.getDb,
  };
}

/**
 * Runs bounded package search for candidates tagged with action "package-search".
 *
 * For each candidate:
 * 1. Resolves the skill path (candidates without one are marked "skip")
 * 2. Generates reflective, targeted and deterministic routing + body mutations
 * 3. Fingerprints each variant and drops duplicates
 * 4. Runs package search evaluation across the unique variants
 * 5. Applies the winning candidate if one is found
 * 6. Cleans up temporary variant files (exactly once, even on early exit/error)
 *
 * Candidates are annotated in place (`packageSearchResult`, `action`, `reason`).
 * In dry-run mode nothing is generated or evaluated. A failure while processing
 * one candidate, including a failure to clean up its variants, is logged and
 * does not prevent the remaining candidates from being processed.
 *
 * @param input - Candidates, run mode and collaborators.
 * @returns The candidates for which a winner was found and applied.
 */
export async function runPackageSearchPhase(
  input: RunPackageSearchPhaseInput,
): Promise<SkillAction[]> {
  const { packageSearchCandidates, dryRun, agent, resolveSkillPath, deps = {} } = input;

  if (packageSearchCandidates.length === 0) return [];

  console.error(
    `[orchestrate] Package search: ${packageSearchCandidates.length} candidate(s)${dryRun ? " (dry-run)" : ""}`,
  );

  // Pre-resolve skill paths and handle dry-run before loading optional modules
  const resolved: Array<{ candidate: SkillAction; skillPath: string }> = [];
  for (const candidate of packageSearchCandidates) {
    const skillPath = resolveSkillPath(candidate.skill);
    if (!skillPath) {
      candidate.action = "skip";
      candidate.reason = `SKILL.md not found for "${candidate.skill}"`;
      console.error(` [pkg-search] ${candidate.skill}: ${candidate.reason}`);
      continue;
    }

    if (dryRun) {
      candidate.packageSearchResult = {
        searched: false,
        winnerApplied: false,
        candidateCount: 0,
      };
      console.error(` [pkg-search] ${candidate.skill}: skipped (dry-run)`);
      continue;
    }

    resolved.push({ candidate, skillPath });
  }

  // Nothing left to process after path resolution and dry-run filtering
  if (resolved.length === 0) return [];

  // Lazy-load package search dependencies. These modules are optional and may
  // not exist yet if the package-search feature is still being built.
  let modules: Required<RunPackageSearchPhaseDeps>;
  try {
    modules = await loadPackageSearchModules(deps);
  } catch (err) {
    const msg = err instanceof Error ? err.message : String(err);
    console.error(`[orchestrate] Package search modules not available — skipping. ${msg}`);
    for (const { candidate } of resolved) {
      candidate.action = "skip";
      candidate.reason = `package-search modules unavailable: ${msg}`;
    }
    return [];
  }

  const {
    generateReflectiveRoutingMutations,
    generateReflectiveBodyMutations,
    generateRoutingMutations,
    generateBodyMutations,
    generateTargetedRoutingMutations,
    generateTargetedBodyMutations,
    extractMutationWeaknesses,
    cleanupVariants,
    computeCreatePackageFingerprint,
    runPackageSearch,
    applySearchRunWinner,
    getDb,
  } = modules;

  const improved: SkillAction[] = [];

  for (const { candidate, skillPath } of resolved) {
    let allMutations: Array<{
      variantSkillPath: string;
      mutationSurface: "routing" | "body";
      mutationDescription: string;
      parentFingerprint: string;
    }> = [];
    try {
      console.error(` [pkg-search] ${candidate.skill}: generating bounded mutations...`);
      const db = getDb();
      const weaknesses = extractMutationWeaknesses(candidate.skill, db);

      // Generate reflective, targeted, and deterministic mutations in
      // priority order. Reflective variants consume measured evaluator evidence
      // first, then bounded heuristics fill the remaining space.
      const [
        routingMutations,
        bodyMutations,
        reflectiveRoutingMutations,
        reflectiveBodyMutations,
        targetedRoutingMutations,
        targetedBodyMutations,
      ] = await Promise.all([
        generateRoutingMutations(skillPath),
        generateBodyMutations(skillPath),
        agent
          ? Promise.resolve(
              generateReflectiveRoutingMutations(skillPath, weaknesses, {
                maxVariants: 1,
                skillName: candidate.skill,
                agent,
              }).catch(() => []),
            )
          : Promise.resolve([]),
        agent
          ? Promise.resolve(
              generateReflectiveBodyMutations(skillPath, weaknesses, {
                maxVariants: 1,
                skillName: candidate.skill,
                agent,
              }).catch(() => []),
            )
          : Promise.resolve([]),
        Promise.resolve(generateTargetedRoutingMutations(skillPath, weaknesses)),
        Promise.resolve(generateTargetedBodyMutations(skillPath, weaknesses)),
      ]);

      allMutations = [
        ...reflectiveRoutingMutations,
        ...reflectiveBodyMutations,
        ...targetedRoutingMutations,
        ...targetedBodyMutations,
        ...routingMutations,
        ...bodyMutations,
      ];
      if (allMutations.length === 0) {
        candidate.packageSearchResult = {
          searched: false,
          winnerApplied: false,
          candidateCount: 0,
        };
        candidate.reason = "no mutations generated";
        console.error(` [pkg-search] ${candidate.skill}: no mutations generated`);
        continue;
      }

      // Fingerprint and deduplicate each variant.
      const candidatePaths: Array<{ skill_path: string; fingerprint: string }> = [];
      const seenFingerprints = new Set<string>();
      for (const mutation of allMutations) {
        const fp = computeCreatePackageFingerprint(mutation.variantSkillPath);
        if (fp && !seenFingerprints.has(fp)) {
          seenFingerprints.add(fp);
          candidatePaths.push({ skill_path: mutation.variantSkillPath, fingerprint: fp });
        }
      }

      if (candidatePaths.length === 0) {
        // Variant cleanup happens in the `finally` block below, which also runs
        // on `continue`; cleaning up here as well would do the work twice.
        candidate.packageSearchResult = {
          searched: false,
          winnerApplied: false,
          candidateCount: 0,
        };
        candidate.reason = "no fingerprints computed";
        console.error(` [pkg-search] ${candidate.skill}: no fingerprints computed`);
        continue;
      }

      console.error(
        ` [pkg-search] ${candidate.skill}: searching ${candidatePaths.length} variant(s)...`,
      );

      // Run the package search
      const searchResult = await runPackageSearch({
        skill_name: candidate.skill,
        candidate_paths: candidatePaths,
        agent: agent ?? undefined,
        db: getDb(),
      });

      const searchedResult: PackageSearchResult = {
        searched: true,
        winnerApplied: false,
        candidateCount: candidatePaths.length,
      };

      // Apply winner if found
      if (searchResult.winner_candidate_id) {
        console.error(` [pkg-search] ${candidate.skill}: winner found, applying...`);
        const applyResult = applySearchRunWinner(
          candidate.skill,
          skillPath,
          searchResult.winner_candidate_id,
        );
        searchedResult.winnerApplied = applyResult.applied_winner;
        searchedResult.winnerCandidateId = searchResult.winner_candidate_id;

        if (applyResult.applied_winner) {
          console.error(` [pkg-search] ${candidate.skill}: winner applied successfully`);
          improved.push(candidate);
        } else {
          console.error(` [pkg-search] ${candidate.skill}: winner could not be applied`);
        }
      } else {
        console.error(` [pkg-search] ${candidate.skill}: no winner found`);
      }

      candidate.packageSearchResult = searchedResult;
    } catch (err) {
      const msg = err instanceof Error ? err.message : String(err);
      candidate.action = "skip";
      candidate.reason = `package-search error: ${msg}`;
      console.error(` [pkg-search] ${candidate.skill}: error — ${msg}`);
    } finally {
      if (allMutations.length > 0) {
        // An exception thrown from `finally` would bypass the catch above and
        // abort the whole phase, so a cleanup failure is logged instead.
        try {
          cleanupVariants(allMutations);
        } catch (cleanupErr) {
          const msg = cleanupErr instanceof Error ? cleanupErr.message : String(cleanupErr);
          console.error(` [pkg-search] ${candidate.skill}: variant cleanup failed — ${msg}`);
        }
      }
    }
  }

  return improved;
}