selftune 0.2.16 → 0.2.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. package/README.md +32 -22
  2. package/apps/local-dashboard/dist/assets/index-DnhnXQm6.js +60 -0
  3. package/apps/local-dashboard/dist/assets/index-_EcLywDg.css +1 -0
  4. package/apps/local-dashboard/dist/assets/vendor-table-BIiI3YhS.js +1 -0
  5. package/apps/local-dashboard/dist/assets/vendor-ui-CGEmUayx.js +12 -0
  6. package/apps/local-dashboard/dist/index.html +5 -5
  7. package/cli/selftune/alpha-upload/build-payloads.ts +14 -1
  8. package/cli/selftune/alpha-upload/client.ts +51 -1
  9. package/cli/selftune/alpha-upload/flush.ts +46 -5
  10. package/cli/selftune/alpha-upload/stage-canonical.ts +32 -10
  11. package/cli/selftune/alpha-upload-contract.ts +9 -0
  12. package/cli/selftune/constants.ts +92 -5
  13. package/cli/selftune/contribute/contribute.ts +30 -2
  14. package/cli/selftune/contribute/sanitize.ts +52 -5
  15. package/cli/selftune/contribution-config.ts +249 -0
  16. package/cli/selftune/contribution-relay.ts +177 -0
  17. package/cli/selftune/contribution-signals.ts +219 -0
  18. package/cli/selftune/contribution-staging.ts +147 -0
  19. package/cli/selftune/contributions.ts +532 -0
  20. package/cli/selftune/creator-contributions.ts +333 -0
  21. package/cli/selftune/dashboard-contract.ts +305 -1
  22. package/cli/selftune/dashboard-server.ts +47 -13
  23. package/cli/selftune/eval/family-overlap.ts +395 -0
  24. package/cli/selftune/eval/hooks-to-evals.ts +182 -28
  25. package/cli/selftune/eval/synthetic-evals.ts +298 -11
  26. package/cli/selftune/evolution/description-quality.ts +12 -11
  27. package/cli/selftune/evolution/evolve.ts +214 -51
  28. package/cli/selftune/evolution/validate-proposal.ts +9 -6
  29. package/cli/selftune/export.ts +2 -2
  30. package/cli/selftune/grading/grade-session.ts +20 -0
  31. package/cli/selftune/hooks/commit-track.ts +188 -0
  32. package/cli/selftune/hooks/prompt-log.ts +10 -1
  33. package/cli/selftune/hooks/session-stop.ts +2 -2
  34. package/cli/selftune/hooks/skill-eval.ts +15 -1
  35. package/cli/selftune/hooks/stdin-preview.ts +32 -0
  36. package/cli/selftune/index.ts +41 -5
  37. package/cli/selftune/ingestors/codex-rollout.ts +31 -35
  38. package/cli/selftune/ingestors/codex-wrapper.ts +32 -24
  39. package/cli/selftune/localdb/db.ts +2 -2
  40. package/cli/selftune/localdb/direct-write.ts +69 -6
  41. package/cli/selftune/localdb/queries.ts +1253 -37
  42. package/cli/selftune/localdb/schema.ts +66 -0
  43. package/cli/selftune/orchestrate.ts +32 -4
  44. package/cli/selftune/recover.ts +153 -0
  45. package/cli/selftune/repair/skill-usage.ts +363 -4
  46. package/cli/selftune/routes/actions.ts +35 -1
  47. package/cli/selftune/routes/analytics.ts +14 -0
  48. package/cli/selftune/routes/index.ts +1 -0
  49. package/cli/selftune/routes/overview.ts +150 -4
  50. package/cli/selftune/routes/skill-report.ts +648 -18
  51. package/cli/selftune/status.ts +81 -2
  52. package/cli/selftune/sync.ts +56 -2
  53. package/cli/selftune/trust-model.ts +66 -0
  54. package/cli/selftune/types.ts +80 -0
  55. package/cli/selftune/utils/skill-detection.ts +43 -0
  56. package/cli/selftune/utils/transcript.ts +210 -1
  57. package/cli/selftune/watchlist.ts +65 -0
  58. package/node_modules/@selftune/telemetry-contract/src/types.ts +11 -0
  59. package/package.json +1 -1
  60. package/packages/telemetry-contract/src/types.ts +11 -0
  61. package/packages/ui/src/components/ActivityTimeline.tsx +165 -150
  62. package/packages/ui/src/components/EvidenceViewer.tsx +335 -144
  63. package/packages/ui/src/components/EvolutionTimeline.tsx +58 -28
  64. package/packages/ui/src/components/OrchestrateRunsPanel.tsx +33 -16
  65. package/packages/ui/src/components/RecentActivityFeed.tsx +72 -41
  66. package/packages/ui/src/components/section-cards.tsx +12 -9
  67. package/packages/ui/src/primitives/card.tsx +1 -1
  68. package/skill/SKILL.md +40 -2
  69. package/skill/Workflows/AlphaUpload.md +4 -0
  70. package/skill/Workflows/Composability.md +64 -0
  71. package/skill/Workflows/Contribute.md +6 -3
  72. package/skill/Workflows/Contributions.md +97 -0
  73. package/skill/Workflows/CreatorContributions.md +74 -0
  74. package/skill/Workflows/Dashboard.md +31 -0
  75. package/skill/Workflows/Evals.md +57 -8
  76. package/skill/Workflows/Evolve.md +31 -13
  77. package/skill/Workflows/ExportCanonical.md +121 -0
  78. package/skill/Workflows/Hook.md +131 -0
  79. package/skill/Workflows/Ingest.md +7 -0
  80. package/skill/Workflows/Initialize.md +29 -9
  81. package/skill/Workflows/Orchestrate.md +27 -5
  82. package/skill/Workflows/Quickstart.md +94 -0
  83. package/skill/Workflows/Recover.md +84 -0
  84. package/skill/Workflows/RepairSkillUsage.md +95 -0
  85. package/skill/Workflows/Sync.md +18 -12
  86. package/skill/Workflows/Uninstall.md +82 -0
  87. package/skill/settings_snippet.json +11 -0
  88. package/apps/local-dashboard/dist/assets/index-BMIS6uUh.css +0 -2
  89. package/apps/local-dashboard/dist/assets/index-DOu3iLD9.js +0 -16
  90. package/apps/local-dashboard/dist/assets/vendor-table-pHbDxq36.js +0 -8
  91. package/apps/local-dashboard/dist/assets/vendor-ui-DIwlrGlb.js +0 -12
@@ -8,10 +8,21 @@
8
8
 
9
9
  import type { Database } from "bun:sqlite";
10
10
 
11
+ import { parseCursorParam } from "../dashboard-contract.js";
11
12
  import { scoreDescription } from "../evolution/description-quality.js";
12
- import { getPendingProposals, getSkillReportPayload, safeParseJson } from "../localdb/queries.js";
13
+ import {
14
+ getExecutionMetrics,
15
+ getPendingProposals,
16
+ getSkillCommitSummary,
17
+ getSkillReportPayload,
18
+ safeParseJson,
19
+ } from "../localdb/queries.js";
13
20
 
14
- export function handleSkillReport(db: Database, skillName: string): Response {
21
+ export function handleSkillReport(
22
+ db: Database,
23
+ skillName: string,
24
+ searchParams?: URLSearchParams,
25
+ ): Response {
15
26
  const report = getSkillReportPayload(db, skillName);
16
27
 
17
28
  // 1. Evolution audit with eval_snapshot
@@ -87,21 +98,17 @@ export function handleSkillReport(db: Database, skillName: string): Response {
87
98
  run_count: selftuneRunCount,
88
99
  };
89
100
 
90
- // 4. Skill invocations — single source of truth
101
+ // 4. Skill invocations — single source of truth (with optional cursor pagination)
91
102
  // JOIN prompts to recover query text when si.query is null (canonical records
92
103
  // don't carry query; it's only populated via the direct-write hook path).
93
- const invocationsWithConfidence = db
94
- .query(
95
- `SELECT si.occurred_at as timestamp, si.session_id, si.skill_name,
96
- si.invocation_mode, si.triggered, si.confidence, si.tool_name,
97
- si.agent_type, COALESCE(si.query, p.prompt_text) as query, si.source
98
- FROM skill_invocations si
99
- LEFT JOIN prompts p ON si.matched_prompt_id = p.prompt_id
100
- WHERE si.skill_name = ?
101
- ORDER BY si.occurred_at DESC
102
- LIMIT 100`,
103
- )
104
- .all(skillName) as Array<{
104
+ const invCursor = parseCursorParam(searchParams?.get("invocations_cursor") ?? null);
105
+ const invLimitParam = searchParams?.get("invocations_limit");
106
+ const invLimit = invLimitParam
107
+ ? Math.max(1, Math.min(Number.parseInt(invLimitParam, 10) || 100, 10000))
108
+ : 100;
109
+ const invFetchLimit = invLimit + 1;
110
+
111
+ let invocationsWithConfidence: Array<{
105
112
  timestamp: string;
106
113
  session_id: string;
107
114
  skill_name: string;
@@ -112,8 +119,53 @@ export function handleSkillReport(db: Database, skillName: string): Response {
112
119
  agent_type: string | null;
113
120
  query: string | null;
114
121
  source: string | null;
122
+ skill_invocation_id: string;
123
+ capture_mode: string | null;
124
+ raw_source_ref: string | null;
115
125
  }>;
116
126
 
127
+ if (invCursor) {
128
+ invocationsWithConfidence = db
129
+ .query(
130
+ `SELECT si.occurred_at as timestamp, si.session_id, si.skill_name,
131
+ si.invocation_mode, si.triggered, si.confidence, si.tool_name,
132
+ si.agent_type, COALESCE(si.query, p.prompt_text) as query, si.source,
133
+ si.skill_invocation_id, si.capture_mode, si.raw_source_ref
134
+ FROM skill_invocations si
135
+ LEFT JOIN prompts p ON si.matched_prompt_id = p.prompt_id
136
+ WHERE si.skill_name = ?
137
+ AND (si.occurred_at < ? OR (si.occurred_at = ? AND si.skill_invocation_id < ?))
138
+ ORDER BY si.occurred_at DESC, si.skill_invocation_id DESC
139
+ LIMIT ?`,
140
+ )
141
+ .all(
142
+ skillName,
143
+ invCursor.timestamp,
144
+ invCursor.timestamp,
145
+ String(invCursor.id),
146
+ invFetchLimit,
147
+ ) as typeof invocationsWithConfidence;
148
+ } else {
149
+ invocationsWithConfidence = db
150
+ .query(
151
+ `SELECT si.occurred_at as timestamp, si.session_id, si.skill_name,
152
+ si.invocation_mode, si.triggered, si.confidence, si.tool_name,
153
+ si.agent_type, COALESCE(si.query, p.prompt_text) as query, si.source,
154
+ si.skill_invocation_id, si.capture_mode, si.raw_source_ref
155
+ FROM skill_invocations si
156
+ LEFT JOIN prompts p ON si.matched_prompt_id = p.prompt_id
157
+ WHERE si.skill_name = ?
158
+ ORDER BY si.occurred_at DESC, si.skill_invocation_id DESC
159
+ LIMIT ?`,
160
+ )
161
+ .all(skillName, invFetchLimit) as typeof invocationsWithConfidence;
162
+ }
163
+
164
+ const invHasMore = invocationsWithConfidence.length > invLimit;
165
+ const invPageRows = invHasMore
166
+ ? invocationsWithConfidence.slice(0, invLimit)
167
+ : invocationsWithConfidence;
168
+
117
169
  // Not-found check — after all enrichment queries so evidence-only skills aren't 404'd
118
170
  const hasData =
119
171
  report.usage.total_checks > 0 ||
@@ -121,7 +173,7 @@ export function handleSkillReport(db: Database, skillName: string): Response {
121
173
  report.evidence.length > 0 ||
122
174
  evolution.length > 0 ||
123
175
  pending_proposals.length > 0 ||
124
- invocationsWithConfidence.length > 0;
176
+ invPageRows.length > 0;
125
177
  if (!hasData) {
126
178
  return Response.json({ error: "Skill not found" }, { status: 404 });
127
179
  }
@@ -156,6 +208,18 @@ export function handleSkillReport(db: Database, skillName: string): Response {
156
208
  )
157
209
  .get(skillName) as { missed_triggers: number } | null;
158
210
 
211
+ // 5b. Execution metrics (enrichment columns from execution_facts)
212
+ const skillSessionIds = db
213
+ .query(`SELECT DISTINCT session_id FROM skill_invocations WHERE skill_name = ?`)
214
+ .all(skillName) as Array<{ session_id: string }>;
215
+ const executionMetrics = getExecutionMetrics(
216
+ db,
217
+ skillSessionIds.map((r) => r.session_id),
218
+ );
219
+
220
+ // 5c. Commit summary (from commit_tracking via session join)
221
+ const commitSummary = getSkillCommitSummary(db, skillName);
222
+
159
223
  // 6. Prompt texts — prefer matched prompts (the prompt that invoked the skill),
160
224
  // fall back to all prompts from sessions that used the skill.
161
225
  const promptSamples = db
@@ -219,6 +283,541 @@ export function handleSkillReport(db: Database, skillName: string): Response {
219
283
  ? scoreDescription(currentDescriptionText, skillName)
220
284
  : null;
221
285
 
286
+ // ── Trust field computation ──────────────────────────────────────────────
287
+
288
+ const SYSTEM_LIKE_PREFIXES = ["<system_instruction>", "<system-instruction>", "<command-name>"];
289
+ const INTERNAL_EVAL_MARKERS = [
290
+ "you are an evaluation assistant",
291
+ "you are a skill description optimizer",
292
+ "would each query trigger this skill",
293
+ "propose an improved description",
294
+ "failure patterns:",
295
+ "output only valid json",
296
+ ];
297
+ const isSystemLike = (text: string | null | undefined): boolean => {
298
+ if (!text) return false;
299
+ const trimmed = text.trimStart();
300
+ return SYSTEM_LIKE_PREFIXES.some((p) => trimmed.startsWith(p));
301
+ };
302
+ const isInternalSelftunePrompt = (
303
+ text: string | null | undefined,
304
+ promptKind: string | null | undefined,
305
+ ): boolean => {
306
+ if (!text) return false;
307
+ const lowered = text.toLowerCase();
308
+ return (
309
+ promptKind === "meta" && INTERNAL_EVAL_MARKERS.some((marker) => lowered.includes(marker))
310
+ );
311
+ };
312
+ const isPollutingPrompt = (
313
+ text: string | null | undefined,
314
+ promptKind: string | null | undefined,
315
+ ): boolean => isSystemLike(text) || isInternalSelftunePrompt(text, promptKind);
316
+ const classifyObservationKind = (
317
+ skillInvocationId: string,
318
+ captureMode: string | null,
319
+ triggered: number,
320
+ rawSourceRefJson: string | null,
321
+ ): "canonical" | "repaired_trigger" | "repaired_contextual_miss" | "legacy_materialized" => {
322
+ if (skillInvocationId.includes(":su:")) return "legacy_materialized";
323
+ if (captureMode === "repair") {
324
+ const rawSourceRef = safeParseJson(rawSourceRefJson) as {
325
+ metadata?: { miss_type?: string };
326
+ } | null;
327
+ if (triggered === 0 && rawSourceRef?.metadata?.miss_type === "contextual_read") {
328
+ return "repaired_contextual_miss";
329
+ }
330
+ return "repaired_trigger";
331
+ }
332
+ return "canonical";
333
+ };
334
+
335
+ // Fetch all invocations for this skill with joined prompt + session data
336
+ const allInvocations = db
337
+ .query(
338
+ `SELECT si.occurred_at AS timestamp, si.session_id, si.skill_name,
339
+ si.invocation_mode, si.triggered, si.confidence, si.tool_name,
340
+ si.agent_type, si.query AS inline_query, si.source,
341
+ si.matched_prompt_id, si.skill_scope, si.skill_path,
342
+ si.skill_invocation_id, si.capture_mode, si.raw_source_ref,
343
+ p.prompt_text, p.prompt_kind,
344
+ s.platform, s.workspace_path
345
+ FROM skill_invocations si
346
+ LEFT JOIN prompts p ON si.matched_prompt_id = p.prompt_id
347
+ LEFT JOIN sessions s ON si.session_id = s.session_id
348
+ WHERE si.skill_name = ?
349
+ ORDER BY si.occurred_at DESC`,
350
+ )
351
+ .all(skillName) as Array<{
352
+ timestamp: string | null;
353
+ session_id: string;
354
+ skill_name: string;
355
+ invocation_mode: string | null;
356
+ triggered: number;
357
+ confidence: number | null;
358
+ tool_name: string | null;
359
+ agent_type: string | null;
360
+ inline_query: string | null;
361
+ source: string | null;
362
+ matched_prompt_id: string | null;
363
+ skill_scope: string | null;
364
+ skill_path: string | null;
365
+ skill_invocation_id: string;
366
+ capture_mode: string | null;
367
+ raw_source_ref: string | null;
368
+ prompt_text: string | null;
369
+ prompt_kind: string | null;
370
+ platform: string | null;
371
+ workspace_path: string | null;
372
+ }>;
373
+
374
+ const totalInv = allInvocations.length;
375
+ const safeDiv = (num: number, den: number): number => (den > 0 ? num / den : 0);
376
+
377
+ // Coverage
378
+ const distinctSessions = new Set(allInvocations.map((r) => r.session_id));
379
+ const distinctWorkspaces = new Set(allInvocations.map((r) => r.workspace_path).filter(Boolean));
380
+ const allTimestamps = allInvocations
381
+ .map((r) => r.timestamp)
382
+ .filter((t): t is string => t != null);
383
+ const coverage = {
384
+ checks: report.usage.total_checks,
385
+ sessions: distinctSessions.size,
386
+ workspaces: distinctWorkspaces.size,
387
+ first_seen: allTimestamps.length > 0 ? allTimestamps[allTimestamps.length - 1] : null,
388
+ last_seen: allTimestamps.length > 0 ? allTimestamps[0] : null,
389
+ };
390
+
391
+ // Evidence quality
392
+ let promptLinked = 0;
393
+ let inlineQueryCount = 0;
394
+ let userPromptCount = 0;
395
+ let metaPromptCount = 0;
396
+ let internalPromptCount = 0;
397
+ let noPromptCount = 0;
398
+ let systemLikeCount = 0;
399
+ let invModeCount = 0;
400
+ let confCount = 0;
401
+ let sourceCount = 0;
402
+ let scopeCount = 0;
403
+
404
+ for (const inv of allInvocations) {
405
+ const queryText = inv.inline_query || inv.prompt_text || "";
406
+ if (inv.matched_prompt_id != null) promptLinked++;
407
+ if (inv.inline_query != null && inv.inline_query !== "") inlineQueryCount++;
408
+ if (inv.prompt_kind === "user") userPromptCount++;
409
+ if (inv.prompt_kind === "meta") metaPromptCount++;
410
+ if (isInternalSelftunePrompt(queryText, inv.prompt_kind)) internalPromptCount++;
411
+ if (inv.matched_prompt_id == null && (inv.inline_query == null || inv.inline_query === ""))
412
+ noPromptCount++;
413
+ if (isPollutingPrompt(queryText, inv.prompt_kind)) systemLikeCount++;
414
+ if (inv.invocation_mode != null && inv.invocation_mode !== "") invModeCount++;
415
+ if (inv.confidence != null) confCount++;
416
+ if (inv.source != null && inv.source !== "") sourceCount++;
417
+ if (inv.skill_scope != null && inv.skill_scope !== "") scopeCount++;
418
+ }
419
+
420
+ const evidence_quality = {
421
+ prompt_link_rate: safeDiv(promptLinked, totalInv),
422
+ inline_query_rate: safeDiv(inlineQueryCount, totalInv),
423
+ user_prompt_rate: safeDiv(userPromptCount, totalInv),
424
+ meta_prompt_rate: safeDiv(metaPromptCount, totalInv),
425
+ internal_prompt_rate: safeDiv(internalPromptCount, totalInv),
426
+ no_prompt_rate: safeDiv(noPromptCount, totalInv),
427
+ system_like_rate: safeDiv(systemLikeCount, totalInv),
428
+ invocation_mode_coverage: safeDiv(invModeCount, totalInv),
429
+ confidence_coverage: safeDiv(confCount, totalInv),
430
+ source_coverage: safeDiv(sourceCount, totalInv),
431
+ scope_coverage: safeDiv(scopeCount, totalInv),
432
+ };
433
+
434
+ // Routing quality
435
+ const missedTriggers = allInvocations.filter((r) => r.triggered === 0).length;
436
+ const withConfidence = allInvocations.filter((r) => r.confidence != null);
437
+ const avgConfidence =
438
+ withConfidence.length > 0
439
+ ? withConfidence.reduce((s, r) => s + (r.confidence ?? 0), 0) / withConfidence.length
440
+ : null;
441
+ const lowConfCount = withConfidence.filter((r) => (r.confidence ?? 0) < 0.5).length;
442
+
443
+ const routing_quality = {
444
+ missed_triggers: missedTriggers,
445
+ miss_rate: safeDiv(missedTriggers, totalInv),
446
+ avg_confidence: avgConfidence,
447
+ confidence_coverage: safeDiv(confCount, totalInv),
448
+ low_confidence_rate:
449
+ withConfidence.length > 0 ? safeDiv(lowConfCount, withConfidence.length) : null,
450
+ };
451
+
452
+ // Evolution state
453
+ const evidenceCountRow = db
454
+ .query(`SELECT COUNT(*) AS cnt FROM evolution_evidence WHERE skill_name = ?`)
455
+ .get(skillName) as { cnt: number } | null;
456
+ const evolutionCountRow = db
457
+ .query(
458
+ `SELECT COUNT(*) AS cnt FROM evolution_audit
459
+ WHERE skill_name = ? OR (skill_name IS NULL AND proposal_id LIKE 'evo-' || ? || '-%')`,
460
+ )
461
+ .get(skillName, skillName) as { cnt: number } | null;
462
+ const latestAuditRow = db
463
+ .query(
464
+ `SELECT action, timestamp FROM evolution_audit
465
+ WHERE (skill_name = ? OR (skill_name IS NULL AND proposal_id LIKE 'evo-' || ? || '-%'))
466
+ AND action IN ('deployed', 'rolled_back', 'validated', 'proposed', 'approved')
467
+ ORDER BY timestamp DESC LIMIT 1`,
468
+ )
469
+ .get(skillName, skillName) as { action: string; timestamp: string } | null;
470
+
471
+ const evolution_state = {
472
+ has_evidence: (evidenceCountRow?.cnt ?? 0) > 0,
473
+ has_pending_proposals: pending_proposals.length > 0,
474
+ latest_action: latestAuditRow?.action ?? null,
475
+ latest_timestamp: latestAuditRow?.timestamp ?? null,
476
+ evidence_rows: evidenceCountRow?.cnt ?? 0,
477
+ evolution_rows: evolutionCountRow?.cnt ?? 0,
478
+ };
479
+
480
+ // Data hygiene
481
+ const namingVariants = db
482
+ .query(`SELECT DISTINCT skill_name FROM skill_invocations WHERE lower(skill_name) = lower(?)`)
483
+ .all(skillName) as Array<{ skill_name: string }>;
484
+
485
+ const sourceBreakdown = db
486
+ .query(
487
+ `SELECT COALESCE(source, '(null)') AS source, COUNT(*) AS count
488
+ FROM skill_invocations WHERE skill_name = ?
489
+ GROUP BY source ORDER BY count DESC`,
490
+ )
491
+ .all(skillName) as Array<{ source: string; count: number }>;
492
+
493
+ const promptKindBreakdown = db
494
+ .query(
495
+ `SELECT COALESCE(p.prompt_kind, '(null)') AS kind, COUNT(*) AS count
496
+ FROM skill_invocations si
497
+ LEFT JOIN prompts p ON si.matched_prompt_id = p.prompt_id
498
+ WHERE si.skill_name = ?
499
+ GROUP BY p.prompt_kind ORDER BY count DESC`,
500
+ )
501
+ .all(skillName) as Array<{ kind: string; count: number }>;
502
+
503
+ const observationBreakdownMap = new Map<
504
+ "canonical" | "repaired_trigger" | "repaired_contextual_miss" | "legacy_materialized",
505
+ number
506
+ >();
507
+ const enrichedInvocations = allInvocations.map((inv) => {
508
+ const queryText = inv.inline_query || inv.prompt_text || "";
509
+ const isPolluting = isPollutingPrompt(queryText, inv.prompt_kind);
510
+ const observation_kind = classifyObservationKind(
511
+ inv.skill_invocation_id,
512
+ inv.capture_mode,
513
+ inv.triggered,
514
+ inv.raw_source_ref,
515
+ );
516
+ return {
517
+ ...inv,
518
+ queryText,
519
+ isPolluting,
520
+ observation_kind,
521
+ };
522
+ });
523
+
524
+ for (const inv of enrichedInvocations) {
525
+ observationBreakdownMap.set(
526
+ inv.observation_kind,
527
+ (observationBreakdownMap.get(inv.observation_kind) ?? 0) + 1,
528
+ );
529
+ }
530
+
531
+ const trustInvocationsRaw = enrichedInvocations.filter(
532
+ (inv) => inv.observation_kind !== "legacy_materialized",
533
+ );
534
+
535
+ const normalizeQueryForGrouping = (query: string) =>
536
+ query.replace(/\s+/g, " ").trim().toLowerCase();
537
+
538
+ const dedupeTrustInvocations = <T extends (typeof trustInvocationsRaw)[number]>(rows: T[]) => {
539
+ const grouped = new Map<string, T[]>();
540
+ for (const row of rows) {
541
+ const normalizedQuery = normalizeQueryForGrouping(row.queryText);
542
+ const key =
543
+ normalizedQuery.length > 0
544
+ ? `${row.session_id}::${normalizedQuery}`
545
+ : `${row.skill_invocation_id}`;
546
+ const arr = grouped.get(key);
547
+ if (arr) arr.push(row);
548
+ else grouped.set(key, [row]);
549
+ }
550
+
551
+ return [...grouped.values()]
552
+ .map((group) => {
553
+ const sorted = [...group].sort((a, b) => {
554
+ const aScore =
555
+ (a.triggered === 1 ? 100 : 0) +
556
+ (a.observation_kind === "canonical" ? 20 : 0) +
557
+ (a.observation_kind === "repaired_trigger" ? 15 : 0) +
558
+ (a.confidence != null ? 5 : 0);
559
+ const bScore =
560
+ (b.triggered === 1 ? 100 : 0) +
561
+ (b.observation_kind === "canonical" ? 20 : 0) +
562
+ (b.observation_kind === "repaired_trigger" ? 15 : 0) +
563
+ (b.confidence != null ? 5 : 0);
564
+ if (aScore !== bScore) return bScore - aScore;
565
+ return (b.timestamp ?? "").localeCompare(a.timestamp ?? "");
566
+ });
567
+ const primary = sorted[0]!;
568
+ return {
569
+ ...primary,
570
+ historical_context:
571
+ primary.triggered === 1 && group.some((row) => row.triggered === 0)
572
+ ? ("previously_missed" as const)
573
+ : null,
574
+ };
575
+ })
576
+ .sort((a, b) => (b.timestamp ?? "").localeCompare(a.timestamp ?? ""));
577
+ };
578
+
579
+ const trustInvocations = dedupeTrustInvocations(trustInvocationsRaw);
580
+
581
+ const trustTotalInv = trustInvocations.length;
582
+ const trustDistinctSessions = new Set(trustInvocations.map((r) => r.session_id));
583
+ const trustDistinctWorkspaces = new Set(
584
+ trustInvocations.map((r) => r.workspace_path).filter(Boolean),
585
+ );
586
+ const trustTimestamps = trustInvocations
587
+ .map((r) => r.timestamp)
588
+ .filter((t): t is string => t != null);
589
+
590
+ const legacyRows = enrichedInvocations.filter(
591
+ (inv) => inv.observation_kind === "legacy_materialized",
592
+ ).length;
593
+ const repairedRows = enrichedInvocations.filter(
594
+ (inv) =>
595
+ inv.observation_kind === "repaired_trigger" ||
596
+ inv.observation_kind === "repaired_contextual_miss",
597
+ ).length;
598
+
599
+ const data_hygiene = {
600
+ naming_variants: namingVariants.map((r) => r.skill_name),
601
+ source_breakdown: sourceBreakdown,
602
+ prompt_kind_breakdown: promptKindBreakdown,
603
+ observation_breakdown: [...observationBreakdownMap.entries()].map(([kind, count]) => ({
604
+ kind,
605
+ count,
606
+ })),
607
+ raw_checks: totalInv,
608
+ operational_checks: trustTotalInv,
609
+ internal_prompt_rows: internalPromptCount,
610
+ internal_prompt_rate: safeDiv(internalPromptCount, totalInv),
611
+ legacy_rows: legacyRows,
612
+ legacy_rate: safeDiv(legacyRows, totalInv),
613
+ repaired_rows: repairedRows,
614
+ repaired_rate: safeDiv(repairedRows, totalInv),
615
+ };
616
+
617
+ // Recompute trust-facing metrics from operational non-legacy observations.
618
+ const trustPromptLinked = trustInvocations.filter((inv) => inv.matched_prompt_id != null).length;
619
+ const trustInlineQueryCount = trustInvocations.filter(
620
+ (inv) => inv.inline_query != null && inv.inline_query !== "",
621
+ ).length;
622
+ const trustUserPromptCount = trustInvocations.filter((inv) => inv.prompt_kind === "user").length;
623
+ const trustMetaPromptCount = trustInvocations.filter((inv) => inv.prompt_kind === "meta").length;
624
+ const trustNoPromptCount = trustInvocations.filter(
625
+ (inv) => inv.matched_prompt_id == null && (inv.inline_query == null || inv.inline_query === ""),
626
+ ).length;
627
+ const trustSystemLikeCount = trustInvocations.filter((inv) => inv.isPolluting).length;
628
+ const trustInvModeCount = trustInvocations.filter(
629
+ (inv) => inv.invocation_mode != null && inv.invocation_mode !== "",
630
+ ).length;
631
+ const trustConfCount = trustInvocations.filter((inv) => inv.confidence != null).length;
632
+ const trustSourceCount = trustInvocations.filter(
633
+ (inv) => inv.source != null && inv.source !== "",
634
+ ).length;
635
+ const trustScopeCount = trustInvocations.filter(
636
+ (inv) => inv.skill_scope != null && inv.skill_scope !== "",
637
+ ).length;
638
+
639
+ coverage.checks = trustTotalInv;
640
+ coverage.sessions = trustDistinctSessions.size;
641
+ coverage.workspaces = trustDistinctWorkspaces.size;
642
+ coverage.first_seen =
643
+ trustTimestamps.length > 0 ? trustTimestamps[trustTimestamps.length - 1] : null;
644
+ coverage.last_seen = trustTimestamps.length > 0 ? trustTimestamps[0] : null;
645
+
646
+ evidence_quality.prompt_link_rate = safeDiv(trustPromptLinked, trustTotalInv);
647
+ evidence_quality.inline_query_rate = safeDiv(trustInlineQueryCount, trustTotalInv);
648
+ evidence_quality.user_prompt_rate = safeDiv(trustUserPromptCount, trustTotalInv);
649
+ evidence_quality.meta_prompt_rate = safeDiv(trustMetaPromptCount, trustTotalInv);
650
+ evidence_quality.no_prompt_rate = safeDiv(trustNoPromptCount, trustTotalInv);
651
+ evidence_quality.system_like_rate = safeDiv(trustSystemLikeCount, trustTotalInv);
652
+ evidence_quality.invocation_mode_coverage = safeDiv(trustInvModeCount, trustTotalInv);
653
+ evidence_quality.confidence_coverage = safeDiv(trustConfCount, trustTotalInv);
654
+ evidence_quality.source_coverage = safeDiv(trustSourceCount, trustTotalInv);
655
+ evidence_quality.scope_coverage = safeDiv(trustScopeCount, trustTotalInv);
656
+
657
+ const trustMissedTriggers = trustInvocations.filter((r) => r.triggered === 0).length;
658
+ const trustWithConfidence = trustInvocations.filter((r) => r.confidence != null);
659
+ const trustAvgConfidence =
660
+ trustWithConfidence.length > 0
661
+ ? trustWithConfidence.reduce((s, r) => s + (r.confidence ?? 0), 0) /
662
+ trustWithConfidence.length
663
+ : null;
664
+ const trustLowConfCount = trustWithConfidence.filter((r) => (r.confidence ?? 0) < 0.5).length;
665
+
666
+ routing_quality.missed_triggers = trustMissedTriggers;
667
+ routing_quality.miss_rate = safeDiv(trustMissedTriggers, trustTotalInv);
668
+ routing_quality.avg_confidence = trustAvgConfidence;
669
+ routing_quality.confidence_coverage = safeDiv(trustConfCount, trustTotalInv);
670
+ routing_quality.low_confidence_rate =
671
+ trustWithConfidence.length > 0 ? safeDiv(trustLowConfCount, trustWithConfidence.length) : null;
672
+
673
+ // Examples (limit 10 per category)
674
+ type ExampleRowInternal = {
675
+ timestamp: string | null;
676
+ session_id: string;
677
+ query_text: string;
678
+ triggered: boolean;
679
+ confidence: number | null;
680
+ invocation_mode: string | null;
681
+ prompt_kind: string | null;
682
+ source: string | null;
683
+ platform: string | null;
684
+ workspace_path: string | null;
685
+ query_origin: "inline_query" | "matched_prompt" | "missing";
686
+ is_system_like: boolean;
687
+ observation_kind:
688
+ | "canonical"
689
+ | "repaired_trigger"
690
+ | "repaired_contextual_miss"
691
+ | "legacy_materialized";
692
+ historical_context: "previously_missed" | null;
693
+ };
694
+
695
+ const goodExamples: ExampleRowInternal[] = [];
696
+ const missedExamples: ExampleRowInternal[] = [];
697
+ const noisyExamples: ExampleRowInternal[] = [];
698
+
699
+ for (const inv of dedupeTrustInvocations(trustInvocationsRaw)) {
700
+ const queryText = inv.queryText;
701
+ const sysLike = inv.isPolluting;
702
+ const queryOrigin: "inline_query" | "matched_prompt" | "missing" =
703
+ inv.inline_query != null && inv.inline_query !== ""
704
+ ? "inline_query"
705
+ : inv.matched_prompt_id != null
706
+ ? "matched_prompt"
707
+ : "missing";
708
+ const row: ExampleRowInternal = {
709
+ timestamp: inv.timestamp,
710
+ session_id: inv.session_id,
711
+ query_text: queryText,
712
+ triggered: inv.triggered === 1,
713
+ confidence: inv.confidence,
714
+ invocation_mode: inv.invocation_mode,
715
+ prompt_kind: inv.prompt_kind,
716
+ source: inv.source,
717
+ platform: inv.platform,
718
+ workspace_path: inv.workspace_path,
719
+ query_origin: queryOrigin,
720
+ is_system_like: sysLike,
721
+ observation_kind: inv.observation_kind,
722
+ historical_context: inv.historical_context,
723
+ };
724
+
725
+ if (inv.triggered === 0 && missedExamples.length < 10) {
726
+ missedExamples.push(row);
727
+ } else if (
728
+ inv.triggered === 1 &&
729
+ queryText !== "" &&
730
+ (queryOrigin === "inline_query" || inv.prompt_kind === "user" || inv.prompt_kind == null) &&
731
+ goodExamples.length < 10
732
+ ) {
733
+ goodExamples.push(row);
734
+ }
735
+ }
736
+
737
+ for (const inv of enrichedInvocations) {
738
+ if (!inv.isPolluting || noisyExamples.length >= 10) continue;
739
+ const queryOrigin: "inline_query" | "matched_prompt" | "missing" =
740
+ inv.inline_query != null && inv.inline_query !== ""
741
+ ? "inline_query"
742
+ : inv.matched_prompt_id != null
743
+ ? "matched_prompt"
744
+ : "missing";
745
+ noisyExamples.push({
746
+ timestamp: inv.timestamp,
747
+ session_id: inv.session_id,
748
+ query_text: inv.queryText,
749
+ triggered: inv.triggered === 1,
750
+ confidence: inv.confidence,
751
+ invocation_mode: inv.invocation_mode,
752
+ prompt_kind: inv.prompt_kind,
753
+ source: inv.source,
754
+ platform: inv.platform,
755
+ workspace_path: inv.workspace_path,
756
+ query_origin: queryOrigin,
757
+ is_system_like: true,
758
+ observation_kind: inv.observation_kind,
759
+ historical_context: null,
760
+ });
761
+ }
762
+
763
+ const examples = {
764
+ good: goodExamples,
765
+ missed: missedExamples,
766
+ noisy: noisyExamples,
767
+ };
768
+
769
+ // Trust state determination
770
+ type TrustStateType =
771
+ | "low_sample"
772
+ | "observed"
773
+ | "watch"
774
+ | "validated"
775
+ | "deployed"
776
+ | "rolled_back";
777
+ let trustState: TrustStateType;
778
+ let trustSummary: string;
779
+
780
+ if (coverage.checks < 5) {
781
+ trustState = "low_sample";
782
+ trustSummary = `Too few operational observations to assess trust — only ${coverage.checks} checks recorded.`;
783
+ } else if (latestAuditRow?.action === "rolled_back") {
784
+ trustState = "rolled_back";
785
+ trustSummary = "Recent evolution was rolled back — review evidence before re-deploying.";
786
+ } else if (latestAuditRow?.action === "deployed") {
787
+ trustState = "deployed";
788
+ trustSummary = `Deployed evolution; ${evolution_state.evidence_rows} evidence rows support current state.`;
789
+ } else if (latestAuditRow?.action === "validated" || latestAuditRow?.action === "approved") {
790
+ trustState = "validated";
791
+ trustSummary = "Validated with evidence but not yet deployed.";
792
+ } else if (
793
+ routing_quality.missed_triggers > 0 ||
794
+ evidence_quality.system_like_rate > 0.1 ||
795
+ evidence_quality.prompt_link_rate < 0.3
796
+ ) {
797
+ trustState = "watch";
798
+ const reasons: string[] = [];
799
+ if (routing_quality.missed_triggers > 0)
800
+ reasons.push(`${routing_quality.missed_triggers} missed triggers`);
801
+ if (evidence_quality.system_like_rate > 0.1)
802
+ reasons.push(`${(evidence_quality.system_like_rate * 100).toFixed(0)}% system-like queries`);
803
+ if (evidence_quality.prompt_link_rate < 0.3)
804
+ reasons.push(
805
+ `low prompt link rate (${(evidence_quality.prompt_link_rate * 100).toFixed(0)}%)`,
806
+ );
807
+ trustSummary = `Needs attention — ${reasons.join(", ")}.`;
808
+ } else {
809
+ trustState = "observed";
810
+ const qualityDesc =
811
+ evidence_quality.prompt_link_rate > 0.7
812
+ ? "strong"
813
+ : evidence_quality.prompt_link_rate > 0.4
814
+ ? "moderate"
815
+ : "sparse";
816
+ trustSummary = `Observed in ${coverage.sessions} sessions across ${coverage.workspaces} workspaces; evidence is ${qualityDesc}.`;
817
+ }
818
+
819
+ const trust = { state: trustState, summary: trustSummary };
820
+
222
821
  return Response.json({
223
822
  ...report,
224
823
  evolution: evolutionWithSnapshot,
@@ -227,16 +826,40 @@ export function handleSkillReport(db: Database, skillName: string): Response {
227
826
  total_input_tokens: executionRow?.total_input_tokens ?? 0,
228
827
  total_output_tokens: executionRow?.total_output_tokens ?? 0,
229
828
  },
230
- canonical_invocations: invocationsWithConfidence.map((i) => ({
231
- ...i,
829
+ canonical_invocations: trustInvocations.slice(0, invLimit).map((i) => ({
830
+ timestamp: i.timestamp,
831
+ session_id: i.session_id,
832
+ skill_name: i.skill_name,
833
+ invocation_mode: i.invocation_mode,
232
834
  triggered: i.triggered === 1,
835
+ confidence: i.confidence,
836
+ tool_name: i.tool_name,
837
+ agent_type: i.agent_type,
838
+ query: i.queryText,
839
+ source: i.source,
840
+ skill_path: i.skill_path,
841
+ skill_scope: i.skill_scope,
842
+ observation_kind: i.observation_kind,
843
+ historical_context: i.historical_context,
233
844
  })),
845
+ invocations_pagination:
846
+ trustInvocations.length > invLimit
847
+ ? {
848
+ next_cursor: {
849
+ timestamp: trustInvocations[invLimit - 1]!.timestamp!,
850
+ id: trustInvocations[invLimit - 1]!.skill_invocation_id,
851
+ },
852
+ has_more: true,
853
+ }
854
+ : undefined,
234
855
  duration_stats: {
235
856
  avg_duration_ms: executionRow?.avg_duration_ms ?? 0,
236
857
  total_duration_ms: executionRow?.total_duration_ms ?? 0,
237
858
  execution_count: executionRow?.execution_count ?? 0,
238
859
  missed_triggers: missedRow?.missed_triggers ?? 0,
239
860
  },
861
+ execution_metrics: executionMetrics,
862
+ commit_summary: commitSummary.total_commits > 0 ? commitSummary : null,
240
863
  selftune_stats: selftuneStats,
241
864
  prompt_samples: promptSamples.map((p) => ({
242
865
  ...p,
@@ -244,5 +867,12 @@ export function handleSkillReport(db: Database, skillName: string): Response {
244
867
  })),
245
868
  session_metadata: sessionMeta,
246
869
  description_quality: descriptionQuality,
870
+ trust,
871
+ coverage,
872
+ evidence_quality,
873
+ routing_quality,
874
+ evolution_state,
875
+ data_hygiene,
876
+ examples,
247
877
  });
248
878
  }