selftune 0.2.18 → 0.2.20

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. package/README.md +9 -4
  2. package/apps/local-dashboard/dist/assets/index-D8O-RG1I.js +60 -0
  3. package/apps/local-dashboard/dist/assets/index-_EcLywDg.css +1 -0
  4. package/apps/local-dashboard/dist/assets/vendor-table-BIiI3YhS.js +1 -0
  5. package/apps/local-dashboard/dist/assets/vendor-ui-CGEmUayx.js +12 -0
  6. package/apps/local-dashboard/dist/index.html +5 -5
  7. package/cli/selftune/alpha-upload/stage-canonical.ts +7 -6
  8. package/cli/selftune/constants.ts +10 -0
  9. package/cli/selftune/contribute/contribute.ts +30 -2
  10. package/cli/selftune/contribution-config.ts +249 -0
  11. package/cli/selftune/contribution-relay.ts +177 -0
  12. package/cli/selftune/contribution-signals.ts +219 -0
  13. package/cli/selftune/contribution-staging.ts +147 -0
  14. package/cli/selftune/contributions.ts +532 -0
  15. package/cli/selftune/creator-contributions.ts +333 -0
  16. package/cli/selftune/dashboard-contract.ts +209 -1
  17. package/cli/selftune/dashboard-server.ts +45 -11
  18. package/cli/selftune/eval/family-overlap.ts +714 -0
  19. package/cli/selftune/eval/hooks-to-evals.ts +182 -28
  20. package/cli/selftune/eval/synthetic-evals.ts +298 -11
  21. package/cli/selftune/evolution/evidence.ts +5 -0
  22. package/cli/selftune/evolution/evolve-body.ts +62 -2
  23. package/cli/selftune/evolution/evolve.ts +58 -1
  24. package/cli/selftune/evolution/validate-body.ts +10 -0
  25. package/cli/selftune/evolution/validate-host-replay.ts +236 -0
  26. package/cli/selftune/evolution/validate-proposal.ts +10 -0
  27. package/cli/selftune/evolution/validate-routing.ts +112 -5
  28. package/cli/selftune/export.ts +2 -2
  29. package/cli/selftune/index.ts +41 -5
  30. package/cli/selftune/ingestors/codex-rollout.ts +31 -35
  31. package/cli/selftune/ingestors/codex-wrapper.ts +32 -24
  32. package/cli/selftune/localdb/db.ts +2 -2
  33. package/cli/selftune/localdb/direct-write.ts +8 -3
  34. package/cli/selftune/localdb/materialize.ts +7 -2
  35. package/cli/selftune/localdb/queries.ts +712 -31
  36. package/cli/selftune/localdb/schema.ts +30 -1
  37. package/cli/selftune/recover.ts +153 -0
  38. package/cli/selftune/repair/skill-usage.ts +363 -4
  39. package/cli/selftune/routes/actions.ts +35 -1
  40. package/cli/selftune/routes/analytics.ts +14 -0
  41. package/cli/selftune/routes/index.ts +1 -0
  42. package/cli/selftune/routes/overview.ts +112 -4
  43. package/cli/selftune/routes/skill-report.ts +575 -11
  44. package/cli/selftune/status.ts +81 -2
  45. package/cli/selftune/sync.ts +56 -2
  46. package/cli/selftune/trust-model.ts +66 -0
  47. package/cli/selftune/types.ts +103 -0
  48. package/cli/selftune/utils/skill-detection.ts +43 -0
  49. package/cli/selftune/utils/text-similarity.ts +73 -0
  50. package/cli/selftune/watchlist.ts +65 -0
  51. package/package.json +1 -1
  52. package/packages/ui/src/components/ActivityTimeline.tsx +165 -150
  53. package/packages/ui/src/components/EvidenceViewer.tsx +419 -145
  54. package/packages/ui/src/components/EvolutionTimeline.tsx +81 -29
  55. package/packages/ui/src/components/OrchestrateRunsPanel.tsx +33 -16
  56. package/packages/ui/src/components/RecentActivityFeed.tsx +72 -41
  57. package/packages/ui/src/components/section-cards.tsx +12 -9
  58. package/packages/ui/src/primitives/card.tsx +1 -1
  59. package/packages/ui/src/types.ts +4 -0
  60. package/skill/SKILL.md +11 -1
  61. package/skill/Workflows/AlphaUpload.md +4 -0
  62. package/skill/Workflows/Composability.md +78 -0
  63. package/skill/Workflows/Contribute.md +6 -3
  64. package/skill/Workflows/Contributions.md +97 -0
  65. package/skill/Workflows/CreatorContributions.md +74 -0
  66. package/skill/Workflows/Dashboard.md +31 -0
  67. package/skill/Workflows/Evals.md +57 -8
  68. package/skill/Workflows/Evolve.md +23 -0
  69. package/skill/Workflows/Ingest.md +7 -0
  70. package/skill/Workflows/Initialize.md +20 -1
  71. package/skill/Workflows/Recover.md +84 -0
  72. package/skill/Workflows/RepairSkillUsage.md +12 -4
  73. package/skill/Workflows/Sync.md +18 -12
  74. package/apps/local-dashboard/dist/assets/index-BMIS6uUh.css +0 -2
  75. package/apps/local-dashboard/dist/assets/index-DOu3iLD9.js +0 -16
  76. package/apps/local-dashboard/dist/assets/vendor-table-pHbDxq36.js +0 -8
  77. package/apps/local-dashboard/dist/assets/vendor-ui-DIwlrGlb.js +0 -12
@@ -28,7 +28,8 @@ export function handleSkillReport(
28
28
  // 1. Evolution audit with eval_snapshot
29
29
  const evolution = db
30
30
  .query(
31
- `SELECT timestamp, proposal_id, skill_name, action, details, eval_snapshot_json
31
+ `SELECT timestamp, proposal_id, skill_name, action, details, eval_snapshot_json,
32
+ validation_mode, validation_agent, validation_fixture_id, validation_evidence_ref
32
33
  FROM evolution_audit
33
34
  WHERE skill_name = ? OR (skill_name IS NULL AND proposal_id LIKE 'evo-' || ? || '-%')
34
35
  ORDER BY timestamp DESC
@@ -41,6 +42,10 @@ export function handleSkillReport(
41
42
  action: string;
42
43
  details: string;
43
44
  eval_snapshot_json: string | null;
45
+ validation_mode: string | null;
46
+ validation_agent: string | null;
47
+ validation_fixture_id: string | null;
48
+ validation_evidence_ref: string | null;
44
49
  }>;
45
50
  const evolutionWithSnapshot = evolution.map((e) => ({
46
51
  ...e,
@@ -120,6 +125,8 @@ export function handleSkillReport(
120
125
  query: string | null;
121
126
  source: string | null;
122
127
  skill_invocation_id: string;
128
+ capture_mode: string | null;
129
+ raw_source_ref: string | null;
123
130
  }>;
124
131
 
125
132
  if (invCursor) {
@@ -128,7 +135,7 @@ export function handleSkillReport(
128
135
  `SELECT si.occurred_at as timestamp, si.session_id, si.skill_name,
129
136
  si.invocation_mode, si.triggered, si.confidence, si.tool_name,
130
137
  si.agent_type, COALESCE(si.query, p.prompt_text) as query, si.source,
131
- si.skill_invocation_id
138
+ si.skill_invocation_id, si.capture_mode, si.raw_source_ref
132
139
  FROM skill_invocations si
133
140
  LEFT JOIN prompts p ON si.matched_prompt_id = p.prompt_id
134
141
  WHERE si.skill_name = ?
@@ -149,7 +156,7 @@ export function handleSkillReport(
149
156
  `SELECT si.occurred_at as timestamp, si.session_id, si.skill_name,
150
157
  si.invocation_mode, si.triggered, si.confidence, si.tool_name,
151
158
  si.agent_type, COALESCE(si.query, p.prompt_text) as query, si.source,
152
- si.skill_invocation_id
159
+ si.skill_invocation_id, si.capture_mode, si.raw_source_ref
153
160
  FROM skill_invocations si
154
161
  LEFT JOIN prompts p ON si.matched_prompt_id = p.prompt_id
155
162
  WHERE si.skill_name = ?
@@ -163,11 +170,6 @@ export function handleSkillReport(
163
170
  const invPageRows = invHasMore
164
171
  ? invocationsWithConfidence.slice(0, invLimit)
165
172
  : invocationsWithConfidence;
166
- const invLastRow = invPageRows[invPageRows.length - 1];
167
- const invNextCursor =
168
- invHasMore && invLastRow
169
- ? { timestamp: invLastRow.timestamp, id: invLastRow.skill_invocation_id }
170
- : null;
171
173
 
172
174
  // Not-found check — after all enrichment queries so evidence-only skills aren't 404'd
173
175
  const hasData =
@@ -286,6 +288,541 @@ export function handleSkillReport(
286
288
  ? scoreDescription(currentDescriptionText, skillName)
287
289
  : null;
288
290
 
291
+ // ── Trust field computation ──────────────────────────────────────────────
292
+
293
+ const SYSTEM_LIKE_PREFIXES = ["<system_instruction>", "<system-instruction>", "<command-name>"];
294
+ const INTERNAL_EVAL_MARKERS = [
295
+ "you are an evaluation assistant",
296
+ "you are a skill description optimizer",
297
+ "would each query trigger this skill",
298
+ "propose an improved description",
299
+ "failure patterns:",
300
+ "output only valid json",
301
+ ];
302
+ const isSystemLike = (text: string | null | undefined): boolean => {
303
+ if (!text) return false;
304
+ const trimmed = text.trimStart();
305
+ return SYSTEM_LIKE_PREFIXES.some((p) => trimmed.startsWith(p));
306
+ };
307
+ const isInternalSelftunePrompt = (
308
+ text: string | null | undefined,
309
+ promptKind: string | null | undefined,
310
+ ): boolean => {
311
+ if (!text) return false;
312
+ const lowered = text.toLowerCase();
313
+ return (
314
+ promptKind === "meta" && INTERNAL_EVAL_MARKERS.some((marker) => lowered.includes(marker))
315
+ );
316
+ };
317
+ const isPollutingPrompt = (
318
+ text: string | null | undefined,
319
+ promptKind: string | null | undefined,
320
+ ): boolean => isSystemLike(text) || isInternalSelftunePrompt(text, promptKind);
321
+ const classifyObservationKind = (
322
+ skillInvocationId: string,
323
+ captureMode: string | null,
324
+ triggered: number,
325
+ rawSourceRefJson: string | null,
326
+ ): "canonical" | "repaired_trigger" | "repaired_contextual_miss" | "legacy_materialized" => {
327
+ if (skillInvocationId.includes(":su:")) return "legacy_materialized";
328
+ if (captureMode === "repair") {
329
+ const rawSourceRef = safeParseJson(rawSourceRefJson) as {
330
+ metadata?: { miss_type?: string };
331
+ } | null;
332
+ if (triggered === 0 && rawSourceRef?.metadata?.miss_type === "contextual_read") {
333
+ return "repaired_contextual_miss";
334
+ }
335
+ return "repaired_trigger";
336
+ }
337
+ return "canonical";
338
+ };
339
+
340
+ // Fetch all invocations for this skill with joined prompt + session data
341
+ const allInvocations = db
342
+ .query(
343
+ `SELECT si.occurred_at AS timestamp, si.session_id, si.skill_name,
344
+ si.invocation_mode, si.triggered, si.confidence, si.tool_name,
345
+ si.agent_type, si.query AS inline_query, si.source,
346
+ si.matched_prompt_id, si.skill_scope, si.skill_path,
347
+ si.skill_invocation_id, si.capture_mode, si.raw_source_ref,
348
+ p.prompt_text, p.prompt_kind,
349
+ s.platform, s.workspace_path
350
+ FROM skill_invocations si
351
+ LEFT JOIN prompts p ON si.matched_prompt_id = p.prompt_id
352
+ LEFT JOIN sessions s ON si.session_id = s.session_id
353
+ WHERE si.skill_name = ?
354
+ ORDER BY si.occurred_at DESC`,
355
+ )
356
+ .all(skillName) as Array<{
357
+ timestamp: string | null;
358
+ session_id: string;
359
+ skill_name: string;
360
+ invocation_mode: string | null;
361
+ triggered: number;
362
+ confidence: number | null;
363
+ tool_name: string | null;
364
+ agent_type: string | null;
365
+ inline_query: string | null;
366
+ source: string | null;
367
+ matched_prompt_id: string | null;
368
+ skill_scope: string | null;
369
+ skill_path: string | null;
370
+ skill_invocation_id: string;
371
+ capture_mode: string | null;
372
+ raw_source_ref: string | null;
373
+ prompt_text: string | null;
374
+ prompt_kind: string | null;
375
+ platform: string | null;
376
+ workspace_path: string | null;
377
+ }>;
378
+
379
+ const totalInv = allInvocations.length;
380
+ const safeDiv = (num: number, den: number): number => (den > 0 ? num / den : 0);
381
+
382
+ // Coverage
383
+ const distinctSessions = new Set(allInvocations.map((r) => r.session_id));
384
+ const distinctWorkspaces = new Set(allInvocations.map((r) => r.workspace_path).filter(Boolean));
385
+ const allTimestamps = allInvocations
386
+ .map((r) => r.timestamp)
387
+ .filter((t): t is string => t != null);
388
+ const coverage = {
389
+ checks: report.usage.total_checks,
390
+ sessions: distinctSessions.size,
391
+ workspaces: distinctWorkspaces.size,
392
+ first_seen: allTimestamps.length > 0 ? allTimestamps[allTimestamps.length - 1] : null,
393
+ last_seen: allTimestamps.length > 0 ? allTimestamps[0] : null,
394
+ };
395
+
396
+ // Evidence quality
397
+ let promptLinked = 0;
398
+ let inlineQueryCount = 0;
399
+ let userPromptCount = 0;
400
+ let metaPromptCount = 0;
401
+ let internalPromptCount = 0;
402
+ let noPromptCount = 0;
403
+ let systemLikeCount = 0;
404
+ let invModeCount = 0;
405
+ let confCount = 0;
406
+ let sourceCount = 0;
407
+ let scopeCount = 0;
408
+
409
+ for (const inv of allInvocations) {
410
+ const queryText = inv.inline_query || inv.prompt_text || "";
411
+ if (inv.matched_prompt_id != null) promptLinked++;
412
+ if (inv.inline_query != null && inv.inline_query !== "") inlineQueryCount++;
413
+ if (inv.prompt_kind === "user") userPromptCount++;
414
+ if (inv.prompt_kind === "meta") metaPromptCount++;
415
+ if (isInternalSelftunePrompt(queryText, inv.prompt_kind)) internalPromptCount++;
416
+ if (inv.matched_prompt_id == null && (inv.inline_query == null || inv.inline_query === ""))
417
+ noPromptCount++;
418
+ if (isPollutingPrompt(queryText, inv.prompt_kind)) systemLikeCount++;
419
+ if (inv.invocation_mode != null && inv.invocation_mode !== "") invModeCount++;
420
+ if (inv.confidence != null) confCount++;
421
+ if (inv.source != null && inv.source !== "") sourceCount++;
422
+ if (inv.skill_scope != null && inv.skill_scope !== "") scopeCount++;
423
+ }
424
+
425
+ const evidence_quality = {
426
+ prompt_link_rate: safeDiv(promptLinked, totalInv),
427
+ inline_query_rate: safeDiv(inlineQueryCount, totalInv),
428
+ user_prompt_rate: safeDiv(userPromptCount, totalInv),
429
+ meta_prompt_rate: safeDiv(metaPromptCount, totalInv),
430
+ internal_prompt_rate: safeDiv(internalPromptCount, totalInv),
431
+ no_prompt_rate: safeDiv(noPromptCount, totalInv),
432
+ system_like_rate: safeDiv(systemLikeCount, totalInv),
433
+ invocation_mode_coverage: safeDiv(invModeCount, totalInv),
434
+ confidence_coverage: safeDiv(confCount, totalInv),
435
+ source_coverage: safeDiv(sourceCount, totalInv),
436
+ scope_coverage: safeDiv(scopeCount, totalInv),
437
+ };
438
+
439
+ // Routing quality
440
+ const missedTriggers = allInvocations.filter((r) => r.triggered === 0).length;
441
+ const withConfidence = allInvocations.filter((r) => r.confidence != null);
442
+ const avgConfidence =
443
+ withConfidence.length > 0
444
+ ? withConfidence.reduce((s, r) => s + (r.confidence ?? 0), 0) / withConfidence.length
445
+ : null;
446
+ const lowConfCount = withConfidence.filter((r) => (r.confidence ?? 0) < 0.5).length;
447
+
448
+ const routing_quality = {
449
+ missed_triggers: missedTriggers,
450
+ miss_rate: safeDiv(missedTriggers, totalInv),
451
+ avg_confidence: avgConfidence,
452
+ confidence_coverage: safeDiv(confCount, totalInv),
453
+ low_confidence_rate:
454
+ withConfidence.length > 0 ? safeDiv(lowConfCount, withConfidence.length) : null,
455
+ };
456
+
457
+ // Evolution state
458
+ const evidenceCountRow = db
459
+ .query(`SELECT COUNT(*) AS cnt FROM evolution_evidence WHERE skill_name = ?`)
460
+ .get(skillName) as { cnt: number } | null;
461
+ const evolutionCountRow = db
462
+ .query(
463
+ `SELECT COUNT(*) AS cnt FROM evolution_audit
464
+ WHERE skill_name = ? OR (skill_name IS NULL AND proposal_id LIKE 'evo-' || ? || '-%')`,
465
+ )
466
+ .get(skillName, skillName) as { cnt: number } | null;
467
+ const latestAuditRow = db
468
+ .query(
469
+ `SELECT action, timestamp FROM evolution_audit
470
+ WHERE (skill_name = ? OR (skill_name IS NULL AND proposal_id LIKE 'evo-' || ? || '-%'))
471
+ AND action IN ('deployed', 'rolled_back', 'validated', 'proposed', 'approved')
472
+ ORDER BY timestamp DESC LIMIT 1`,
473
+ )
474
+ .get(skillName, skillName) as { action: string; timestamp: string } | null;
475
+
476
+ const evolution_state = {
477
+ has_evidence: (evidenceCountRow?.cnt ?? 0) > 0,
478
+ has_pending_proposals: pending_proposals.length > 0,
479
+ latest_action: latestAuditRow?.action ?? null,
480
+ latest_timestamp: latestAuditRow?.timestamp ?? null,
481
+ evidence_rows: evidenceCountRow?.cnt ?? 0,
482
+ evolution_rows: evolutionCountRow?.cnt ?? 0,
483
+ };
484
+
485
+ // Data hygiene
486
+ const namingVariants = db
487
+ .query(`SELECT DISTINCT skill_name FROM skill_invocations WHERE lower(skill_name) = lower(?)`)
488
+ .all(skillName) as Array<{ skill_name: string }>;
489
+
490
+ const sourceBreakdown = db
491
+ .query(
492
+ `SELECT COALESCE(source, '(null)') AS source, COUNT(*) AS count
493
+ FROM skill_invocations WHERE skill_name = ?
494
+ GROUP BY source ORDER BY count DESC`,
495
+ )
496
+ .all(skillName) as Array<{ source: string; count: number }>;
497
+
498
+ const promptKindBreakdown = db
499
+ .query(
500
+ `SELECT COALESCE(p.prompt_kind, '(null)') AS kind, COUNT(*) AS count
501
+ FROM skill_invocations si
502
+ LEFT JOIN prompts p ON si.matched_prompt_id = p.prompt_id
503
+ WHERE si.skill_name = ?
504
+ GROUP BY p.prompt_kind ORDER BY count DESC`,
505
+ )
506
+ .all(skillName) as Array<{ kind: string; count: number }>;
507
+
508
+ const observationBreakdownMap = new Map<
509
+ "canonical" | "repaired_trigger" | "repaired_contextual_miss" | "legacy_materialized",
510
+ number
511
+ >();
512
+ const enrichedInvocations = allInvocations.map((inv) => {
513
+ const queryText = inv.inline_query || inv.prompt_text || "";
514
+ const isPolluting = isPollutingPrompt(queryText, inv.prompt_kind);
515
+ const observation_kind = classifyObservationKind(
516
+ inv.skill_invocation_id,
517
+ inv.capture_mode,
518
+ inv.triggered,
519
+ inv.raw_source_ref,
520
+ );
521
+ return {
522
+ ...inv,
523
+ queryText,
524
+ isPolluting,
525
+ observation_kind,
526
+ };
527
+ });
528
+
529
+ for (const inv of enrichedInvocations) {
530
+ observationBreakdownMap.set(
531
+ inv.observation_kind,
532
+ (observationBreakdownMap.get(inv.observation_kind) ?? 0) + 1,
533
+ );
534
+ }
535
+
536
+ const trustInvocationsRaw = enrichedInvocations.filter(
537
+ (inv) => inv.observation_kind !== "legacy_materialized",
538
+ );
539
+
540
+ const normalizeQueryForGrouping = (query: string) =>
541
+ query.replace(/\s+/g, " ").trim().toLowerCase();
542
+
543
+ const dedupeTrustInvocations = <T extends (typeof trustInvocationsRaw)[number]>(rows: T[]) => {
544
+ const grouped = new Map<string, T[]>();
545
+ for (const row of rows) {
546
+ const normalizedQuery = normalizeQueryForGrouping(row.queryText);
547
+ const key =
548
+ normalizedQuery.length > 0
549
+ ? `${row.session_id}::${normalizedQuery}`
550
+ : `${row.skill_invocation_id}`;
551
+ const arr = grouped.get(key);
552
+ if (arr) arr.push(row);
553
+ else grouped.set(key, [row]);
554
+ }
555
+
556
+ return [...grouped.values()]
557
+ .map((group) => {
558
+ const sorted = [...group].sort((a, b) => {
559
+ const aScore =
560
+ (a.triggered === 1 ? 100 : 0) +
561
+ (a.observation_kind === "canonical" ? 20 : 0) +
562
+ (a.observation_kind === "repaired_trigger" ? 15 : 0) +
563
+ (a.confidence != null ? 5 : 0);
564
+ const bScore =
565
+ (b.triggered === 1 ? 100 : 0) +
566
+ (b.observation_kind === "canonical" ? 20 : 0) +
567
+ (b.observation_kind === "repaired_trigger" ? 15 : 0) +
568
+ (b.confidence != null ? 5 : 0);
569
+ if (aScore !== bScore) return bScore - aScore;
570
+ return (b.timestamp ?? "").localeCompare(a.timestamp ?? "");
571
+ });
572
+ const primary = sorted[0]!;
573
+ return {
574
+ ...primary,
575
+ historical_context:
576
+ primary.triggered === 1 && group.some((row) => row.triggered === 0)
577
+ ? ("previously_missed" as const)
578
+ : null,
579
+ };
580
+ })
581
+ .sort((a, b) => (b.timestamp ?? "").localeCompare(a.timestamp ?? ""));
582
+ };
583
+
584
+ const trustInvocations = dedupeTrustInvocations(trustInvocationsRaw);
585
+
586
+ const trustTotalInv = trustInvocations.length;
587
+ const trustDistinctSessions = new Set(trustInvocations.map((r) => r.session_id));
588
+ const trustDistinctWorkspaces = new Set(
589
+ trustInvocations.map((r) => r.workspace_path).filter(Boolean),
590
+ );
591
+ const trustTimestamps = trustInvocations
592
+ .map((r) => r.timestamp)
593
+ .filter((t): t is string => t != null);
594
+
595
+ const legacyRows = enrichedInvocations.filter(
596
+ (inv) => inv.observation_kind === "legacy_materialized",
597
+ ).length;
598
+ const repairedRows = enrichedInvocations.filter(
599
+ (inv) =>
600
+ inv.observation_kind === "repaired_trigger" ||
601
+ inv.observation_kind === "repaired_contextual_miss",
602
+ ).length;
603
+
604
+ const data_hygiene = {
605
+ naming_variants: namingVariants.map((r) => r.skill_name),
606
+ source_breakdown: sourceBreakdown,
607
+ prompt_kind_breakdown: promptKindBreakdown,
608
+ observation_breakdown: [...observationBreakdownMap.entries()].map(([kind, count]) => ({
609
+ kind,
610
+ count,
611
+ })),
612
+ raw_checks: totalInv,
613
+ operational_checks: trustTotalInv,
614
+ internal_prompt_rows: internalPromptCount,
615
+ internal_prompt_rate: safeDiv(internalPromptCount, totalInv),
616
+ legacy_rows: legacyRows,
617
+ legacy_rate: safeDiv(legacyRows, totalInv),
618
+ repaired_rows: repairedRows,
619
+ repaired_rate: safeDiv(repairedRows, totalInv),
620
+ };
621
+
622
+ // Recompute trust-facing metrics from operational non-legacy observations.
623
+ const trustPromptLinked = trustInvocations.filter((inv) => inv.matched_prompt_id != null).length;
624
+ const trustInlineQueryCount = trustInvocations.filter(
625
+ (inv) => inv.inline_query != null && inv.inline_query !== "",
626
+ ).length;
627
+ const trustUserPromptCount = trustInvocations.filter((inv) => inv.prompt_kind === "user").length;
628
+ const trustMetaPromptCount = trustInvocations.filter((inv) => inv.prompt_kind === "meta").length;
629
+ const trustNoPromptCount = trustInvocations.filter(
630
+ (inv) => inv.matched_prompt_id == null && (inv.inline_query == null || inv.inline_query === ""),
631
+ ).length;
632
+ const trustSystemLikeCount = trustInvocations.filter((inv) => inv.isPolluting).length;
633
+ const trustInvModeCount = trustInvocations.filter(
634
+ (inv) => inv.invocation_mode != null && inv.invocation_mode !== "",
635
+ ).length;
636
+ const trustConfCount = trustInvocations.filter((inv) => inv.confidence != null).length;
637
+ const trustSourceCount = trustInvocations.filter(
638
+ (inv) => inv.source != null && inv.source !== "",
639
+ ).length;
640
+ const trustScopeCount = trustInvocations.filter(
641
+ (inv) => inv.skill_scope != null && inv.skill_scope !== "",
642
+ ).length;
643
+
644
+ coverage.checks = trustTotalInv;
645
+ coverage.sessions = trustDistinctSessions.size;
646
+ coverage.workspaces = trustDistinctWorkspaces.size;
647
+ coverage.first_seen =
648
+ trustTimestamps.length > 0 ? trustTimestamps[trustTimestamps.length - 1] : null;
649
+ coverage.last_seen = trustTimestamps.length > 0 ? trustTimestamps[0] : null;
650
+
651
+ evidence_quality.prompt_link_rate = safeDiv(trustPromptLinked, trustTotalInv);
652
+ evidence_quality.inline_query_rate = safeDiv(trustInlineQueryCount, trustTotalInv);
653
+ evidence_quality.user_prompt_rate = safeDiv(trustUserPromptCount, trustTotalInv);
654
+ evidence_quality.meta_prompt_rate = safeDiv(trustMetaPromptCount, trustTotalInv);
655
+ evidence_quality.no_prompt_rate = safeDiv(trustNoPromptCount, trustTotalInv);
656
+ evidence_quality.system_like_rate = safeDiv(trustSystemLikeCount, trustTotalInv);
657
+ evidence_quality.invocation_mode_coverage = safeDiv(trustInvModeCount, trustTotalInv);
658
+ evidence_quality.confidence_coverage = safeDiv(trustConfCount, trustTotalInv);
659
+ evidence_quality.source_coverage = safeDiv(trustSourceCount, trustTotalInv);
660
+ evidence_quality.scope_coverage = safeDiv(trustScopeCount, trustTotalInv);
661
+
662
+ const trustMissedTriggers = trustInvocations.filter((r) => r.triggered === 0).length;
663
+ const trustWithConfidence = trustInvocations.filter((r) => r.confidence != null);
664
+ const trustAvgConfidence =
665
+ trustWithConfidence.length > 0
666
+ ? trustWithConfidence.reduce((s, r) => s + (r.confidence ?? 0), 0) /
667
+ trustWithConfidence.length
668
+ : null;
669
+ const trustLowConfCount = trustWithConfidence.filter((r) => (r.confidence ?? 0) < 0.5).length;
670
+
671
+ routing_quality.missed_triggers = trustMissedTriggers;
672
+ routing_quality.miss_rate = safeDiv(trustMissedTriggers, trustTotalInv);
673
+ routing_quality.avg_confidence = trustAvgConfidence;
674
+ routing_quality.confidence_coverage = safeDiv(trustConfCount, trustTotalInv);
675
+ routing_quality.low_confidence_rate =
676
+ trustWithConfidence.length > 0 ? safeDiv(trustLowConfCount, trustWithConfidence.length) : null;
677
+
678
+ // Examples (limit 10 per category)
679
+ type ExampleRowInternal = {
680
+ timestamp: string | null;
681
+ session_id: string;
682
+ query_text: string;
683
+ triggered: boolean;
684
+ confidence: number | null;
685
+ invocation_mode: string | null;
686
+ prompt_kind: string | null;
687
+ source: string | null;
688
+ platform: string | null;
689
+ workspace_path: string | null;
690
+ query_origin: "inline_query" | "matched_prompt" | "missing";
691
+ is_system_like: boolean;
692
+ observation_kind:
693
+ | "canonical"
694
+ | "repaired_trigger"
695
+ | "repaired_contextual_miss"
696
+ | "legacy_materialized";
697
+ historical_context: "previously_missed" | null;
698
+ };
699
+
700
+ const goodExamples: ExampleRowInternal[] = [];
701
+ const missedExamples: ExampleRowInternal[] = [];
702
+ const noisyExamples: ExampleRowInternal[] = [];
703
+
704
+ for (const inv of dedupeTrustInvocations(trustInvocationsRaw)) {
705
+ const queryText = inv.queryText;
706
+ const sysLike = inv.isPolluting;
707
+ const queryOrigin: "inline_query" | "matched_prompt" | "missing" =
708
+ inv.inline_query != null && inv.inline_query !== ""
709
+ ? "inline_query"
710
+ : inv.matched_prompt_id != null
711
+ ? "matched_prompt"
712
+ : "missing";
713
+ const row: ExampleRowInternal = {
714
+ timestamp: inv.timestamp,
715
+ session_id: inv.session_id,
716
+ query_text: queryText,
717
+ triggered: inv.triggered === 1,
718
+ confidence: inv.confidence,
719
+ invocation_mode: inv.invocation_mode,
720
+ prompt_kind: inv.prompt_kind,
721
+ source: inv.source,
722
+ platform: inv.platform,
723
+ workspace_path: inv.workspace_path,
724
+ query_origin: queryOrigin,
725
+ is_system_like: sysLike,
726
+ observation_kind: inv.observation_kind,
727
+ historical_context: inv.historical_context,
728
+ };
729
+
730
+ if (inv.triggered === 0 && missedExamples.length < 10) {
731
+ missedExamples.push(row);
732
+ } else if (
733
+ inv.triggered === 1 &&
734
+ queryText !== "" &&
735
+ (queryOrigin === "inline_query" || inv.prompt_kind === "user" || inv.prompt_kind == null) &&
736
+ goodExamples.length < 10
737
+ ) {
738
+ goodExamples.push(row);
739
+ }
740
+ }
741
+
742
+ for (const inv of enrichedInvocations) {
743
+ if (!inv.isPolluting || noisyExamples.length >= 10) continue;
744
+ const queryOrigin: "inline_query" | "matched_prompt" | "missing" =
745
+ inv.inline_query != null && inv.inline_query !== ""
746
+ ? "inline_query"
747
+ : inv.matched_prompt_id != null
748
+ ? "matched_prompt"
749
+ : "missing";
750
+ noisyExamples.push({
751
+ timestamp: inv.timestamp,
752
+ session_id: inv.session_id,
753
+ query_text: inv.queryText,
754
+ triggered: inv.triggered === 1,
755
+ confidence: inv.confidence,
756
+ invocation_mode: inv.invocation_mode,
757
+ prompt_kind: inv.prompt_kind,
758
+ source: inv.source,
759
+ platform: inv.platform,
760
+ workspace_path: inv.workspace_path,
761
+ query_origin: queryOrigin,
762
+ is_system_like: true,
763
+ observation_kind: inv.observation_kind,
764
+ historical_context: null,
765
+ });
766
+ }
767
+
768
+ const examples = {
769
+ good: goodExamples,
770
+ missed: missedExamples,
771
+ noisy: noisyExamples,
772
+ };
773
+
774
+ // Trust state determination
775
+ type TrustStateType =
776
+ | "low_sample"
777
+ | "observed"
778
+ | "watch"
779
+ | "validated"
780
+ | "deployed"
781
+ | "rolled_back";
782
+ let trustState: TrustStateType;
783
+ let trustSummary: string;
784
+
785
+ if (coverage.checks < 5) {
786
+ trustState = "low_sample";
787
+ trustSummary = `Too few operational observations to assess trust — only ${coverage.checks} checks recorded.`;
788
+ } else if (latestAuditRow?.action === "rolled_back") {
789
+ trustState = "rolled_back";
790
+ trustSummary = "Recent evolution was rolled back — review evidence before re-deploying.";
791
+ } else if (latestAuditRow?.action === "deployed") {
792
+ trustState = "deployed";
793
+ trustSummary = `Deployed evolution; ${evolution_state.evidence_rows} evidence rows support current state.`;
794
+ } else if (latestAuditRow?.action === "validated" || latestAuditRow?.action === "approved") {
795
+ trustState = "validated";
796
+ trustSummary = "Validated with evidence but not yet deployed.";
797
+ } else if (
798
+ routing_quality.missed_triggers > 0 ||
799
+ evidence_quality.system_like_rate > 0.1 ||
800
+ evidence_quality.prompt_link_rate < 0.3
801
+ ) {
802
+ trustState = "watch";
803
+ const reasons: string[] = [];
804
+ if (routing_quality.missed_triggers > 0)
805
+ reasons.push(`${routing_quality.missed_triggers} missed triggers`);
806
+ if (evidence_quality.system_like_rate > 0.1)
807
+ reasons.push(`${(evidence_quality.system_like_rate * 100).toFixed(0)}% system-like queries`);
808
+ if (evidence_quality.prompt_link_rate < 0.3)
809
+ reasons.push(
810
+ `low prompt link rate (${(evidence_quality.prompt_link_rate * 100).toFixed(0)}%)`,
811
+ );
812
+ trustSummary = `Needs attention — ${reasons.join(", ")}.`;
813
+ } else {
814
+ trustState = "observed";
815
+ const qualityDesc =
816
+ evidence_quality.prompt_link_rate > 0.7
817
+ ? "strong"
818
+ : evidence_quality.prompt_link_rate > 0.4
819
+ ? "moderate"
820
+ : "sparse";
821
+ trustSummary = `Observed in ${coverage.sessions} sessions across ${coverage.workspaces} workspaces; evidence is ${qualityDesc}.`;
822
+ }
823
+
824
+ const trust = { state: trustState, summary: trustSummary };
825
+
289
826
  return Response.json({
290
827
  ...report,
291
828
  evolution: evolutionWithSnapshot,
@@ -294,12 +831,32 @@ export function handleSkillReport(
294
831
  total_input_tokens: executionRow?.total_input_tokens ?? 0,
295
832
  total_output_tokens: executionRow?.total_output_tokens ?? 0,
296
833
  },
297
- canonical_invocations: invPageRows.map((i) => ({
298
- ...i,
834
+ canonical_invocations: trustInvocations.slice(0, invLimit).map((i) => ({
835
+ timestamp: i.timestamp,
836
+ session_id: i.session_id,
837
+ skill_name: i.skill_name,
838
+ invocation_mode: i.invocation_mode,
299
839
  triggered: i.triggered === 1,
840
+ confidence: i.confidence,
841
+ tool_name: i.tool_name,
842
+ agent_type: i.agent_type,
843
+ query: i.queryText,
844
+ source: i.source,
845
+ skill_path: i.skill_path,
846
+ skill_scope: i.skill_scope,
847
+ observation_kind: i.observation_kind,
848
+ historical_context: i.historical_context,
300
849
  })),
301
850
  invocations_pagination:
302
- invNextCursor || invCursor ? { next_cursor: invNextCursor, has_more: invHasMore } : undefined,
851
+ trustInvocations.length > invLimit
852
+ ? {
853
+ next_cursor: {
854
+ timestamp: trustInvocations[invLimit - 1]!.timestamp!,
855
+ id: trustInvocations[invLimit - 1]!.skill_invocation_id,
856
+ },
857
+ has_more: true,
858
+ }
859
+ : undefined,
303
860
  duration_stats: {
304
861
  avg_duration_ms: executionRow?.avg_duration_ms ?? 0,
305
862
  total_duration_ms: executionRow?.total_duration_ms ?? 0,
@@ -315,5 +872,12 @@ export function handleSkillReport(
315
872
  })),
316
873
  session_metadata: sessionMeta,
317
874
  description_quality: descriptionQuality,
875
+ trust,
876
+ coverage,
877
+ evidence_quality,
878
+ routing_quality,
879
+ evolution_state,
880
+ data_hygiene,
881
+ examples,
318
882
  });
319
883
  }