selftune 0.2.18 → 0.2.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -4
- package/apps/local-dashboard/dist/assets/index-DnhnXQm6.js +60 -0
- package/apps/local-dashboard/dist/assets/index-_EcLywDg.css +1 -0
- package/apps/local-dashboard/dist/assets/vendor-table-BIiI3YhS.js +1 -0
- package/apps/local-dashboard/dist/assets/vendor-ui-CGEmUayx.js +12 -0
- package/apps/local-dashboard/dist/index.html +5 -5
- package/cli/selftune/alpha-upload/stage-canonical.ts +7 -6
- package/cli/selftune/constants.ts +10 -0
- package/cli/selftune/contribute/contribute.ts +30 -2
- package/cli/selftune/contribution-config.ts +249 -0
- package/cli/selftune/contribution-relay.ts +177 -0
- package/cli/selftune/contribution-signals.ts +219 -0
- package/cli/selftune/contribution-staging.ts +147 -0
- package/cli/selftune/contributions.ts +532 -0
- package/cli/selftune/creator-contributions.ts +333 -0
- package/cli/selftune/dashboard-contract.ts +205 -1
- package/cli/selftune/dashboard-server.ts +45 -11
- package/cli/selftune/eval/family-overlap.ts +395 -0
- package/cli/selftune/eval/hooks-to-evals.ts +182 -28
- package/cli/selftune/eval/synthetic-evals.ts +298 -11
- package/cli/selftune/export.ts +2 -2
- package/cli/selftune/index.ts +41 -5
- package/cli/selftune/ingestors/codex-rollout.ts +31 -35
- package/cli/selftune/ingestors/codex-wrapper.ts +32 -24
- package/cli/selftune/localdb/db.ts +2 -2
- package/cli/selftune/localdb/queries.ts +701 -30
- package/cli/selftune/localdb/schema.ts +20 -0
- package/cli/selftune/recover.ts +153 -0
- package/cli/selftune/repair/skill-usage.ts +363 -4
- package/cli/selftune/routes/actions.ts +35 -1
- package/cli/selftune/routes/analytics.ts +14 -0
- package/cli/selftune/routes/index.ts +1 -0
- package/cli/selftune/routes/overview.ts +112 -4
- package/cli/selftune/routes/skill-report.ts +569 -10
- package/cli/selftune/status.ts +81 -2
- package/cli/selftune/sync.ts +56 -2
- package/cli/selftune/trust-model.ts +66 -0
- package/cli/selftune/types.ts +49 -0
- package/cli/selftune/utils/skill-detection.ts +43 -0
- package/cli/selftune/watchlist.ts +65 -0
- package/package.json +1 -1
- package/packages/ui/src/components/ActivityTimeline.tsx +165 -150
- package/packages/ui/src/components/EvidenceViewer.tsx +335 -144
- package/packages/ui/src/components/EvolutionTimeline.tsx +58 -28
- package/packages/ui/src/components/OrchestrateRunsPanel.tsx +33 -16
- package/packages/ui/src/components/RecentActivityFeed.tsx +72 -41
- package/packages/ui/src/components/section-cards.tsx +12 -9
- package/packages/ui/src/primitives/card.tsx +1 -1
- package/skill/SKILL.md +11 -1
- package/skill/Workflows/AlphaUpload.md +4 -0
- package/skill/Workflows/Composability.md +64 -0
- package/skill/Workflows/Contribute.md +6 -3
- package/skill/Workflows/Contributions.md +97 -0
- package/skill/Workflows/CreatorContributions.md +74 -0
- package/skill/Workflows/Dashboard.md +31 -0
- package/skill/Workflows/Evals.md +57 -8
- package/skill/Workflows/Ingest.md +7 -0
- package/skill/Workflows/Initialize.md +20 -1
- package/skill/Workflows/Recover.md +84 -0
- package/skill/Workflows/RepairSkillUsage.md +12 -4
- package/skill/Workflows/Sync.md +18 -12
- package/apps/local-dashboard/dist/assets/index-BMIS6uUh.css +0 -2
- package/apps/local-dashboard/dist/assets/index-DOu3iLD9.js +0 -16
- package/apps/local-dashboard/dist/assets/vendor-table-pHbDxq36.js +0 -8
- package/apps/local-dashboard/dist/assets/vendor-ui-DIwlrGlb.js +0 -12
|
@@ -120,6 +120,8 @@ export function handleSkillReport(
|
|
|
120
120
|
query: string | null;
|
|
121
121
|
source: string | null;
|
|
122
122
|
skill_invocation_id: string;
|
|
123
|
+
capture_mode: string | null;
|
|
124
|
+
raw_source_ref: string | null;
|
|
123
125
|
}>;
|
|
124
126
|
|
|
125
127
|
if (invCursor) {
|
|
@@ -128,7 +130,7 @@ export function handleSkillReport(
|
|
|
128
130
|
`SELECT si.occurred_at as timestamp, si.session_id, si.skill_name,
|
|
129
131
|
si.invocation_mode, si.triggered, si.confidence, si.tool_name,
|
|
130
132
|
si.agent_type, COALESCE(si.query, p.prompt_text) as query, si.source,
|
|
131
|
-
si.skill_invocation_id
|
|
133
|
+
si.skill_invocation_id, si.capture_mode, si.raw_source_ref
|
|
132
134
|
FROM skill_invocations si
|
|
133
135
|
LEFT JOIN prompts p ON si.matched_prompt_id = p.prompt_id
|
|
134
136
|
WHERE si.skill_name = ?
|
|
@@ -149,7 +151,7 @@ export function handleSkillReport(
|
|
|
149
151
|
`SELECT si.occurred_at as timestamp, si.session_id, si.skill_name,
|
|
150
152
|
si.invocation_mode, si.triggered, si.confidence, si.tool_name,
|
|
151
153
|
si.agent_type, COALESCE(si.query, p.prompt_text) as query, si.source,
|
|
152
|
-
si.skill_invocation_id
|
|
154
|
+
si.skill_invocation_id, si.capture_mode, si.raw_source_ref
|
|
153
155
|
FROM skill_invocations si
|
|
154
156
|
LEFT JOIN prompts p ON si.matched_prompt_id = p.prompt_id
|
|
155
157
|
WHERE si.skill_name = ?
|
|
@@ -163,11 +165,6 @@ export function handleSkillReport(
|
|
|
163
165
|
const invPageRows = invHasMore
|
|
164
166
|
? invocationsWithConfidence.slice(0, invLimit)
|
|
165
167
|
: invocationsWithConfidence;
|
|
166
|
-
const invLastRow = invPageRows[invPageRows.length - 1];
|
|
167
|
-
const invNextCursor =
|
|
168
|
-
invHasMore && invLastRow
|
|
169
|
-
? { timestamp: invLastRow.timestamp, id: invLastRow.skill_invocation_id }
|
|
170
|
-
: null;
|
|
171
168
|
|
|
172
169
|
// Not-found check — after all enrichment queries so evidence-only skills aren't 404'd
|
|
173
170
|
const hasData =
|
|
@@ -286,6 +283,541 @@ export function handleSkillReport(
|
|
|
286
283
|
? scoreDescription(currentDescriptionText, skillName)
|
|
287
284
|
: null;
|
|
288
285
|
|
|
286
|
+
// ── Trust field computation ──────────────────────────────────────────────
|
|
287
|
+
|
|
288
|
+
// Prefixes that, once leading whitespace is stripped, mark a prompt as
// injected system/command text rather than something a user typed.
const SYSTEM_LIKE_PREFIXES = ["<system_instruction>", "<system-instruction>", "<command-name>"];

// Lower-case substrings looked for inside "meta" prompts; the text under
// test is lower-cased before matching, so matching is case-insensitive.
const INTERNAL_EVAL_MARKERS = [
  "you are an evaluation assistant",
  "you are a skill description optimizer",
  "would each query trigger this skill",
  "propose an improved description",
  "failure patterns:",
  "output only valid json",
];

/**
 * Reports whether `text` begins (ignoring leading whitespace) with one of
 * SYSTEM_LIKE_PREFIXES. Null, undefined and empty text are never system-like.
 */
const isSystemLike = (text: string | null | undefined): boolean => {
  if (!text) return false;
  const trimmed = text.trimStart();
  return SYSTEM_LIKE_PREFIXES.some((prefix) => trimmed.startsWith(prefix));
};

/**
 * Reports whether a prompt of kind "meta" contains one of
 * INTERNAL_EVAL_MARKERS (case-insensitive substring match).
 *
 * @param text       Prompt or query text; null/undefined/empty yields false.
 * @param promptKind Prompt kind; anything other than "meta" yields false.
 */
const isInternalSelftunePrompt = (
  text: string | null | undefined,
  promptKind: string | null | undefined,
): boolean => {
  // Check the cheap discriminator first so non-meta prompts are never lower-cased.
  if (!text || promptKind !== "meta") return false;
  const lowered = text.toLowerCase();
  return INTERNAL_EVAL_MARKERS.some((marker) => lowered.includes(marker));
};

/**
 * A prompt is "polluting" when it is either system-like or an internal
 * meta prompt as defined by the two predicates above.
 */
const isPollutingPrompt = (
  text: string | null | undefined,
  promptKind: string | null | undefined,
): boolean => isSystemLike(text) || isInternalSelftunePrompt(text, promptKind);
|
|
316
|
+
/**
 * Labels an invocation row by how it was recorded.
 *
 * - ids containing ":su:" are "legacy_materialized", regardless of capture mode;
 * - rows whose capture mode is not "repair" are "canonical";
 * - repair rows that did not trigger and whose parsed raw source ref has
 *   `metadata.miss_type === "contextual_read"` are "repaired_contextual_miss";
 * - every other repair row is "repaired_trigger".
 *
 * @param skillInvocationId Invocation id, checked for the ":su:" marker.
 * @param captureMode       Capture mode of the row, or null.
 * @param triggered         Trigger flag; compared against 0.
 * @param rawSourceRefJson  Raw source reference handed to safeParseJson.
 */
const classifyObservationKind = (
  skillInvocationId: string,
  captureMode: string | null,
  triggered: number,
  rawSourceRefJson: string | null,
): "canonical" | "repaired_trigger" | "repaired_contextual_miss" | "legacy_materialized" => {
  if (skillInvocationId.includes(":su:")) return "legacy_materialized";
  if (captureMode !== "repair") return "canonical";

  // NOTE(review): safeParseJson is defined outside this block; presumably it
  // returns null on unparseable input rather than throwing — confirm.
  const parsedRef = safeParseJson(rawSourceRefJson) as {
    metadata?: { miss_type?: string };
  } | null;
  const isContextualMiss =
    triggered === 0 && parsedRef?.metadata?.miss_type === "contextual_read";
  return isContextualMiss ? "repaired_contextual_miss" : "repaired_trigger";
};
|
|
334
|
+
|
|
335
|
+
// Fetch all invocations for this skill with joined prompt + session data
|
|
336
|
+
const allInvocations = db
|
|
337
|
+
.query(
|
|
338
|
+
`SELECT si.occurred_at AS timestamp, si.session_id, si.skill_name,
|
|
339
|
+
si.invocation_mode, si.triggered, si.confidence, si.tool_name,
|
|
340
|
+
si.agent_type, si.query AS inline_query, si.source,
|
|
341
|
+
si.matched_prompt_id, si.skill_scope, si.skill_path,
|
|
342
|
+
si.skill_invocation_id, si.capture_mode, si.raw_source_ref,
|
|
343
|
+
p.prompt_text, p.prompt_kind,
|
|
344
|
+
s.platform, s.workspace_path
|
|
345
|
+
FROM skill_invocations si
|
|
346
|
+
LEFT JOIN prompts p ON si.matched_prompt_id = p.prompt_id
|
|
347
|
+
LEFT JOIN sessions s ON si.session_id = s.session_id
|
|
348
|
+
WHERE si.skill_name = ?
|
|
349
|
+
ORDER BY si.occurred_at DESC`,
|
|
350
|
+
)
|
|
351
|
+
.all(skillName) as Array<{
|
|
352
|
+
timestamp: string | null;
|
|
353
|
+
session_id: string;
|
|
354
|
+
skill_name: string;
|
|
355
|
+
invocation_mode: string | null;
|
|
356
|
+
triggered: number;
|
|
357
|
+
confidence: number | null;
|
|
358
|
+
tool_name: string | null;
|
|
359
|
+
agent_type: string | null;
|
|
360
|
+
inline_query: string | null;
|
|
361
|
+
source: string | null;
|
|
362
|
+
matched_prompt_id: string | null;
|
|
363
|
+
skill_scope: string | null;
|
|
364
|
+
skill_path: string | null;
|
|
365
|
+
skill_invocation_id: string;
|
|
366
|
+
capture_mode: string | null;
|
|
367
|
+
raw_source_ref: string | null;
|
|
368
|
+
prompt_text: string | null;
|
|
369
|
+
prompt_kind: string | null;
|
|
370
|
+
platform: string | null;
|
|
371
|
+
workspace_path: string | null;
|
|
372
|
+
}>;
|
|
373
|
+
|
|
374
|
+
const totalInv = allInvocations.length;
|
|
375
|
+
/** Returns num / den, or 0 whenever den is not strictly greater than zero. */
const safeDiv = (num: number, den: number): number => {
  if (den > 0) return num / den;
  return 0;
};
|
|
376
|
+
|
|
377
|
+
// Coverage
|
|
378
|
+
const distinctSessions = new Set(allInvocations.map((r) => r.session_id));
|
|
379
|
+
const distinctWorkspaces = new Set(allInvocations.map((r) => r.workspace_path).filter(Boolean));
|
|
380
|
+
const allTimestamps = allInvocations
|
|
381
|
+
.map((r) => r.timestamp)
|
|
382
|
+
.filter((t): t is string => t != null);
|
|
383
|
+
const coverage = {
|
|
384
|
+
checks: report.usage.total_checks,
|
|
385
|
+
sessions: distinctSessions.size,
|
|
386
|
+
workspaces: distinctWorkspaces.size,
|
|
387
|
+
first_seen: allTimestamps.length > 0 ? allTimestamps[allTimestamps.length - 1] : null,
|
|
388
|
+
last_seen: allTimestamps.length > 0 ? allTimestamps[0] : null,
|
|
389
|
+
};
|
|
390
|
+
|
|
391
|
+
// Evidence quality
|
|
392
|
+
let promptLinked = 0;
|
|
393
|
+
let inlineQueryCount = 0;
|
|
394
|
+
let userPromptCount = 0;
|
|
395
|
+
let metaPromptCount = 0;
|
|
396
|
+
let internalPromptCount = 0;
|
|
397
|
+
let noPromptCount = 0;
|
|
398
|
+
let systemLikeCount = 0;
|
|
399
|
+
let invModeCount = 0;
|
|
400
|
+
let confCount = 0;
|
|
401
|
+
let sourceCount = 0;
|
|
402
|
+
let scopeCount = 0;
|
|
403
|
+
|
|
404
|
+
for (const inv of allInvocations) {
|
|
405
|
+
const queryText = inv.inline_query || inv.prompt_text || "";
|
|
406
|
+
if (inv.matched_prompt_id != null) promptLinked++;
|
|
407
|
+
if (inv.inline_query != null && inv.inline_query !== "") inlineQueryCount++;
|
|
408
|
+
if (inv.prompt_kind === "user") userPromptCount++;
|
|
409
|
+
if (inv.prompt_kind === "meta") metaPromptCount++;
|
|
410
|
+
if (isInternalSelftunePrompt(queryText, inv.prompt_kind)) internalPromptCount++;
|
|
411
|
+
if (inv.matched_prompt_id == null && (inv.inline_query == null || inv.inline_query === ""))
|
|
412
|
+
noPromptCount++;
|
|
413
|
+
if (isPollutingPrompt(queryText, inv.prompt_kind)) systemLikeCount++;
|
|
414
|
+
if (inv.invocation_mode != null && inv.invocation_mode !== "") invModeCount++;
|
|
415
|
+
if (inv.confidence != null) confCount++;
|
|
416
|
+
if (inv.source != null && inv.source !== "") sourceCount++;
|
|
417
|
+
if (inv.skill_scope != null && inv.skill_scope !== "") scopeCount++;
|
|
418
|
+
}
|
|
419
|
+
|
|
420
|
+
const evidence_quality = {
|
|
421
|
+
prompt_link_rate: safeDiv(promptLinked, totalInv),
|
|
422
|
+
inline_query_rate: safeDiv(inlineQueryCount, totalInv),
|
|
423
|
+
user_prompt_rate: safeDiv(userPromptCount, totalInv),
|
|
424
|
+
meta_prompt_rate: safeDiv(metaPromptCount, totalInv),
|
|
425
|
+
internal_prompt_rate: safeDiv(internalPromptCount, totalInv),
|
|
426
|
+
no_prompt_rate: safeDiv(noPromptCount, totalInv),
|
|
427
|
+
system_like_rate: safeDiv(systemLikeCount, totalInv),
|
|
428
|
+
invocation_mode_coverage: safeDiv(invModeCount, totalInv),
|
|
429
|
+
confidence_coverage: safeDiv(confCount, totalInv),
|
|
430
|
+
source_coverage: safeDiv(sourceCount, totalInv),
|
|
431
|
+
scope_coverage: safeDiv(scopeCount, totalInv),
|
|
432
|
+
};
|
|
433
|
+
|
|
434
|
+
// Routing quality
|
|
435
|
+
const missedTriggers = allInvocations.filter((r) => r.triggered === 0).length;
|
|
436
|
+
const withConfidence = allInvocations.filter((r) => r.confidence != null);
|
|
437
|
+
const avgConfidence =
|
|
438
|
+
withConfidence.length > 0
|
|
439
|
+
? withConfidence.reduce((s, r) => s + (r.confidence ?? 0), 0) / withConfidence.length
|
|
440
|
+
: null;
|
|
441
|
+
const lowConfCount = withConfidence.filter((r) => (r.confidence ?? 0) < 0.5).length;
|
|
442
|
+
|
|
443
|
+
const routing_quality = {
|
|
444
|
+
missed_triggers: missedTriggers,
|
|
445
|
+
miss_rate: safeDiv(missedTriggers, totalInv),
|
|
446
|
+
avg_confidence: avgConfidence,
|
|
447
|
+
confidence_coverage: safeDiv(confCount, totalInv),
|
|
448
|
+
low_confidence_rate:
|
|
449
|
+
withConfidence.length > 0 ? safeDiv(lowConfCount, withConfidence.length) : null,
|
|
450
|
+
};
|
|
451
|
+
|
|
452
|
+
// Evolution state
|
|
453
|
+
const evidenceCountRow = db
|
|
454
|
+
.query(`SELECT COUNT(*) AS cnt FROM evolution_evidence WHERE skill_name = ?`)
|
|
455
|
+
.get(skillName) as { cnt: number } | null;
|
|
456
|
+
const evolutionCountRow = db
|
|
457
|
+
.query(
|
|
458
|
+
`SELECT COUNT(*) AS cnt FROM evolution_audit
|
|
459
|
+
WHERE skill_name = ? OR (skill_name IS NULL AND proposal_id LIKE 'evo-' || ? || '-%')`,
|
|
460
|
+
)
|
|
461
|
+
.get(skillName, skillName) as { cnt: number } | null;
|
|
462
|
+
const latestAuditRow = db
|
|
463
|
+
.query(
|
|
464
|
+
`SELECT action, timestamp FROM evolution_audit
|
|
465
|
+
WHERE (skill_name = ? OR (skill_name IS NULL AND proposal_id LIKE 'evo-' || ? || '-%'))
|
|
466
|
+
AND action IN ('deployed', 'rolled_back', 'validated', 'proposed', 'approved')
|
|
467
|
+
ORDER BY timestamp DESC LIMIT 1`,
|
|
468
|
+
)
|
|
469
|
+
.get(skillName, skillName) as { action: string; timestamp: string } | null;
|
|
470
|
+
|
|
471
|
+
const evolution_state = {
|
|
472
|
+
has_evidence: (evidenceCountRow?.cnt ?? 0) > 0,
|
|
473
|
+
has_pending_proposals: pending_proposals.length > 0,
|
|
474
|
+
latest_action: latestAuditRow?.action ?? null,
|
|
475
|
+
latest_timestamp: latestAuditRow?.timestamp ?? null,
|
|
476
|
+
evidence_rows: evidenceCountRow?.cnt ?? 0,
|
|
477
|
+
evolution_rows: evolutionCountRow?.cnt ?? 0,
|
|
478
|
+
};
|
|
479
|
+
|
|
480
|
+
// Data hygiene
|
|
481
|
+
const namingVariants = db
|
|
482
|
+
.query(`SELECT DISTINCT skill_name FROM skill_invocations WHERE lower(skill_name) = lower(?)`)
|
|
483
|
+
.all(skillName) as Array<{ skill_name: string }>;
|
|
484
|
+
|
|
485
|
+
const sourceBreakdown = db
|
|
486
|
+
.query(
|
|
487
|
+
`SELECT COALESCE(source, '(null)') AS source, COUNT(*) AS count
|
|
488
|
+
FROM skill_invocations WHERE skill_name = ?
|
|
489
|
+
GROUP BY source ORDER BY count DESC`,
|
|
490
|
+
)
|
|
491
|
+
.all(skillName) as Array<{ source: string; count: number }>;
|
|
492
|
+
|
|
493
|
+
const promptKindBreakdown = db
|
|
494
|
+
.query(
|
|
495
|
+
`SELECT COALESCE(p.prompt_kind, '(null)') AS kind, COUNT(*) AS count
|
|
496
|
+
FROM skill_invocations si
|
|
497
|
+
LEFT JOIN prompts p ON si.matched_prompt_id = p.prompt_id
|
|
498
|
+
WHERE si.skill_name = ?
|
|
499
|
+
GROUP BY p.prompt_kind ORDER BY count DESC`,
|
|
500
|
+
)
|
|
501
|
+
.all(skillName) as Array<{ kind: string; count: number }>;
|
|
502
|
+
|
|
503
|
+
const observationBreakdownMap = new Map<
|
|
504
|
+
"canonical" | "repaired_trigger" | "repaired_contextual_miss" | "legacy_materialized",
|
|
505
|
+
number
|
|
506
|
+
>();
|
|
507
|
+
const enrichedInvocations = allInvocations.map((inv) => {
|
|
508
|
+
const queryText = inv.inline_query || inv.prompt_text || "";
|
|
509
|
+
const isPolluting = isPollutingPrompt(queryText, inv.prompt_kind);
|
|
510
|
+
const observation_kind = classifyObservationKind(
|
|
511
|
+
inv.skill_invocation_id,
|
|
512
|
+
inv.capture_mode,
|
|
513
|
+
inv.triggered,
|
|
514
|
+
inv.raw_source_ref,
|
|
515
|
+
);
|
|
516
|
+
return {
|
|
517
|
+
...inv,
|
|
518
|
+
queryText,
|
|
519
|
+
isPolluting,
|
|
520
|
+
observation_kind,
|
|
521
|
+
};
|
|
522
|
+
});
|
|
523
|
+
|
|
524
|
+
for (const inv of enrichedInvocations) {
|
|
525
|
+
observationBreakdownMap.set(
|
|
526
|
+
inv.observation_kind,
|
|
527
|
+
(observationBreakdownMap.get(inv.observation_kind) ?? 0) + 1,
|
|
528
|
+
);
|
|
529
|
+
}
|
|
530
|
+
|
|
531
|
+
const trustInvocationsRaw = enrichedInvocations.filter(
|
|
532
|
+
(inv) => inv.observation_kind !== "legacy_materialized",
|
|
533
|
+
);
|
|
534
|
+
|
|
535
|
+
/** Collapses whitespace runs to one space, trims, and lower-cases a query so near-identical queries group together. */
const normalizeQueryForGrouping = (query: string) =>
  query.replace(/\s+/g, " ").trim().toLowerCase();

/**
 * Ranking weight used to choose the representative row of a duplicate group:
 * triggered rows (+100) outrank everything, then "canonical" (+20) and
 * "repaired_trigger" (+15) observations, then rows carrying a confidence (+5).
 */
const trustInvocationRank = (row: {
  triggered: number;
  observation_kind: string;
  confidence: number | null;
}): number =>
  (row.triggered === 1 ? 100 : 0) +
  (row.observation_kind === "canonical" ? 20 : 0) +
  (row.observation_kind === "repaired_trigger" ? 15 : 0) +
  (row.confidence != null ? 5 : 0);

/**
 * Collapses rows that share a session and a normalized query into a single
 * representative row.
 *
 * Rows whose normalized query is empty are keyed by their own
 * `skill_invocation_id`, so they are only merged with rows carrying the same id.
 * Within a group the representative is the row with the highest
 * trustInvocationRank; ties go to the later timestamp (null sorts as ""),
 * and remaining ties to the row that appeared first in `rows`.
 *
 * @param rows Invocation rows to deduplicate; not mutated.
 * @returns One row per group, newest timestamp first, each extended with
 *   `historical_context`: "previously_missed" when the representative
 *   triggered but another row of its group did not, otherwise null.
 */
const dedupeTrustInvocations = <T extends (typeof trustInvocationsRaw)[number]>(rows: T[]) => {
  const grouped = new Map<string, T[]>();
  for (const row of rows) {
    const normalizedQuery = normalizeQueryForGrouping(row.queryText);
    const key =
      normalizedQuery.length > 0
        ? `${row.session_id}::${normalizedQuery}`
        : `${row.skill_invocation_id}`;
    const bucket = grouped.get(key);
    if (bucket) bucket.push(row);
    else grouped.set(key, [row]);
  }

  // Linear scan instead of sorting each group: only the best row is needed.
  // A candidate replaces the current best only when strictly better, which
  // keeps the earliest row on full ties (same result as a stable sort).
  const pickPrimary = (group: T[]): T =>
    group.reduce((best, candidate) => {
      const rankDelta = trustInvocationRank(candidate) - trustInvocationRank(best);
      if (rankDelta !== 0) return rankDelta > 0 ? candidate : best;
      const newer = (candidate.timestamp ?? "").localeCompare(best.timestamp ?? "") > 0;
      return newer ? candidate : best;
    });

  return [...grouped.values()]
    .map((group) => {
      // Groups are created with one row and only ever grow, so reduce() without
      // an initial value is safe here.
      const primary = pickPrimary(group);
      return {
        ...primary,
        historical_context:
          primary.triggered === 1 && group.some((row) => row.triggered === 0)
            ? ("previously_missed" as const)
            : null,
      };
    })
    .sort((a, b) => (b.timestamp ?? "").localeCompare(a.timestamp ?? ""));
};
|
|
578
|
+
|
|
579
|
+
const trustInvocations = dedupeTrustInvocations(trustInvocationsRaw);
|
|
580
|
+
|
|
581
|
+
const trustTotalInv = trustInvocations.length;
|
|
582
|
+
const trustDistinctSessions = new Set(trustInvocations.map((r) => r.session_id));
|
|
583
|
+
const trustDistinctWorkspaces = new Set(
|
|
584
|
+
trustInvocations.map((r) => r.workspace_path).filter(Boolean),
|
|
585
|
+
);
|
|
586
|
+
const trustTimestamps = trustInvocations
|
|
587
|
+
.map((r) => r.timestamp)
|
|
588
|
+
.filter((t): t is string => t != null);
|
|
589
|
+
|
|
590
|
+
const legacyRows = enrichedInvocations.filter(
|
|
591
|
+
(inv) => inv.observation_kind === "legacy_materialized",
|
|
592
|
+
).length;
|
|
593
|
+
const repairedRows = enrichedInvocations.filter(
|
|
594
|
+
(inv) =>
|
|
595
|
+
inv.observation_kind === "repaired_trigger" ||
|
|
596
|
+
inv.observation_kind === "repaired_contextual_miss",
|
|
597
|
+
).length;
|
|
598
|
+
|
|
599
|
+
const data_hygiene = {
|
|
600
|
+
naming_variants: namingVariants.map((r) => r.skill_name),
|
|
601
|
+
source_breakdown: sourceBreakdown,
|
|
602
|
+
prompt_kind_breakdown: promptKindBreakdown,
|
|
603
|
+
observation_breakdown: [...observationBreakdownMap.entries()].map(([kind, count]) => ({
|
|
604
|
+
kind,
|
|
605
|
+
count,
|
|
606
|
+
})),
|
|
607
|
+
raw_checks: totalInv,
|
|
608
|
+
operational_checks: trustTotalInv,
|
|
609
|
+
internal_prompt_rows: internalPromptCount,
|
|
610
|
+
internal_prompt_rate: safeDiv(internalPromptCount, totalInv),
|
|
611
|
+
legacy_rows: legacyRows,
|
|
612
|
+
legacy_rate: safeDiv(legacyRows, totalInv),
|
|
613
|
+
repaired_rows: repairedRows,
|
|
614
|
+
repaired_rate: safeDiv(repairedRows, totalInv),
|
|
615
|
+
};
|
|
616
|
+
|
|
617
|
+
// Recompute trust-facing metrics from operational non-legacy observations.
|
|
618
|
+
const trustPromptLinked = trustInvocations.filter((inv) => inv.matched_prompt_id != null).length;
|
|
619
|
+
const trustInlineQueryCount = trustInvocations.filter(
|
|
620
|
+
(inv) => inv.inline_query != null && inv.inline_query !== "",
|
|
621
|
+
).length;
|
|
622
|
+
const trustUserPromptCount = trustInvocations.filter((inv) => inv.prompt_kind === "user").length;
|
|
623
|
+
const trustMetaPromptCount = trustInvocations.filter((inv) => inv.prompt_kind === "meta").length;
|
|
624
|
+
const trustNoPromptCount = trustInvocations.filter(
|
|
625
|
+
(inv) => inv.matched_prompt_id == null && (inv.inline_query == null || inv.inline_query === ""),
|
|
626
|
+
).length;
|
|
627
|
+
const trustSystemLikeCount = trustInvocations.filter((inv) => inv.isPolluting).length;
|
|
628
|
+
const trustInvModeCount = trustInvocations.filter(
|
|
629
|
+
(inv) => inv.invocation_mode != null && inv.invocation_mode !== "",
|
|
630
|
+
).length;
|
|
631
|
+
const trustConfCount = trustInvocations.filter((inv) => inv.confidence != null).length;
|
|
632
|
+
const trustSourceCount = trustInvocations.filter(
|
|
633
|
+
(inv) => inv.source != null && inv.source !== "",
|
|
634
|
+
).length;
|
|
635
|
+
const trustScopeCount = trustInvocations.filter(
|
|
636
|
+
(inv) => inv.skill_scope != null && inv.skill_scope !== "",
|
|
637
|
+
).length;
|
|
638
|
+
|
|
639
|
+
coverage.checks = trustTotalInv;
|
|
640
|
+
coverage.sessions = trustDistinctSessions.size;
|
|
641
|
+
coverage.workspaces = trustDistinctWorkspaces.size;
|
|
642
|
+
coverage.first_seen =
|
|
643
|
+
trustTimestamps.length > 0 ? trustTimestamps[trustTimestamps.length - 1] : null;
|
|
644
|
+
coverage.last_seen = trustTimestamps.length > 0 ? trustTimestamps[0] : null;
|
|
645
|
+
|
|
646
|
+
evidence_quality.prompt_link_rate = safeDiv(trustPromptLinked, trustTotalInv);
|
|
647
|
+
evidence_quality.inline_query_rate = safeDiv(trustInlineQueryCount, trustTotalInv);
|
|
648
|
+
evidence_quality.user_prompt_rate = safeDiv(trustUserPromptCount, trustTotalInv);
|
|
649
|
+
evidence_quality.meta_prompt_rate = safeDiv(trustMetaPromptCount, trustTotalInv);
|
|
650
|
+
evidence_quality.no_prompt_rate = safeDiv(trustNoPromptCount, trustTotalInv);
|
|
651
|
+
evidence_quality.system_like_rate = safeDiv(trustSystemLikeCount, trustTotalInv);
|
|
652
|
+
evidence_quality.invocation_mode_coverage = safeDiv(trustInvModeCount, trustTotalInv);
|
|
653
|
+
evidence_quality.confidence_coverage = safeDiv(trustConfCount, trustTotalInv);
|
|
654
|
+
evidence_quality.source_coverage = safeDiv(trustSourceCount, trustTotalInv);
|
|
655
|
+
evidence_quality.scope_coverage = safeDiv(trustScopeCount, trustTotalInv);
|
|
656
|
+
|
|
657
|
+
const trustMissedTriggers = trustInvocations.filter((r) => r.triggered === 0).length;
|
|
658
|
+
const trustWithConfidence = trustInvocations.filter((r) => r.confidence != null);
|
|
659
|
+
const trustAvgConfidence =
|
|
660
|
+
trustWithConfidence.length > 0
|
|
661
|
+
? trustWithConfidence.reduce((s, r) => s + (r.confidence ?? 0), 0) /
|
|
662
|
+
trustWithConfidence.length
|
|
663
|
+
: null;
|
|
664
|
+
const trustLowConfCount = trustWithConfidence.filter((r) => (r.confidence ?? 0) < 0.5).length;
|
|
665
|
+
|
|
666
|
+
routing_quality.missed_triggers = trustMissedTriggers;
|
|
667
|
+
routing_quality.miss_rate = safeDiv(trustMissedTriggers, trustTotalInv);
|
|
668
|
+
routing_quality.avg_confidence = trustAvgConfidence;
|
|
669
|
+
routing_quality.confidence_coverage = safeDiv(trustConfCount, trustTotalInv);
|
|
670
|
+
routing_quality.low_confidence_rate =
|
|
671
|
+
trustWithConfidence.length > 0 ? safeDiv(trustLowConfCount, trustWithConfidence.length) : null;
|
|
672
|
+
|
|
673
|
+
// Examples (limit 10 per category)
|
|
674
|
+
type ExampleRowInternal = {
|
|
675
|
+
timestamp: string | null;
|
|
676
|
+
session_id: string;
|
|
677
|
+
query_text: string;
|
|
678
|
+
triggered: boolean;
|
|
679
|
+
confidence: number | null;
|
|
680
|
+
invocation_mode: string | null;
|
|
681
|
+
prompt_kind: string | null;
|
|
682
|
+
source: string | null;
|
|
683
|
+
platform: string | null;
|
|
684
|
+
workspace_path: string | null;
|
|
685
|
+
query_origin: "inline_query" | "matched_prompt" | "missing";
|
|
686
|
+
is_system_like: boolean;
|
|
687
|
+
observation_kind:
|
|
688
|
+
| "canonical"
|
|
689
|
+
| "repaired_trigger"
|
|
690
|
+
| "repaired_contextual_miss"
|
|
691
|
+
| "legacy_materialized";
|
|
692
|
+
historical_context: "previously_missed" | null;
|
|
693
|
+
};
|
|
694
|
+
|
|
695
|
+
// Example rows surfaced in the response, capped per category.
const MAX_EXAMPLES_PER_CATEGORY = 10;
const goodExamples: ExampleRowInternal[] = [];
const missedExamples: ExampleRowInternal[] = [];
const noisyExamples: ExampleRowInternal[] = [];

// Where the displayed query text came from: a non-empty inline query wins,
// then a linked prompt, otherwise the row has no query at all.
const exampleQueryOrigin = (inv: {
  inline_query: string | null;
  matched_prompt_id: string | null;
}): ExampleRowInternal["query_origin"] => {
  if (inv.inline_query != null && inv.inline_query !== "") return "inline_query";
  return inv.matched_prompt_id != null ? "matched_prompt" : "missing";
};

// Projects an enriched invocation onto the example-row shape.
const toExampleRow = (
  inv: (typeof enrichedInvocations)[number],
  historicalContext: ExampleRowInternal["historical_context"],
): ExampleRowInternal => ({
  timestamp: inv.timestamp,
  session_id: inv.session_id,
  query_text: inv.queryText,
  triggered: inv.triggered === 1,
  confidence: inv.confidence,
  invocation_mode: inv.invocation_mode,
  prompt_kind: inv.prompt_kind,
  source: inv.source,
  platform: inv.platform,
  workspace_path: inv.workspace_path,
  query_origin: exampleQueryOrigin(inv),
  is_system_like: inv.isPolluting,
  observation_kind: inv.observation_kind,
  historical_context: historicalContext,
});

// Good / missed examples come from the deduplicated non-legacy rows. Reuse the
// already computed `trustInvocations` instead of deduplicating the raw rows again.
for (const inv of trustInvocations) {
  if (inv.triggered === 0) {
    if (missedExamples.length < MAX_EXAMPLES_PER_CATEGORY) {
      missedExamples.push(toExampleRow(inv, inv.historical_context));
    }
    continue;
  }
  // A "good" example triggered, has query text, and that text is either an
  // inline query or tied to a user prompt / a row with no prompt kind.
  const hasUsableOrigin =
    exampleQueryOrigin(inv) === "inline_query" ||
    inv.prompt_kind === "user" ||
    inv.prompt_kind == null;
  if (
    inv.triggered === 1 &&
    inv.queryText !== "" &&
    hasUsableOrigin &&
    goodExamples.length < MAX_EXAMPLES_PER_CATEGORY
  ) {
    goodExamples.push(toExampleRow(inv, inv.historical_context));
  }
}

// Noisy examples are drawn from every enriched row (legacy rows included and
// without deduplication); they never carry historical context.
for (const inv of enrichedInvocations) {
  if (noisyExamples.length >= MAX_EXAMPLES_PER_CATEGORY) break;
  if (!inv.isPolluting) continue;
  noisyExamples.push(toExampleRow(inv, null));
}

const examples = {
  good: goodExamples,
  missed: missedExamples,
  noisy: noisyExamples,
};
|
|
768
|
+
|
|
769
|
+
// Trust state determination
|
|
770
|
+
type TrustStateType =
|
|
771
|
+
| "low_sample"
|
|
772
|
+
| "observed"
|
|
773
|
+
| "watch"
|
|
774
|
+
| "validated"
|
|
775
|
+
| "deployed"
|
|
776
|
+
| "rolled_back";
|
|
777
|
+
let trustState: TrustStateType;
|
|
778
|
+
let trustSummary: string;
|
|
779
|
+
|
|
780
|
+
if (coverage.checks < 5) {
|
|
781
|
+
trustState = "low_sample";
|
|
782
|
+
trustSummary = `Too few operational observations to assess trust — only ${coverage.checks} checks recorded.`;
|
|
783
|
+
} else if (latestAuditRow?.action === "rolled_back") {
|
|
784
|
+
trustState = "rolled_back";
|
|
785
|
+
trustSummary = "Recent evolution was rolled back — review evidence before re-deploying.";
|
|
786
|
+
} else if (latestAuditRow?.action === "deployed") {
|
|
787
|
+
trustState = "deployed";
|
|
788
|
+
trustSummary = `Deployed evolution; ${evolution_state.evidence_rows} evidence rows support current state.`;
|
|
789
|
+
} else if (latestAuditRow?.action === "validated" || latestAuditRow?.action === "approved") {
|
|
790
|
+
trustState = "validated";
|
|
791
|
+
trustSummary = "Validated with evidence but not yet deployed.";
|
|
792
|
+
} else if (
|
|
793
|
+
routing_quality.missed_triggers > 0 ||
|
|
794
|
+
evidence_quality.system_like_rate > 0.1 ||
|
|
795
|
+
evidence_quality.prompt_link_rate < 0.3
|
|
796
|
+
) {
|
|
797
|
+
trustState = "watch";
|
|
798
|
+
const reasons: string[] = [];
|
|
799
|
+
if (routing_quality.missed_triggers > 0)
|
|
800
|
+
reasons.push(`${routing_quality.missed_triggers} missed triggers`);
|
|
801
|
+
if (evidence_quality.system_like_rate > 0.1)
|
|
802
|
+
reasons.push(`${(evidence_quality.system_like_rate * 100).toFixed(0)}% system-like queries`);
|
|
803
|
+
if (evidence_quality.prompt_link_rate < 0.3)
|
|
804
|
+
reasons.push(
|
|
805
|
+
`low prompt link rate (${(evidence_quality.prompt_link_rate * 100).toFixed(0)}%)`,
|
|
806
|
+
);
|
|
807
|
+
trustSummary = `Needs attention — ${reasons.join(", ")}.`;
|
|
808
|
+
} else {
|
|
809
|
+
trustState = "observed";
|
|
810
|
+
const qualityDesc =
|
|
811
|
+
evidence_quality.prompt_link_rate > 0.7
|
|
812
|
+
? "strong"
|
|
813
|
+
: evidence_quality.prompt_link_rate > 0.4
|
|
814
|
+
? "moderate"
|
|
815
|
+
: "sparse";
|
|
816
|
+
trustSummary = `Observed in ${coverage.sessions} sessions across ${coverage.workspaces} workspaces; evidence is ${qualityDesc}.`;
|
|
817
|
+
}
|
|
818
|
+
|
|
819
|
+
const trust = { state: trustState, summary: trustSummary };
|
|
820
|
+
|
|
289
821
|
return Response.json({
|
|
290
822
|
...report,
|
|
291
823
|
evolution: evolutionWithSnapshot,
|
|
@@ -294,12 +826,32 @@ export function handleSkillReport(
|
|
|
294
826
|
total_input_tokens: executionRow?.total_input_tokens ?? 0,
|
|
295
827
|
total_output_tokens: executionRow?.total_output_tokens ?? 0,
|
|
296
828
|
},
|
|
297
|
-
canonical_invocations:
|
|
298
|
-
|
|
829
|
+
canonical_invocations: trustInvocations.slice(0, invLimit).map((i) => ({
|
|
830
|
+
timestamp: i.timestamp,
|
|
831
|
+
session_id: i.session_id,
|
|
832
|
+
skill_name: i.skill_name,
|
|
833
|
+
invocation_mode: i.invocation_mode,
|
|
299
834
|
triggered: i.triggered === 1,
|
|
835
|
+
confidence: i.confidence,
|
|
836
|
+
tool_name: i.tool_name,
|
|
837
|
+
agent_type: i.agent_type,
|
|
838
|
+
query: i.queryText,
|
|
839
|
+
source: i.source,
|
|
840
|
+
skill_path: i.skill_path,
|
|
841
|
+
skill_scope: i.skill_scope,
|
|
842
|
+
observation_kind: i.observation_kind,
|
|
843
|
+
historical_context: i.historical_context,
|
|
300
844
|
})),
|
|
301
845
|
invocations_pagination:
|
|
302
|
-
|
|
846
|
+
trustInvocations.length > invLimit
|
|
847
|
+
? {
|
|
848
|
+
next_cursor: {
|
|
849
|
+
timestamp: trustInvocations[invLimit - 1]!.timestamp!,
|
|
850
|
+
id: trustInvocations[invLimit - 1]!.skill_invocation_id,
|
|
851
|
+
},
|
|
852
|
+
has_more: true,
|
|
853
|
+
}
|
|
854
|
+
: undefined,
|
|
303
855
|
duration_stats: {
|
|
304
856
|
avg_duration_ms: executionRow?.avg_duration_ms ?? 0,
|
|
305
857
|
total_duration_ms: executionRow?.total_duration_ms ?? 0,
|
|
@@ -315,5 +867,12 @@ export function handleSkillReport(
|
|
|
315
867
|
})),
|
|
316
868
|
session_metadata: sessionMeta,
|
|
317
869
|
description_quality: descriptionQuality,
|
|
870
|
+
trust,
|
|
871
|
+
coverage,
|
|
872
|
+
evidence_quality,
|
|
873
|
+
routing_quality,
|
|
874
|
+
evolution_state,
|
|
875
|
+
data_hygiene,
|
|
876
|
+
examples,
|
|
318
877
|
});
|
|
319
878
|
}
|