selftune 0.1.4 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153)
  1. package/.claude/agents/diagnosis-analyst.md +156 -0
  2. package/.claude/agents/evolution-reviewer.md +180 -0
  3. package/.claude/agents/integration-guide.md +212 -0
  4. package/.claude/agents/pattern-analyst.md +160 -0
  5. package/CHANGELOG.md +46 -1
  6. package/README.md +105 -257
  7. package/apps/local-dashboard/dist/assets/geist-cyrillic-wght-normal-CHSlOQsW.woff2 +0 -0
  8. package/apps/local-dashboard/dist/assets/geist-latin-ext-wght-normal-DMtmJ5ZE.woff2 +0 -0
  9. package/apps/local-dashboard/dist/assets/geist-latin-wght-normal-Dm3htQBi.woff2 +0 -0
  10. package/apps/local-dashboard/dist/assets/index-C4EOTFZ2.js +15 -0
  11. package/apps/local-dashboard/dist/assets/index-bl-Webyd.css +1 -0
  12. package/apps/local-dashboard/dist/assets/vendor-react-U7zYD9Rg.js +60 -0
  13. package/apps/local-dashboard/dist/assets/vendor-table-B7VF2Ipl.js +26 -0
  14. package/apps/local-dashboard/dist/assets/vendor-ui-D7_zX_qy.js +346 -0
  15. package/apps/local-dashboard/dist/favicon.png +0 -0
  16. package/apps/local-dashboard/dist/index.html +17 -0
  17. package/apps/local-dashboard/dist/logo.png +0 -0
  18. package/apps/local-dashboard/dist/logo.svg +9 -0
  19. package/assets/BeforeAfter.gif +0 -0
  20. package/assets/FeedbackLoop.gif +0 -0
  21. package/assets/logo.svg +9 -0
  22. package/assets/skill-health-badge.svg +20 -0
  23. package/cli/selftune/activation-rules.ts +171 -0
  24. package/cli/selftune/badge/badge-data.ts +108 -0
  25. package/cli/selftune/badge/badge-svg.ts +212 -0
  26. package/cli/selftune/badge/badge.ts +99 -0
  27. package/cli/selftune/canonical-export.ts +183 -0
  28. package/cli/selftune/constants.ts +103 -1
  29. package/cli/selftune/contribute/bundle.ts +314 -0
  30. package/cli/selftune/contribute/contribute.ts +214 -0
  31. package/cli/selftune/contribute/sanitize.ts +162 -0
  32. package/cli/selftune/cron/setup.ts +266 -0
  33. package/cli/selftune/dashboard-contract.ts +202 -0
  34. package/cli/selftune/dashboard-server.ts +1049 -0
  35. package/cli/selftune/dashboard.ts +43 -156
  36. package/cli/selftune/eval/baseline.ts +248 -0
  37. package/cli/selftune/eval/composability-v2.ts +273 -0
  38. package/cli/selftune/eval/composability.ts +117 -0
  39. package/cli/selftune/eval/generate-unit-tests.ts +143 -0
  40. package/cli/selftune/eval/hooks-to-evals.ts +101 -16
  41. package/cli/selftune/eval/import-skillsbench.ts +221 -0
  42. package/cli/selftune/eval/synthetic-evals.ts +172 -0
  43. package/cli/selftune/eval/unit-test-cli.ts +152 -0
  44. package/cli/selftune/eval/unit-test.ts +196 -0
  45. package/cli/selftune/evolution/deploy-proposal.ts +142 -1
  46. package/cli/selftune/evolution/evidence.ts +26 -0
  47. package/cli/selftune/evolution/evolve-body.ts +586 -0
  48. package/cli/selftune/evolution/evolve.ts +825 -116
  49. package/cli/selftune/evolution/extract-patterns.ts +105 -16
  50. package/cli/selftune/evolution/pareto.ts +314 -0
  51. package/cli/selftune/evolution/propose-body.ts +171 -0
  52. package/cli/selftune/evolution/propose-description.ts +100 -2
  53. package/cli/selftune/evolution/propose-routing.ts +166 -0
  54. package/cli/selftune/evolution/refine-body.ts +141 -0
  55. package/cli/selftune/evolution/rollback.ts +21 -4
  56. package/cli/selftune/evolution/validate-body.ts +254 -0
  57. package/cli/selftune/evolution/validate-proposal.ts +257 -35
  58. package/cli/selftune/evolution/validate-routing.ts +177 -0
  59. package/cli/selftune/grading/auto-grade.ts +200 -0
  60. package/cli/selftune/grading/grade-session.ts +513 -42
  61. package/cli/selftune/grading/pre-gates.ts +104 -0
  62. package/cli/selftune/grading/results.ts +42 -0
  63. package/cli/selftune/hooks/auto-activate.ts +185 -0
  64. package/cli/selftune/hooks/evolution-guard.ts +165 -0
  65. package/cli/selftune/hooks/prompt-log.ts +172 -2
  66. package/cli/selftune/hooks/session-stop.ts +123 -3
  67. package/cli/selftune/hooks/skill-change-guard.ts +112 -0
  68. package/cli/selftune/hooks/skill-eval.ts +119 -3
  69. package/cli/selftune/index.ts +415 -48
  70. package/cli/selftune/ingestors/claude-replay.ts +377 -0
  71. package/cli/selftune/ingestors/codex-rollout.ts +345 -46
  72. package/cli/selftune/ingestors/codex-wrapper.ts +207 -39
  73. package/cli/selftune/ingestors/openclaw-ingest.ts +573 -0
  74. package/cli/selftune/ingestors/opencode-ingest.ts +193 -17
  75. package/cli/selftune/init.ts +376 -16
  76. package/cli/selftune/last.ts +14 -5
  77. package/cli/selftune/localdb/db.ts +63 -0
  78. package/cli/selftune/localdb/materialize.ts +428 -0
  79. package/cli/selftune/localdb/queries.ts +376 -0
  80. package/cli/selftune/localdb/schema.ts +204 -0
  81. package/cli/selftune/memory/writer.ts +447 -0
  82. package/cli/selftune/monitoring/watch.ts +90 -16
  83. package/cli/selftune/normalization.ts +682 -0
  84. package/cli/selftune/observability.ts +19 -44
  85. package/cli/selftune/orchestrate.ts +1073 -0
  86. package/cli/selftune/quickstart.ts +203 -0
  87. package/cli/selftune/repair/skill-usage.ts +576 -0
  88. package/cli/selftune/schedule.ts +561 -0
  89. package/cli/selftune/status.ts +59 -33
  90. package/cli/selftune/sync.ts +627 -0
  91. package/cli/selftune/types.ts +525 -5
  92. package/cli/selftune/utils/canonical-log.ts +45 -0
  93. package/cli/selftune/utils/frontmatter.ts +217 -0
  94. package/cli/selftune/utils/hooks.ts +41 -0
  95. package/cli/selftune/utils/html.ts +27 -0
  96. package/cli/selftune/utils/llm-call.ts +103 -19
  97. package/cli/selftune/utils/math.ts +10 -0
  98. package/cli/selftune/utils/query-filter.ts +139 -0
  99. package/cli/selftune/utils/skill-discovery.ts +340 -0
  100. package/cli/selftune/utils/skill-log.ts +68 -0
  101. package/cli/selftune/utils/skill-usage-confidence.ts +18 -0
  102. package/cli/selftune/utils/transcript.ts +307 -26
  103. package/cli/selftune/utils/trigger-check.ts +89 -0
  104. package/cli/selftune/utils/tui.ts +156 -0
  105. package/cli/selftune/workflows/discover.ts +254 -0
  106. package/cli/selftune/workflows/skill-md-writer.ts +288 -0
  107. package/cli/selftune/workflows/workflows.ts +188 -0
  108. package/package.json +28 -11
  109. package/packages/telemetry-contract/README.md +11 -0
  110. package/packages/telemetry-contract/fixtures/golden.json +87 -0
  111. package/packages/telemetry-contract/fixtures/golden.test.ts +42 -0
  112. package/packages/telemetry-contract/index.ts +1 -0
  113. package/packages/telemetry-contract/package.json +19 -0
  114. package/packages/telemetry-contract/src/index.ts +2 -0
  115. package/packages/telemetry-contract/src/types.ts +163 -0
  116. package/packages/telemetry-contract/src/validators.ts +109 -0
  117. package/skill/SKILL.md +180 -33
  118. package/skill/Workflows/AutoActivation.md +145 -0
  119. package/skill/Workflows/Badge.md +124 -0
  120. package/skill/Workflows/Baseline.md +144 -0
  121. package/skill/Workflows/Composability.md +107 -0
  122. package/skill/Workflows/Contribute.md +94 -0
  123. package/skill/Workflows/Cron.md +132 -0
  124. package/skill/Workflows/Dashboard.md +214 -0
  125. package/skill/Workflows/Doctor.md +63 -14
  126. package/skill/Workflows/Evals.md +110 -18
  127. package/skill/Workflows/EvolutionMemory.md +154 -0
  128. package/skill/Workflows/Evolve.md +181 -21
  129. package/skill/Workflows/EvolveBody.md +159 -0
  130. package/skill/Workflows/Grade.md +36 -31
  131. package/skill/Workflows/ImportSkillsBench.md +117 -0
  132. package/skill/Workflows/Ingest.md +142 -21
  133. package/skill/Workflows/Initialize.md +91 -23
  134. package/skill/Workflows/Orchestrate.md +139 -0
  135. package/skill/Workflows/Replay.md +91 -0
  136. package/skill/Workflows/Rollback.md +23 -4
  137. package/skill/Workflows/Schedule.md +61 -0
  138. package/skill/Workflows/Sync.md +88 -0
  139. package/skill/Workflows/UnitTest.md +150 -0
  140. package/skill/Workflows/Watch.md +33 -1
  141. package/skill/Workflows/Workflows.md +129 -0
  142. package/skill/assets/activation-rules-default.json +26 -0
  143. package/skill/assets/multi-skill-settings.json +63 -0
  144. package/skill/assets/single-skill-settings.json +57 -0
  145. package/skill/references/invocation-taxonomy.md +2 -2
  146. package/skill/references/logs.md +164 -2
  147. package/skill/references/setup-patterns.md +65 -0
  148. package/skill/references/version-history.md +40 -0
  149. package/skill/settings_snippet.json +23 -0
  150. package/templates/activation-rules-default.json +27 -0
  151. package/templates/multi-skill-settings.json +64 -0
  152. package/templates/single-skill-settings.json +58 -0
  153. package/dashboard/index.html +0 -1119
@@ -0,0 +1,273 @@
1
+ /**
2
+ * composability-v2.ts
3
+ *
4
+ * Extended composability analysis with synergy detection and sequence extraction.
5
+ * Builds on v1 patterns but adds:
6
+ * - Synergy scores (positive = better together, negative = conflict)
7
+ * - Ordered skill sequence detection from timestamps
8
+ * - Workflow candidate flagging
9
+ *
10
+ * Pure function -- no I/O. CLI wrapper handles reading JSONL.
11
+ */
12
+
13
+ import type {
14
+ ComposabilityReportV2,
15
+ CoOccurrencePairV2,
16
+ SessionTelemetryRecord,
17
+ SkillSequence,
18
+ SkillUsageRecord,
19
+ } from "../types.js";
20
+ import { clamp } from "../utils/math.js";
21
+
/**
 * Mean `errors_encountered` over a set of sessions, counting a missing value as 0.
 * Returns `fallback` when the set is empty.
 */
function meanSessionErrors(records: SessionTelemetryRecord[], fallback = 0): number {
  if (records.length === 0) return fallback;
  const total = records.reduce((sum, r) => sum + (r.errors_encountered ?? 0), 0);
  return total / records.length;
}

/**
 * Extended composability analysis with synergy detection and sequence extraction.
 *
 * Pair scoring: for every skill that co-occurs with `skillName`, the average
 * error count of the shared sessions is compared against a solo baseline (the
 * higher of the two skills' solo averages). The resulting synergy score lies in
 * [-1, 1]; below -0.3 the pair is flagged as a conflict, above 0.3 with at
 * least `minOccurrences` shared sessions it is flagged as a workflow candidate.
 *
 * Sequence scoring: usage records are grouped per session, ordered by
 * timestamp, and collapsed so consecutive repeats of a skill count once. Only
 * orderings seen at least `minOccurrences` times are reported.
 *
 * @param skillName - The skill to analyze
 * @param telemetry - Session telemetry records; rows without a `skills_triggered` array are ignored
 * @param usage - Skill usage records (for timestamp-based ordering); null rows are ignored
 * @param options - `window`: keep only the N most recent sessions (omitted or
 *   non-positive means no windowing); `minOccurrences`: reporting threshold, default 3
 * @returns ComposabilityReportV2 with synergy pairs, sequences, and workflow candidates
 */
export function analyzeComposabilityV2(
  skillName: string,
  telemetry: SessionTelemetryRecord[],
  usage: SkillUsageRecord[],
  options?: {
    window?: number;
    minOccurrences?: number;
  },
): ComposabilityReportV2 {
  const minOccurrences = options?.minOccurrences ?? 3;

  // filter() yields a fresh array, so the in-place sort below never touches the caller's input.
  let sessions = telemetry.filter((r) => r && Array.isArray(r.skills_triggered));

  // Apply window: newest first, keep the N most recent sessions
  if (options?.window && options.window > 0) {
    sessions = sessions
      .sort((a, b) => (b.timestamp ?? "").localeCompare(a.timestamp ?? ""))
      .slice(0, options.window);
  }

  // Session IDs in scope (after windowing)
  const sessionIdSet = new Set(sessions.map((s) => s.session_id));

  // Sessions where the target skill was triggered
  const skillSessions = sessions.filter((r) => r.skills_triggered.includes(skillName));

  // Sessions where the target skill was the ONLY entry in skills_triggered
  const aloneSessions = skillSessions.filter((r) => r.skills_triggered.length === 1);

  // Average errors when the target skill is used alone (0 when there is no such session)
  const errorsAlone = meanSessionErrors(aloneSessions);

  // Every other skill seen in the same session as the target skill
  const coSkills = new Set<string>();
  for (const r of skillSessions) {
    for (const s of r.skills_triggered) {
      if (s !== skillName) coSkills.add(s);
    }
  }

  // -----------------------------------------------------------------------
  // Synergy computation for each co-occurring skill
  // -----------------------------------------------------------------------

  const pairs: CoOccurrencePairV2[] = [];
  for (const coSkill of coSkills) {
    // Sessions where BOTH skills are triggered together
    const togetherSessions = skillSessions.filter((r) => r.skills_triggered.includes(coSkill));
    const coOccurrenceCount = togetherSessions.length;
    const avgErrorsTogether = meanSessionErrors(togetherSessions);

    // Baseline: the higher of the two skills' solo error averages. When the
    // co-skill was never used alone, the target skill's solo average stands in.
    const coSkillAloneSessions = sessions.filter(
      (r) => r.skills_triggered.length === 1 && r.skills_triggered.includes(coSkill),
    );
    const errorsCoSkillAlone = meanSessionErrors(coSkillAloneSessions, errorsAlone);
    const avgErrorsAlone = Math.max(errorsAlone, errorsCoSkillAlone);

    // synergy_score = clamp((avg_errors_alone - avg_errors_together) / (avg_errors_alone + 1), -1, 1)
    // The +1 keeps the denominator non-zero when the baseline is 0.
    const synergyScore = clamp((avgErrorsAlone - avgErrorsTogether) / (avgErrorsAlone + 1), -1, 1);

    const conflictDetected = synergyScore < -0.3;
    const workflowCandidate = synergyScore > 0.3 && coOccurrenceCount >= minOccurrences;

    const pair: CoOccurrencePairV2 = {
      skill_a: skillName,
      skill_b: coSkill,
      co_occurrence_count: coOccurrenceCount,
      conflict_detected: conflictDetected,
      synergy_score: synergyScore,
      avg_errors_together: avgErrorsTogether,
      avg_errors_alone: avgErrorsAlone,
      workflow_candidate: workflowCandidate,
    };

    if (conflictDetected) {
      pair.conflict_reason = `synergy_score=${synergyScore.toFixed(3)} (avg errors together=${avgErrorsTogether.toFixed(1)} vs alone=${avgErrorsAlone.toFixed(1)})`;
    }

    pairs.push(pair);
  }

  // Sort by co-occurrence count descending
  pairs.sort((a, b) => b.co_occurrence_count - a.co_occurrence_count);

  // -----------------------------------------------------------------------
  // Sequence extraction from usage records
  // -----------------------------------------------------------------------

  // Group in-scope usage records by session_id, skipping null rows the same
  // way malformed telemetry rows are skipped above.
  const usageBySession = new Map<string, SkillUsageRecord[]>();
  for (const u of usage) {
    if (!u || !sessionIdSet.has(u.session_id)) continue;
    const group = usageBySession.get(u.session_id);
    if (group) {
      group.push(u);
    } else {
      usageBySession.set(u.session_id, [u]);
    }
  }

  // Build ordered sequences per session (only sessions containing target skill)
  const sessionSequences: Array<{ skills: string[]; sessionId: string; firstQuery: string }> = [];

  for (const [sessionId, records] of usageBySession) {
    if (!records.some((r) => r.skill_name === skillName)) continue;

    // Sort a copy by timestamp ascending
    const sorted = [...records].sort((a, b) =>
      (a.timestamp ?? "").localeCompare(b.timestamp ?? ""),
    );

    // Extract skill names, collapsing consecutive repeats of the same skill
    const skills: string[] = [];
    for (const r of sorted) {
      if (skills.length === 0 || skills[skills.length - 1] !== r.skill_name) {
        skills.push(r.skill_name);
      }
    }

    // Only record sequences with 2+ skills
    if (skills.length >= 2) {
      sessionSequences.push({
        skills,
        sessionId,
        firstQuery: sorted[0]?.query ?? "",
      });
    }
  }

  // Tally each exact ordering (keyed by its JSON form). The contributing
  // session IDs are kept alongside so scoring does not need to re-scan and
  // re-serialise every session sequence for every reported ordering.
  const sequenceCounts = new Map<
    string,
    { count: number; queryCounts: Map<string, number>; skills: string[]; sessionIds: string[] }
  >();
  // Tally each skill SET regardless of order (denominator for consistency).
  const skillSetCounts = new Map<string, number>();

  for (const seq of sessionSequences) {
    const key = JSON.stringify(seq.skills);
    const existing = sequenceCounts.get(key);
    if (existing) {
      existing.count++;
      existing.sessionIds.push(seq.sessionId);
      existing.queryCounts.set(seq.firstQuery, (existing.queryCounts.get(seq.firstQuery) ?? 0) + 1);
    } else {
      sequenceCounts.set(key, {
        count: 1,
        queryCounts: new Map([[seq.firstQuery, 1]]),
        skills: seq.skills,
        sessionIds: [seq.sessionId],
      });
    }

    const setKey = JSON.stringify([...seq.skills].sort());
    skillSetCounts.set(setKey, (skillSetCounts.get(setKey) ?? 0) + 1);
  }

  // Telemetry lookup by session_id for sequence scoring
  const telemetryBySession = new Map<string, SessionTelemetryRecord>();
  for (const s of sessions) {
    telemetryBySession.set(s.session_id, s);
  }

  // Build sequences, filtered by minOccurrences
  const sequences: SkillSequence[] = [];
  for (const data of sequenceCounts.values()) {
    if (data.count < minOccurrences) continue;

    // Telemetry of the sessions that exhibited exactly this ordering
    const matchingSessions = data.sessionIds
      .map((id) => telemetryBySession.get(id))
      .filter((s): s is SessionTelemetryRecord => s !== undefined);

    const seqErrorsTogether = meanSessionErrors(matchingSessions);

    // Sequences are scored against the target skill's own solo baseline.
    const seqSynergyScore = clamp((errorsAlone - seqErrorsTogether) / (errorsAlone + 1), -1, 1);

    // Consistency: count of this exact order / count of all orderings of same skill set
    const setKey = JSON.stringify([...data.skills].sort());
    const totalOrderings = skillSetCounts.get(setKey) ?? data.count;
    const sequenceConsistency = totalOrderings > 0 ? data.count / totalOrderings : 1;

    // Representative query: the most frequent opening query (first seen wins ties)
    let representativeQuery = "";
    let highestFrequency = -1;
    for (const [query, frequency] of data.queryCounts) {
      if (frequency > highestFrequency) {
        representativeQuery = query;
        highestFrequency = frequency;
      }
    }

    sequences.push({
      skills: data.skills,
      occurrence_count: data.count,
      synergy_score: seqSynergyScore,
      representative_query: representativeQuery,
      sequence_consistency: sequenceConsistency,
    });
  }

  // Sort sequences by occurrence_count descending
  sequences.sort((a, b) => b.occurrence_count - a.occurrence_count);

  // -----------------------------------------------------------------------
  // Assemble report
  // -----------------------------------------------------------------------

  const workflowCandidates = pairs.filter((p) => p.workflow_candidate);
  const synergyCount = pairs.filter((p) => p.synergy_score > 0.3).length;

  return {
    pairs,
    sequences,
    workflow_candidates: workflowCandidates,
    synergy_count: synergyCount,
    total_sessions_analyzed: skillSessions.length,
    conflict_count: pairs.filter((p) => p.conflict_detected).length,
    generated_at: new Date().toISOString(),
  };
}
@@ -0,0 +1,117 @@
1
+ /**
2
+ * composability.ts
3
+ *
4
+ * Analyzes co-occurrence patterns between skills in session telemetry
5
+ * to detect composability conflicts. A conflict is flagged when two
6
+ * skills used together produce more errors than either skill used alone.
7
+ *
8
+ * Pure function -- no I/O. CLI wrapper handles reading JSONL.
9
+ */
10
+
11
+ import type { ComposabilityReport, CoOccurrencePair, SessionTelemetryRecord } from "../types.js";
12
+
13
+ /**
14
+ * Clamp a number between min and max.
15
+ */
16
+ function clamp(value: number, min: number, max: number): number {
17
+ return Math.max(min, Math.min(max, value));
18
+ }
19
+
20
+ /**
21
+ * Analyze composability of a target skill against all co-occurring skills.
22
+ *
23
+ * @param skillName - The skill to analyze
24
+ * @param telemetry - All session telemetry records
25
+ * @param window - Optional: only consider the last N sessions (by timestamp)
26
+ * @returns ComposabilityReport with co-occurrence pairs and conflict detection
27
+ */
28
+ export function analyzeComposability(
29
+ skillName: string,
30
+ telemetry: SessionTelemetryRecord[],
31
+ window?: number,
32
+ ): ComposabilityReport {
33
+ // Apply window: sort by timestamp descending, take last N
34
+ let sessions = telemetry.filter((r) => r && Array.isArray(r.skills_triggered));
35
+
36
+ if (window && window > 0) {
37
+ sessions = sessions
38
+ .sort((a, b) => (b.timestamp ?? "").localeCompare(a.timestamp ?? ""))
39
+ .slice(0, window);
40
+ }
41
+
42
+ // Sessions where the target skill was triggered
43
+ const skillSessions = sessions.filter((r) => r.skills_triggered.includes(skillName));
44
+
45
+ // Sessions where the target skill was triggered ALONE (no other skills)
46
+ const aloneSessions = skillSessions.filter((r) => r.skills_triggered.length === 1);
47
+
48
+ // Average errors when skill is used alone
49
+ const errorsAlone =
50
+ aloneSessions.length > 0
51
+ ? aloneSessions.reduce((sum, r) => sum + (r.errors_encountered ?? 0), 0) /
52
+ aloneSessions.length
53
+ : 0;
54
+
55
+ // Find all co-occurring skills
56
+ const coSkills = new Set<string>();
57
+ for (const r of skillSessions) {
58
+ for (const s of r.skills_triggered) {
59
+ if (s !== skillName) coSkills.add(s);
60
+ }
61
+ }
62
+
63
+ // For each co-occurring skill, compute conflict score
64
+ const pairs: CoOccurrencePair[] = [];
65
+ for (const coSkill of coSkills) {
66
+ // Sessions where BOTH skills are triggered together
67
+ const togetherSessions = skillSessions.filter((r) => r.skills_triggered.includes(coSkill));
68
+
69
+ const coOccurrenceCount = togetherSessions.length;
70
+
71
+ // Average errors when both skills are used together
72
+ const errorsTogether =
73
+ togetherSessions.length > 0
74
+ ? togetherSessions.reduce((sum, r) => sum + (r.errors_encountered ?? 0), 0) /
75
+ togetherSessions.length
76
+ : 0;
77
+
78
+ // Baseline should consider BOTH skills alone to avoid false positives
79
+ const coSkillAloneSessions = sessions.filter(
80
+ (r) => r.skills_triggered.includes(coSkill) && !r.skills_triggered.includes(skillName),
81
+ );
82
+ const errorsCoSkillAlone =
83
+ coSkillAloneSessions.length > 0
84
+ ? coSkillAloneSessions.reduce((sum, r) => sum + (r.errors_encountered ?? 0), 0) /
85
+ coSkillAloneSessions.length
86
+ : errorsAlone;
87
+ const baselineAlone = Math.max(errorsAlone, errorsCoSkillAlone);
88
+
89
+ // conflict_score = clamp((errors_together - baseline) / (baseline + 1), 0, 1)
90
+ const conflictScore = clamp((errorsTogether - baselineAlone) / (baselineAlone + 1), 0, 1);
91
+
92
+ const conflictDetected = conflictScore > 0.3;
93
+
94
+ const pair: CoOccurrencePair = {
95
+ skill_a: skillName,
96
+ skill_b: coSkill,
97
+ co_occurrence_count: coOccurrenceCount,
98
+ conflict_detected: conflictDetected,
99
+ };
100
+
101
+ if (conflictDetected) {
102
+ pair.conflict_reason = `conflict_score=${conflictScore.toFixed(3)} (avg errors together=${errorsTogether.toFixed(1)} vs alone=${errorsAlone.toFixed(1)})`;
103
+ }
104
+
105
+ pairs.push(pair);
106
+ }
107
+
108
+ // Sort by co-occurrence count descending for readability
109
+ pairs.sort((a, b) => b.co_occurrence_count - a.co_occurrence_count);
110
+
111
+ return {
112
+ pairs,
113
+ total_sessions_analyzed: skillSessions.length,
114
+ conflict_count: pairs.filter((p) => p.conflict_detected).length,
115
+ generated_at: new Date().toISOString(),
116
+ };
117
+ }
@@ -0,0 +1,143 @@
1
+ /**
2
+ * Skill unit test generator.
3
+ *
4
+ * Uses an LLM to generate unit test cases from skill content and eval failures.
5
+ * Tests are output as SkillUnitTest[] JSON arrays.
6
+ */
7
+
8
+ import type { EvalEntry, SkillUnitTest } from "../types.js";
9
+
10
+ // Note: we don't use stripMarkdownFences from llm-call.ts because it
11
+ // assumes JSON objects (looks for `{`), but we return JSON arrays.
12
+
/**
 * Strip markdown fences and surrounding chatter, returning the JSON array text.
 *
 * Steps:
 *  1. Trim, and if the whole reply is a single fenced code block, unwrap it
 *     (LF or CRLF line endings).
 *  2. If the text contains `[`, keep the span from the first `[` through the
 *     last `]`, discarding any preamble or trailing commentary (including a
 *     closing fence left over from a partially fenced reply).
 *
 * When there is no `[` the (unwrapped) text is returned unchanged; when there
 * is no `]` after the first `[` everything from that `[` onward is returned.
 * Either way the caller's `JSON.parse` reports the real problem.
 */
function stripArrayFences(raw: string): string {
  let text = raw.trim();

  // Unwrap a reply that is entirely one fenced block
  const fenced = text.match(/^```[\w+-]*[ \t]*\r?\n([\s\S]*?)\r?\n```$/);
  const inner = fenced?.[1];
  if (inner !== undefined) {
    text = inner.trim();
  }

  // Find first [ in case there's preamble text
  const start = text.indexOf("[");
  if (start < 0) {
    return text;
  }

  // Cut after the last ] in case there's trailing text
  const end = text.lastIndexOf("]");
  return end > start ? text.slice(start, end + 1) : text.slice(start);
}
31
+
32
+ // ---------------------------------------------------------------------------
33
+ // LLM caller type (injectable for testing)
34
+ // ---------------------------------------------------------------------------
35
+
/**
 * Signature of the LLM call used for test generation: takes a system prompt
 * and a user prompt and resolves with the model's raw text reply. Passed in
 * as an argument so a stub can be supplied in tests.
 */
export type LlmCaller = (
  systemPrompt: string,
  userPrompt: string,
) => Promise<string>;
37
+
38
+ // ---------------------------------------------------------------------------
39
+ // Prompt building
40
+ // ---------------------------------------------------------------------------
41
+
42
+ const SYSTEM_PROMPT = `You are a test engineer generating skill unit tests.
43
+ Given a skill name, its content/description, and optionally some eval failures,
44
+ generate unit test cases as a JSON array of objects.
45
+
46
+ Each test object must have:
47
+ - id: unique string (e.g. "gen-1", "gen-2")
48
+ - skill_name: the skill name provided
49
+ - query: a user query that would test this skill
50
+ - assertions: array of assertion objects, each with:
51
+ - type: one of "contains", "not_contains", "regex", "tool_called", "tool_not_called", "json_path"
52
+ - value: the value to check for
53
+ - description: (optional) human-readable description of what this checks
54
+ - tags: (optional) array of tag strings like ["generated", "smoke"]
55
+
56
+ Focus on:
57
+ 1. Covering different invocation patterns (explicit, implicit, contextual)
58
+ 2. Testing edge cases from eval failures if provided
59
+ 3. Verifying expected tools are called
60
+ 4. Checking output contains expected content
61
+
62
+ Respond with ONLY a JSON array. No explanation.`;
63
+
/**
 * Build the user prompt for test generation.
 *
 * Layout, one item per line: the skill name, the skill content, an optional
 * list of eval failures (one line each), an example test case rendered as
 * pretty-printed JSON, and a closing instruction.
 *
 * @param skillName - Name inserted in the header line and in the example test case
 * @param skillContent - Skill text included verbatim
 * @param evalFailures - Failed eval entries; the failure section is omitted when empty
 * @returns The assembled prompt, lines joined with "\n"
 */
export function buildGenerationPrompt(
  skillName: string,
  skillContent: string,
  evalFailures: EvalEntry[],
): string {
  const parts: string[] = [`Skill name: ${skillName}`, "", "Skill content:", skillContent, ""];

  if (evalFailures.length > 0) {
    parts.push("Eval failures (queries that failed trigger checks):");
    for (const f of evalFailures) {
      // JSON.stringify quotes and escapes the query, so embedded quotes or
      // newlines cannot break this one-failure-per-line list. Plain queries
      // render exactly as "query".
      parts.push(
        ` - query: ${JSON.stringify(f.query)} (should_trigger=${f.should_trigger}, type=${f.invocation_type ?? "unknown"})`,
      );
    }
    parts.push("");
  }

  parts.push("Example test case format:");
  parts.push(
    JSON.stringify(
      [
        {
          id: "example-1",
          skill_name: skillName,
          query: "example query for this skill",
          assertions: [
            {
              type: "contains",
              value: "expected output",
              description: "checks for expected content",
            },
            { type: "tool_called", value: "Write", description: "verifies Write tool was used" },
          ],
          tags: ["generated"],
        },
      ],
      null,
      2,
    ),
  );

  parts.push("");
  parts.push("Generate 5-10 diverse test cases covering the skill's functionality.");

  return parts.join("\n");
}
111
+
112
+ // ---------------------------------------------------------------------------
113
+ // Generate unit tests
114
+ // ---------------------------------------------------------------------------
115
+
/**
 * Generate unit tests for a skill using an LLM.
 *
 * Builds the prompt, calls `llmCaller`, extracts the JSON array from the reply
 * and fills in `skill_name` on any test that lacks one. Array elements that
 * are not plain objects are dropped (with one warning) instead of aborting
 * the whole batch or being spread into meaningless objects.
 *
 * @param skillName - Skill the tests are for; used as the default `skill_name`
 * @param skillContent - Skill text passed to the prompt builder
 * @param evalFailures - Failed eval entries passed to the prompt builder
 * @param llmCaller - Function that performs the LLM call
 * @returns The generated tests; an empty array when the call fails, the reply
 *   is not valid JSON, or the reply is not an array (a warning is logged)
 */
export async function generateUnitTests(
  skillName: string,
  skillContent: string,
  evalFailures: EvalEntry[],
  llmCaller: LlmCaller,
): Promise<SkillUnitTest[]> {
  try {
    const userPrompt = buildGenerationPrompt(skillName, skillContent, evalFailures);
    const raw = await llmCaller(SYSTEM_PROMPT, userPrompt);
    const cleaned = stripArrayFences(raw);

    const parsed: unknown = JSON.parse(cleaned);
    if (!Array.isArray(parsed)) {
      console.warn("[WARN] LLM did not return a JSON array for unit test generation");
      return [];
    }

    const tests: SkillUnitTest[] = [];
    let skipped = 0;
    for (const entry of parsed as unknown[]) {
      if (typeof entry !== "object" || entry === null || Array.isArray(entry)) {
        skipped++;
        continue;
      }
      // Shape beyond "is an object" is not verified here; the fields come
      // straight from the LLM reply.
      const candidate = entry as SkillUnitTest;
      tests.push({
        ...candidate,
        // Keep the LLM-provided skill_name when present, otherwise default it
        skill_name: candidate.skill_name || skillName,
      });
    }

    if (skipped > 0) {
      console.warn(`[WARN] Skipped ${skipped} non-object entries in generated unit tests`);
    }

    return tests;
  } catch (err) {
    console.warn("[WARN] Failed to generate unit tests:", err);
    return [];
  }
}