triflux 10.9.21 → 10.9.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +34 -0
- package/.claude-plugin/plugin.json +22 -0
- package/config/mcp-registry.json +29 -0
- package/hub/account-broker.mjs +6 -4
- package/hub/cli-adapter-base.mjs +14 -14
- package/hub/lib/env-detect.mjs +47 -20
- package/hub/server.mjs +17 -15
- package/hub/team/headless.mjs +10 -0
- package/hub/team/swarm-hypervisor.mjs +2 -2
- package/hub/workers/delegator-mcp.mjs +129 -1
- package/hud/constants.mjs +24 -13
- package/hud/renderers.mjs +2 -1
- package/package.json +62 -21
- package/scripts/__tests__/keyword-detector.test.mjs +4 -4
- package/scripts/__tests__/release-governance.test.mjs +148 -0
- package/scripts/doctor-diagnose.mjs +6 -7
- package/scripts/lib/cross-review-utils.mjs +2 -2
- package/scripts/lib/mcp-filter.mjs +12 -24
- package/scripts/release/bump-version.mjs +77 -0
- package/scripts/release/check-sync.mjs +51 -0
- package/scripts/release/lib.mjs +303 -0
- package/scripts/release/prepare.mjs +85 -0
- package/scripts/release/publish.mjs +87 -0
- package/scripts/release/verify.mjs +81 -0
- package/scripts/release/version-manifest.json +26 -0
- package/scripts/remote-spawn.mjs +3 -3
- package/scripts/setup.mjs +18 -15
- package/scripts/tfx-route.sh +64 -8
- package/tui/codex-profile.mjs +457 -0
- package/tui/core.mjs +266 -0
- package/tui/doctor.mjs +375 -0
- package/tui/gemini-profile.mjs +299 -0
- package/tui/monitor-data.mjs +152 -0
- package/tui/monitor.mjs +339 -0
- package/tui/setup.mjs +598 -0
- package/CLAUDE.md +0 -212
- package/references/hosts.json +0 -46
- package/skills/tfx-workspace/async-tests/run-tests.sh +0 -203
- package/skills/tfx-workspace/evals/evals.json +0 -79
- package/skills/tfx-workspace/iteration-1/benchmark.json +0 -524
- package/skills/tfx-workspace/iteration-1/codex-gemini-remap/eval_metadata.json +0 -11
- package/skills/tfx-workspace/iteration-1/codex-gemini-remap/old_skill/grading.json +0 -25
- package/skills/tfx-workspace/iteration-1/codex-gemini-remap/old_skill/outputs/analysis.md +0 -154
- package/skills/tfx-workspace/iteration-1/codex-gemini-remap/old_skill/timing.json +0 -5
- package/skills/tfx-workspace/iteration-1/codex-gemini-remap/with_skill/grading.json +0 -25
- package/skills/tfx-workspace/iteration-1/codex-gemini-remap/with_skill/outputs/analysis.md +0 -126
- package/skills/tfx-workspace/iteration-1/codex-gemini-remap/with_skill/timing.json +0 -5
- package/skills/tfx-workspace/iteration-1/doctor-diagnosis/eval_metadata.json +0 -11
- package/skills/tfx-workspace/iteration-1/doctor-diagnosis/old_skill/grading.json +0 -25
- package/skills/tfx-workspace/iteration-1/doctor-diagnosis/old_skill/outputs/analysis.md +0 -119
- package/skills/tfx-workspace/iteration-1/doctor-diagnosis/old_skill/timing.json +0 -5
- package/skills/tfx-workspace/iteration-1/doctor-diagnosis/with_skill/grading.json +0 -25
- package/skills/tfx-workspace/iteration-1/doctor-diagnosis/with_skill/outputs/analysis.md +0 -115
- package/skills/tfx-workspace/iteration-1/doctor-diagnosis/with_skill/timing.json +0 -5
- package/skills/tfx-workspace/iteration-1/hub-start-sequence/eval_metadata.json +0 -10
- package/skills/tfx-workspace/iteration-1/hub-start-sequence/old_skill/grading.json +0 -20
- package/skills/tfx-workspace/iteration-1/hub-start-sequence/old_skill/outputs/analysis.md +0 -86
- package/skills/tfx-workspace/iteration-1/hub-start-sequence/old_skill/timing.json +0 -5
- package/skills/tfx-workspace/iteration-1/hub-start-sequence/with_skill/grading.json +0 -20
- package/skills/tfx-workspace/iteration-1/hub-start-sequence/with_skill/outputs/analysis.md +0 -81
- package/skills/tfx-workspace/iteration-1/hub-start-sequence/with_skill/timing.json +0 -5
- package/skills/tfx-workspace/iteration-1/multi-team-creation/eval_metadata.json +0 -12
- package/skills/tfx-workspace/iteration-1/multi-team-creation/old_skill/grading.json +0 -30
- package/skills/tfx-workspace/iteration-1/multi-team-creation/old_skill/outputs/analysis.md +0 -316
- package/skills/tfx-workspace/iteration-1/multi-team-creation/old_skill/timing.json +0 -5
- package/skills/tfx-workspace/iteration-1/multi-team-creation/with_skill/grading.json +0 -30
- package/skills/tfx-workspace/iteration-1/multi-team-creation/with_skill/outputs/analysis.md +0 -352
- package/skills/tfx-workspace/iteration-1/multi-team-creation/with_skill/timing.json +0 -5
- package/skills/tfx-workspace/iteration-1/review.html +0 -1325
- package/skills/tfx-workspace/iteration-1/routing-implement-shortcut/eval_metadata.json +0 -12
- package/skills/tfx-workspace/iteration-1/routing-implement-shortcut/old_skill/grading.json +0 -30
- package/skills/tfx-workspace/iteration-1/routing-implement-shortcut/old_skill/outputs/analysis.md +0 -97
- package/skills/tfx-workspace/iteration-1/routing-implement-shortcut/old_skill/timing.json +0 -5
- package/skills/tfx-workspace/iteration-1/routing-implement-shortcut/with_skill/grading.json +0 -30
- package/skills/tfx-workspace/iteration-1/routing-implement-shortcut/with_skill/outputs/analysis.md +0 -94
- package/skills/tfx-workspace/iteration-1/routing-implement-shortcut/with_skill/timing.json +0 -5
- package/skills/tfx-workspace/iteration-1/routing-multi-task-triage/eval_metadata.json +0 -12
- package/skills/tfx-workspace/iteration-1/routing-multi-task-triage/old_skill/grading.json +0 -30
- package/skills/tfx-workspace/iteration-1/routing-multi-task-triage/old_skill/outputs/analysis.md +0 -209
- package/skills/tfx-workspace/iteration-1/routing-multi-task-triage/old_skill/timing.json +0 -5
- package/skills/tfx-workspace/iteration-1/routing-multi-task-triage/with_skill/grading.json +0 -30
- package/skills/tfx-workspace/iteration-1/routing-multi-task-triage/with_skill/outputs/analysis.md +0 -193
- package/skills/tfx-workspace/iteration-1/routing-multi-task-triage/with_skill/timing.json +0 -5
- package/skills/tfx-workspace/iteration-2/benchmark.json +0 -144
- package/skills/tfx-workspace/iteration-2/multi-team-creation-refactored/eval_metadata.json +0 -13
- package/skills/tfx-workspace/iteration-2/multi-team-creation-refactored/old_skill/grading.json +0 -35
- package/skills/tfx-workspace/iteration-2/multi-team-creation-refactored/old_skill/outputs/analysis.md +0 -382
- package/skills/tfx-workspace/iteration-2/multi-team-creation-refactored/old_skill/timing.json +0 -5
- package/skills/tfx-workspace/iteration-2/multi-team-creation-refactored/with_skill/grading.json +0 -35
- package/skills/tfx-workspace/iteration-2/multi-team-creation-refactored/with_skill/outputs/analysis.md +0 -333
- package/skills/tfx-workspace/iteration-2/multi-team-creation-refactored/with_skill/timing.json +0 -5
- package/skills/tfx-workspace/iteration-2/review.html +0 -1325
- package/skills/tfx-workspace/skill-snapshot/tfx-auto/SKILL.md +0 -217
- package/skills/tfx-workspace/skill-snapshot/tfx-auto-codex/SKILL.md +0 -77
- package/skills/tfx-workspace/skill-snapshot/tfx-codex/SKILL.md +0 -65
- package/skills/tfx-workspace/skill-snapshot/tfx-doctor/SKILL.md +0 -94
- package/skills/tfx-workspace/skill-snapshot/tfx-gemini/SKILL.md +0 -82
- package/skills/tfx-workspace/skill-snapshot/tfx-hub/SKILL.md +0 -133
- package/skills/tfx-workspace/skill-snapshot/tfx-multi/SKILL.md +0 -426
- package/skills/tfx-workspace/skill-snapshot/tfx-setup/SKILL.md +0 -101
|
@@ -1,524 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"metadata": {
|
|
3
|
-
"skill_name": "tfx-skills-suite",
|
|
4
|
-
"skill_path": "C:/Users/SSAFY/Desktop/Projects/cli/triflux/skills",
|
|
5
|
-
"executor_model": "claude-sonnet-4-6",
|
|
6
|
-
"analyzer_model": "claude-opus-4-6",
|
|
7
|
-
"timestamp": "2026-03-19T10:00:00Z",
|
|
8
|
-
"evals_run": [1, 2, 3, 4, 5, 6],
|
|
9
|
-
"runs_per_configuration": 1
|
|
10
|
-
},
|
|
11
|
-
"runs": [
|
|
12
|
-
{
|
|
13
|
-
"eval_id": 1,
|
|
14
|
-
"eval_name": "routing-implement-shortcut",
|
|
15
|
-
"configuration": "with_skill",
|
|
16
|
-
"run_number": 1,
|
|
17
|
-
"result": {
|
|
18
|
-
"pass_rate": 1.0,
|
|
19
|
-
"passed": 5,
|
|
20
|
-
"failed": 0,
|
|
21
|
-
"total": 5,
|
|
22
|
-
"time_seconds": 43.6,
|
|
23
|
-
"tokens": 16303,
|
|
24
|
-
"tool_calls": 4,
|
|
25
|
-
"errors": 0
|
|
26
|
-
},
|
|
27
|
-
"expectations": [
|
|
28
|
-
{
|
|
29
|
-
"text": "Routes to executor agent",
|
|
30
|
-
"passed": true,
|
|
31
|
-
"evidence": "Correctly mapped from implement shortcut table"
|
|
32
|
-
},
|
|
33
|
-
{
|
|
34
|
-
"text": "Uses implement MCP profile",
|
|
35
|
-
"passed": true,
|
|
36
|
-
"evidence": "Mapped from shortcut table"
|
|
37
|
-
},
|
|
38
|
-
{
|
|
39
|
-
"text": "Generates correct tfx-route.sh command",
|
|
40
|
-
"passed": true,
|
|
41
|
-
"evidence": "bash ~/.claude/scripts/tfx-route.sh executor '...' implement"
|
|
42
|
-
},
|
|
43
|
-
{
|
|
44
|
-
"text": "Does NOT trigger triage",
|
|
45
|
-
"passed": true,
|
|
46
|
-
"evidence": "Command shortcut skips triage"
|
|
47
|
-
},
|
|
48
|
-
{
|
|
49
|
-
"text": "Does NOT delegate to tfx-multi",
|
|
50
|
-
"passed": true,
|
|
51
|
-
"evidence": "No subtask decomposition occurred"
|
|
52
|
-
}
|
|
53
|
-
]
|
|
54
|
-
},
|
|
55
|
-
{
|
|
56
|
-
"eval_id": 1,
|
|
57
|
-
"eval_name": "routing-implement-shortcut",
|
|
58
|
-
"configuration": "without_skill",
|
|
59
|
-
"run_number": 1,
|
|
60
|
-
"result": {
|
|
61
|
-
"pass_rate": 1.0,
|
|
62
|
-
"passed": 5,
|
|
63
|
-
"failed": 0,
|
|
64
|
-
"total": 5,
|
|
65
|
-
"time_seconds": 48.1,
|
|
66
|
-
"tokens": 16436,
|
|
67
|
-
"tool_calls": 4,
|
|
68
|
-
"errors": 0
|
|
69
|
-
},
|
|
70
|
-
"expectations": [
|
|
71
|
-
{
|
|
72
|
-
"text": "Routes to executor agent",
|
|
73
|
-
"passed": true,
|
|
74
|
-
"evidence": "Correctly mapped"
|
|
75
|
-
},
|
|
76
|
-
{
|
|
77
|
-
"text": "Uses implement MCP profile",
|
|
78
|
-
"passed": true,
|
|
79
|
-
"evidence": "Assigned by shortcut table"
|
|
80
|
-
},
|
|
81
|
-
{
|
|
82
|
-
"text": "Generates correct tfx-route.sh command",
|
|
83
|
-
"passed": true,
|
|
84
|
-
"evidence": "Correct syntax generated"
|
|
85
|
-
},
|
|
86
|
-
{
|
|
87
|
-
"text": "Does NOT trigger triage",
|
|
88
|
-
"passed": true,
|
|
89
|
-
"evidence": "Shortcut mode skips triage"
|
|
90
|
-
},
|
|
91
|
-
{
|
|
92
|
-
"text": "Does NOT delegate to tfx-multi",
|
|
93
|
-
"passed": true,
|
|
94
|
-
"evidence": "No delegation"
|
|
95
|
-
}
|
|
96
|
-
]
|
|
97
|
-
},
|
|
98
|
-
{
|
|
99
|
-
"eval_id": 2,
|
|
100
|
-
"eval_name": "routing-multi-task-triage",
|
|
101
|
-
"configuration": "with_skill",
|
|
102
|
-
"run_number": 1,
|
|
103
|
-
"result": {
|
|
104
|
-
"pass_rate": 1.0,
|
|
105
|
-
"passed": 5,
|
|
106
|
-
"failed": 0,
|
|
107
|
-
"total": 5,
|
|
108
|
-
"time_seconds": 58.2,
|
|
109
|
-
"tokens": 17584,
|
|
110
|
-
"tool_calls": 3,
|
|
111
|
-
"errors": 0
|
|
112
|
-
},
|
|
113
|
-
"expectations": [
|
|
114
|
-
{
|
|
115
|
-
"text": "Identifies as auto mode",
|
|
116
|
-
"passed": true,
|
|
117
|
-
"evidence": "No shortcut match, auto mode selected"
|
|
118
|
-
},
|
|
119
|
-
{
|
|
120
|
-
"text": "Triggers Codex classification",
|
|
121
|
-
"passed": true,
|
|
122
|
-
"evidence": "Codex --full-auto classification triggered"
|
|
123
|
-
},
|
|
124
|
-
{
|
|
125
|
-
"text": "Decomposes into 2+ subtasks",
|
|
126
|
-
"passed": true,
|
|
127
|
-
"evidence": "2 subtasks: executor + security-reviewer"
|
|
128
|
-
},
|
|
129
|
-
{
|
|
130
|
-
"text": "Notes tfx-multi delegation",
|
|
131
|
-
"passed": true,
|
|
132
|
-
"evidence": "subtasks.length >= 2 triggers tfx-multi Phase 3"
|
|
133
|
-
},
|
|
134
|
-
{
|
|
135
|
-
"text": "Does NOT execute directly",
|
|
136
|
-
"passed": true,
|
|
137
|
-
"evidence": "Delegates to tfx-multi"
|
|
138
|
-
}
|
|
139
|
-
]
|
|
140
|
-
},
|
|
141
|
-
{
|
|
142
|
-
"eval_id": 2,
|
|
143
|
-
"eval_name": "routing-multi-task-triage",
|
|
144
|
-
"configuration": "without_skill",
|
|
145
|
-
"run_number": 1,
|
|
146
|
-
"result": {
|
|
147
|
-
"pass_rate": 1.0,
|
|
148
|
-
"passed": 5,
|
|
149
|
-
"failed": 0,
|
|
150
|
-
"total": 5,
|
|
151
|
-
"time_seconds": 77.2,
|
|
152
|
-
"tokens": 18626,
|
|
153
|
-
"tool_calls": 4,
|
|
154
|
-
"errors": 0
|
|
155
|
-
},
|
|
156
|
-
"expectations": [
|
|
157
|
-
{
|
|
158
|
-
"text": "Identifies as auto mode",
|
|
159
|
-
"passed": true,
|
|
160
|
-
"evidence": "Auto mode selected"
|
|
161
|
-
},
|
|
162
|
-
{
|
|
163
|
-
"text": "Triggers Codex classification",
|
|
164
|
-
"passed": true,
|
|
165
|
-
"evidence": "Codex --full-auto triggered"
|
|
166
|
-
},
|
|
167
|
-
{
|
|
168
|
-
"text": "Decomposes into 2+ subtasks",
|
|
169
|
-
"passed": true,
|
|
170
|
-
"evidence": "2 subtasks decomposed"
|
|
171
|
-
},
|
|
172
|
-
{
|
|
173
|
-
"text": "Notes tfx-multi delegation",
|
|
174
|
-
"passed": true,
|
|
175
|
-
"evidence": "Hands off to tfx-multi Phase 3"
|
|
176
|
-
},
|
|
177
|
-
{
|
|
178
|
-
"text": "Does NOT execute directly",
|
|
179
|
-
"passed": true,
|
|
180
|
-
"evidence": "Delegates correctly"
|
|
181
|
-
}
|
|
182
|
-
]
|
|
183
|
-
},
|
|
184
|
-
{
|
|
185
|
-
"eval_id": 3,
|
|
186
|
-
"eval_name": "multi-team-creation",
|
|
187
|
-
"configuration": "with_skill",
|
|
188
|
-
"run_number": 1,
|
|
189
|
-
"result": {
|
|
190
|
-
"pass_rate": 1.0,
|
|
191
|
-
"passed": 5,
|
|
192
|
-
"failed": 0,
|
|
193
|
-
"total": 5,
|
|
194
|
-
"time_seconds": 115.3,
|
|
195
|
-
"tokens": 27197,
|
|
196
|
-
"tool_calls": 3,
|
|
197
|
-
"errors": 0
|
|
198
|
-
},
|
|
199
|
-
"expectations": [
|
|
200
|
-
{
|
|
201
|
-
"text": "Creates TeamCreate with tfx- prefix",
|
|
202
|
-
"passed": true,
|
|
203
|
-
"evidence": "TeamCreate({ team_name: 'tfx-<base36>' })"
|
|
204
|
-
},
|
|
205
|
-
{
|
|
206
|
-
"text": "Creates 3 TaskCreate calls",
|
|
207
|
-
"passed": true,
|
|
208
|
-
"evidence": "3x TaskCreate with metadata"
|
|
209
|
-
},
|
|
210
|
-
{
|
|
211
|
-
"text": "Spawns 3 Agent wrappers with bypassPermissions",
|
|
212
|
-
"passed": true,
|
|
213
|
-
"evidence": "3x Agent({ mode: bypassPermissions })"
|
|
214
|
-
},
|
|
215
|
-
{
|
|
216
|
-
"text": "Uses tfx-route.sh inside wrappers",
|
|
217
|
-
"passed": true,
|
|
218
|
-
"evidence": "Direct codex/gemini calls prohibited"
|
|
219
|
-
},
|
|
220
|
-
{
|
|
221
|
-
"text": "Includes Phase 5 TeamDelete",
|
|
222
|
-
"passed": true,
|
|
223
|
-
"evidence": "TeamDelete always runs, max 30s wait"
|
|
224
|
-
}
|
|
225
|
-
]
|
|
226
|
-
},
|
|
227
|
-
{
|
|
228
|
-
"eval_id": 3,
|
|
229
|
-
"eval_name": "multi-team-creation",
|
|
230
|
-
"configuration": "without_skill",
|
|
231
|
-
"run_number": 1,
|
|
232
|
-
"result": {
|
|
233
|
-
"pass_rate": 1.0,
|
|
234
|
-
"passed": 5,
|
|
235
|
-
"failed": 0,
|
|
236
|
-
"total": 5,
|
|
237
|
-
"time_seconds": 100.6,
|
|
238
|
-
"tokens": 26140,
|
|
239
|
-
"tool_calls": 3,
|
|
240
|
-
"errors": 0
|
|
241
|
-
},
|
|
242
|
-
"expectations": [
|
|
243
|
-
{
|
|
244
|
-
"text": "Creates TeamCreate with tfx- prefix",
|
|
245
|
-
"passed": true,
|
|
246
|
-
"evidence": "TeamCreate with tfx-<id>"
|
|
247
|
-
},
|
|
248
|
-
{
|
|
249
|
-
"text": "Creates 3 TaskCreate calls",
|
|
250
|
-
"passed": true,
|
|
251
|
-
"evidence": "Three TaskCreate calls"
|
|
252
|
-
},
|
|
253
|
-
{
|
|
254
|
-
"text": "Spawns 3 Agent wrappers with bypassPermissions",
|
|
255
|
-
"passed": true,
|
|
256
|
-
"evidence": "mode: bypassPermissions in all 3"
|
|
257
|
-
},
|
|
258
|
-
{
|
|
259
|
-
"text": "Uses tfx-route.sh inside wrappers",
|
|
260
|
-
"passed": true,
|
|
261
|
-
"evidence": "Never direct codex/gemini calls"
|
|
262
|
-
},
|
|
263
|
-
{
|
|
264
|
-
"text": "Includes Phase 5 TeamDelete",
|
|
265
|
-
"passed": true,
|
|
266
|
-
"evidence": "TeamDelete unconditionally"
|
|
267
|
-
}
|
|
268
|
-
]
|
|
269
|
-
},
|
|
270
|
-
{
|
|
271
|
-
"eval_id": 4,
|
|
272
|
-
"eval_name": "doctor-diagnosis",
|
|
273
|
-
"configuration": "with_skill",
|
|
274
|
-
"run_number": 1,
|
|
275
|
-
"result": {
|
|
276
|
-
"pass_rate": 1.0,
|
|
277
|
-
"passed": 4,
|
|
278
|
-
"failed": 0,
|
|
279
|
-
"total": 4,
|
|
280
|
-
"time_seconds": 53.8,
|
|
281
|
-
"tokens": 14499,
|
|
282
|
-
"tool_calls": 4,
|
|
283
|
-
"errors": 0
|
|
284
|
-
},
|
|
285
|
-
"expectations": [
|
|
286
|
-
{
|
|
287
|
-
"text": "Runs triflux doctor first",
|
|
288
|
-
"passed": true,
|
|
289
|
-
"evidence": "Bash(\"triflux doctor\")"
|
|
290
|
-
},
|
|
291
|
-
{
|
|
292
|
-
"text": "Suggests --fix mode",
|
|
293
|
-
"passed": true,
|
|
294
|
-
"evidence": "Suggests after diagnosis report"
|
|
295
|
-
},
|
|
296
|
-
{
|
|
297
|
-
"text": "Mentions HUD and CLI checks",
|
|
298
|
-
"passed": true,
|
|
299
|
-
"evidence": "HUD and CLI paths checked"
|
|
300
|
-
},
|
|
301
|
-
{
|
|
302
|
-
"text": "Does NOT jump to --reset",
|
|
303
|
-
"passed": true,
|
|
304
|
-
"evidence": "--reset reserved for explicit request"
|
|
305
|
-
}
|
|
306
|
-
]
|
|
307
|
-
},
|
|
308
|
-
{
|
|
309
|
-
"eval_id": 4,
|
|
310
|
-
"eval_name": "doctor-diagnosis",
|
|
311
|
-
"configuration": "without_skill",
|
|
312
|
-
"run_number": 1,
|
|
313
|
-
"result": {
|
|
314
|
-
"pass_rate": 1.0,
|
|
315
|
-
"passed": 4,
|
|
316
|
-
"failed": 0,
|
|
317
|
-
"total": 4,
|
|
318
|
-
"time_seconds": 48.3,
|
|
319
|
-
"tokens": 14482,
|
|
320
|
-
"tool_calls": 3,
|
|
321
|
-
"errors": 0
|
|
322
|
-
},
|
|
323
|
-
"expectations": [
|
|
324
|
-
{
|
|
325
|
-
"text": "Runs triflux doctor first",
|
|
326
|
-
"passed": true,
|
|
327
|
-
"evidence": "Bash(\"triflux doctor\")"
|
|
328
|
-
},
|
|
329
|
-
{
|
|
330
|
-
"text": "Suggests --fix mode",
|
|
331
|
-
"passed": true,
|
|
332
|
-
"evidence": "Offers --fix after diagnosis"
|
|
333
|
-
},
|
|
334
|
-
{
|
|
335
|
-
"text": "Mentions HUD and CLI checks",
|
|
336
|
-
"passed": true,
|
|
337
|
-
"evidence": "All 8 diagnostics listed"
|
|
338
|
-
},
|
|
339
|
-
{
|
|
340
|
-
"text": "Does NOT jump to --reset",
|
|
341
|
-
"passed": true,
|
|
342
|
-
"evidence": "--reset reserved for explicit request"
|
|
343
|
-
}
|
|
344
|
-
]
|
|
345
|
-
},
|
|
346
|
-
{
|
|
347
|
-
"eval_id": 5,
|
|
348
|
-
"eval_name": "hub-start-sequence",
|
|
349
|
-
"configuration": "with_skill",
|
|
350
|
-
"run_number": 1,
|
|
351
|
-
"result": {
|
|
352
|
-
"pass_rate": 1.0,
|
|
353
|
-
"passed": 3,
|
|
354
|
-
"failed": 0,
|
|
355
|
-
"total": 3,
|
|
356
|
-
"time_seconds": 47.2,
|
|
357
|
-
"tokens": 14821,
|
|
358
|
-
"tool_calls": 4,
|
|
359
|
-
"errors": 0
|
|
360
|
-
},
|
|
361
|
-
"expectations": [
|
|
362
|
-
{
|
|
363
|
-
"text": "Runs node hub/server.mjs in background",
|
|
364
|
-
"passed": true,
|
|
365
|
-
"evidence": "Bash(\"node hub/server.mjs\", run_in_background=true)"
|
|
366
|
-
},
|
|
367
|
-
{
|
|
368
|
-
"text": "Mentions port 27888 and /mcp",
|
|
369
|
-
"passed": true,
|
|
370
|
-
"evidence": "Port 27888, http://127.0.0.1:27888/mcp"
|
|
371
|
-
},
|
|
372
|
-
{
|
|
373
|
-
"text": "No triage or routing attempted",
|
|
374
|
-
"passed": true,
|
|
375
|
-
"evidence": "Command match, not fallthrough"
|
|
376
|
-
}
|
|
377
|
-
]
|
|
378
|
-
},
|
|
379
|
-
{
|
|
380
|
-
"eval_id": 5,
|
|
381
|
-
"eval_name": "hub-start-sequence",
|
|
382
|
-
"configuration": "without_skill",
|
|
383
|
-
"run_number": 1,
|
|
384
|
-
"result": {
|
|
385
|
-
"pass_rate": 1.0,
|
|
386
|
-
"passed": 3,
|
|
387
|
-
"failed": 0,
|
|
388
|
-
"total": 3,
|
|
389
|
-
"time_seconds": 51.8,
|
|
390
|
-
"tokens": 14904,
|
|
391
|
-
"tool_calls": 4,
|
|
392
|
-
"errors": 0
|
|
393
|
-
},
|
|
394
|
-
"expectations": [
|
|
395
|
-
{
|
|
396
|
-
"text": "Runs node hub/server.mjs in background",
|
|
397
|
-
"passed": true,
|
|
398
|
-
"evidence": "Bash(\"node hub/server.mjs\", run_in_background=true)"
|
|
399
|
-
},
|
|
400
|
-
{
|
|
401
|
-
"text": "Mentions port 27888 and /mcp",
|
|
402
|
-
"passed": true,
|
|
403
|
-
"evidence": "Port 27888, endpoint /mcp"
|
|
404
|
-
},
|
|
405
|
-
{
|
|
406
|
-
"text": "No triage or routing attempted",
|
|
407
|
-
"passed": true,
|
|
408
|
-
"evidence": "Command match, not fallthrough"
|
|
409
|
-
}
|
|
410
|
-
]
|
|
411
|
-
},
|
|
412
|
-
{
|
|
413
|
-
"eval_id": 6,
|
|
414
|
-
"eval_name": "codex-gemini-remap",
|
|
415
|
-
"configuration": "with_skill",
|
|
416
|
-
"run_number": 1,
|
|
417
|
-
"result": {
|
|
418
|
-
"pass_rate": 1.0,
|
|
419
|
-
"passed": 4,
|
|
420
|
-
"failed": 0,
|
|
421
|
-
"total": 4,
|
|
422
|
-
"time_seconds": 69.7,
|
|
423
|
-
"tokens": 14889,
|
|
424
|
-
"tool_calls": 5,
|
|
425
|
-
"errors": 0
|
|
426
|
-
},
|
|
427
|
-
"expectations": [
|
|
428
|
-
{
|
|
429
|
-
"text": "designer remapped to Codex (effort: high)",
|
|
430
|
-
"passed": true,
|
|
431
|
-
"evidence": "designer → Codex (effort: high)"
|
|
432
|
-
},
|
|
433
|
-
{
|
|
434
|
-
"text": "writer remapped to Codex Spark (spark_fast)",
|
|
435
|
-
"passed": true,
|
|
436
|
-
"evidence": "writer → Codex Spark (effort: spark_fast)"
|
|
437
|
-
},
|
|
438
|
-
{
|
|
439
|
-
"text": "TFX_CLI_MODE=codex set",
|
|
440
|
-
"passed": true,
|
|
441
|
-
"evidence": "Set for every Phase 3 call"
|
|
442
|
-
},
|
|
443
|
-
{
|
|
444
|
-
"text": "MCP profiles changed",
|
|
445
|
-
"passed": true,
|
|
446
|
-
"evidence": "designer→implement, writer→analyze"
|
|
447
|
-
}
|
|
448
|
-
]
|
|
449
|
-
},
|
|
450
|
-
{
|
|
451
|
-
"eval_id": 6,
|
|
452
|
-
"eval_name": "codex-gemini-remap",
|
|
453
|
-
"configuration": "without_skill",
|
|
454
|
-
"run_number": 1,
|
|
455
|
-
"result": {
|
|
456
|
-
"pass_rate": 1.0,
|
|
457
|
-
"passed": 4,
|
|
458
|
-
"failed": 0,
|
|
459
|
-
"total": 4,
|
|
460
|
-
"time_seconds": 85.2,
|
|
461
|
-
"tokens": 19802,
|
|
462
|
-
"tool_calls": 7,
|
|
463
|
-
"errors": 0
|
|
464
|
-
},
|
|
465
|
-
"expectations": [
|
|
466
|
-
{
|
|
467
|
-
"text": "designer remapped to Codex (effort: high)",
|
|
468
|
-
"passed": true,
|
|
469
|
-
"evidence": "designer → Codex (effort: high)"
|
|
470
|
-
},
|
|
471
|
-
{
|
|
472
|
-
"text": "writer remapped to Codex Spark (spark_fast)",
|
|
473
|
-
"passed": true,
|
|
474
|
-
"evidence": "writer → Codex Spark (effort: spark_fast)"
|
|
475
|
-
},
|
|
476
|
-
{
|
|
477
|
-
"text": "TFX_CLI_MODE=codex set",
|
|
478
|
-
"passed": true,
|
|
479
|
-
"evidence": "TFX_CLI_MODE set to codex"
|
|
480
|
-
},
|
|
481
|
-
{
|
|
482
|
-
"text": "MCP profiles changed",
|
|
483
|
-
"passed": true,
|
|
484
|
-
"evidence": "writer→analyze, designer→implement"
|
|
485
|
-
}
|
|
486
|
-
]
|
|
487
|
-
}
|
|
488
|
-
],
|
|
489
|
-
"run_summary": {
|
|
490
|
-
"with_skill": {
|
|
491
|
-
"pass_rate": { "mean": 1.0, "stddev": 0.0, "min": 1.0, "max": 1.0 },
|
|
492
|
-
"time_seconds": {
|
|
493
|
-
"mean": 64.6,
|
|
494
|
-
"stddev": 26.4,
|
|
495
|
-
"min": 43.6,
|
|
496
|
-
"max": 115.3
|
|
497
|
-
},
|
|
498
|
-
"tokens": { "mean": 17549, "stddev": 4857, "min": 14499, "max": 27197 }
|
|
499
|
-
},
|
|
500
|
-
"without_skill": {
|
|
501
|
-
"pass_rate": { "mean": 1.0, "stddev": 0.0, "min": 1.0, "max": 1.0 },
|
|
502
|
-
"time_seconds": {
|
|
503
|
-
"mean": 68.5,
|
|
504
|
-
"stddev": 20.4,
|
|
505
|
-
"min": 48.1,
|
|
506
|
-
"max": 100.6
|
|
507
|
-
},
|
|
508
|
-
"tokens": { "mean": 18398, "stddev": 4227, "min": 14482, "max": 26140 }
|
|
509
|
-
},
|
|
510
|
-
"delta": {
|
|
511
|
-
"pass_rate": "+0.00",
|
|
512
|
-
"time_seconds": "-3.9",
|
|
513
|
-
"tokens": "-849"
|
|
514
|
-
}
|
|
515
|
-
},
|
|
516
|
-
"notes": [
|
|
517
|
-
"All 26 assertions pass at 100% for both configurations — the skills are functionally correct",
|
|
518
|
-
"The fixes applied (dead reference removal, Phase numbering consistency, hub description) don't change routing logic, so pass rates are identical",
|
|
519
|
-
"NEW version is marginally faster (-3.9s avg) and uses fewer tokens (-849 avg), likely due to cleaner references reducing model confusion",
|
|
520
|
-
"tfx-multi is the most complex skill (115s / 27K tokens with_skill) — consider extracting reference docs to reduce context load",
|
|
521
|
-
"tfx-codex OLD references 'Phase(1~6)' which doesn't exist in tfx-auto — the NEW version correctly references the actual workflow names",
|
|
522
|
-
"All assertions pass regardless of configuration — these test the core routing logic which is unchanged. Consider adding assertions that specifically test the fixed issues (dead refs, phase naming) for differentiation"
|
|
523
|
-
]
|
|
524
|
-
}
|
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"eval_id": 6,
|
|
3
|
-
"eval_name": "codex-gemini-remap",
|
|
4
|
-
"prompt": "/tfx-codex API 문서를 작성하고 디자인 가이드도 만들어줘",
|
|
5
|
-
"assertions": [
|
|
6
|
-
"designer remapped to Codex with effort: high",
|
|
7
|
-
"writer remapped to Codex Spark with effort: spark_fast",
|
|
8
|
-
"Sets TFX_CLI_MODE=codex environment variable",
|
|
9
|
-
"Changes MCP profile: designer->implement, writer->analyze"
|
|
10
|
-
]
|
|
11
|
-
}
|
|
@@ -1,25 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"expectations": [
|
|
3
|
-
{
|
|
4
|
-
"text": "designer remapped to Codex with effort: high",
|
|
5
|
-
"passed": true,
|
|
6
|
-
"evidence": "Agent output: designer → Codex (effort: high)"
|
|
7
|
-
},
|
|
8
|
-
{
|
|
9
|
-
"text": "writer remapped to Codex Spark with effort: spark_fast",
|
|
10
|
-
"passed": true,
|
|
11
|
-
"evidence": "Agent output: writer → Codex Spark (effort: spark_fast)"
|
|
12
|
-
},
|
|
13
|
-
{
|
|
14
|
-
"text": "Sets TFX_CLI_MODE=codex environment variable",
|
|
15
|
-
"passed": true,
|
|
16
|
-
"evidence": "Agent output: 'TFX_CLI_MODE: Set to codex'"
|
|
17
|
-
},
|
|
18
|
-
{
|
|
19
|
-
"text": "Changes MCP profile: designer->implement, writer->analyze",
|
|
20
|
-
"passed": true,
|
|
21
|
-
"evidence": "Agent output: writer→analyze, designer→implement"
|
|
22
|
-
}
|
|
23
|
-
],
|
|
24
|
-
"summary": { "passed": 4, "failed": 0, "total": 4, "pass_rate": 1.0 }
|
|
25
|
-
}
|