workflow-ai 1.0.65 → 1.0.67

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. package/README.md +377 -371
  2. package/configs/agent-health-rules.yaml +12 -1
  3. package/configs/pipeline.yaml +6 -6
  4. package/package.json +1 -1
  5. package/src/lib/agent-spawner.mjs +47 -6
  6. package/src/lib/error-classifier.mjs +311 -274
  7. package/src/runner.mjs +241 -58
  8. package/src/skills/coach/tests/cases/TC-COACH-001/current/meta.json +93 -93
  9. package/src/skills/coach/tests/cases/TC-COACH-002/current/meta.json +93 -93
  10. package/src/skills/create-plan/SKILL.md +1 -0
  11. package/src/skills/create-plan/knowledge/test-hygiene.md +47 -0
  12. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-005/current/meta.json +113 -113
  13. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/meta.json +87 -87
  14. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/meta.json +87 -87
  15. package/src/skills/review-result/SKILL.md +1 -0
  16. package/src/skills/review-result/knowledge/test-hygiene.md +44 -0
  17. package/src/skills/review-result/scripts/verify-artifacts.js +115 -2
  18. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-003/current/claude-sonnet/trial-1.md +7 -0
  19. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-003/current/claude-sonnet/trial-2.md +7 -0
  20. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-003/current/claude-sonnet/trial-3.md +7 -0
  21. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-003/current/judge.json +163 -0
  22. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-003/current/kilo-deepseek/trial-1.md +5 -0
  23. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-003/current/kilo-deepseek/trial-2.md +5 -0
  24. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-003/current/kilo-deepseek/trial-3.md +11 -0
  25. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-003/current/kilo-glm/trial-1.md +16 -0
  26. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-003/current/kilo-glm/trial-2.md +18 -0
  27. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-003/current/kilo-glm/trial-3.md +17 -0
  28. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-003/current/kilo-minimax/trial-1.md +17 -0
  29. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-003/current/kilo-minimax/trial-2.md +31 -0
  30. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-003/current/kilo-minimax/trial-3.md +5 -0
  31. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-003/current/meta.json +115 -0
  32. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-003-test-isolation.yaml +50 -0
  33. package/src/skills/review-result/tests/fixtures/QA-904-test-isolation-violation/QA-904.md +51 -0
  34. package/src/skills/review-result/tests/fixtures/QA-904-test-isolation-violation/example-test.mjs +36 -0
  35. package/src/skills/review-result/tests/index.yaml +5 -0
  36. package/src/skills/review-result/tests/rubrics/test-isolation.md +20 -0
package/src/runner.mjs CHANGED
@@ -6,6 +6,9 @@ import { spawn, execSync } from 'child_process';
6
6
  import crypto from 'crypto';
7
7
  import yaml from './lib/js-yaml.mjs';
8
8
  import { findProjectRoot } from './lib/find-root.mjs';
9
+ import { loadRules, scanStderrForFatalRule, classify } from './lib/error-classifier.mjs';
10
+ import { snapshot, diff, isEmpty } from './lib/artifact-snapshot.mjs';
11
+ import { markUnhealthy, isHealthy } from './lib/agent-health-registry.mjs';
9
12
 
10
13
  // ============================================================================
11
14
  // Logger — система логирования с уровнями DEBUG/INFO/WARN/ERROR
@@ -846,6 +849,26 @@ class StageExecutor {
846
849
 
847
850
  // Текущий дочерний процесс агента (для kill при shutdown)
848
851
  this.currentChild = null;
852
+
853
+ // Правила health-классификатора (инициализируются один раз в конструкторе)
854
+ this.rules = loadRules(projectRoot);
855
+
856
+ // Лениво загружаемые правила health-классификатора для онлайн-сканирования stderr
857
+ this._healthRules = null;
858
+ }
859
+
860
+ /** Возвращает правила health-классификатора, загружая их при первом обращении. */
861
+ _getHealthRules() {
862
+ if (this._healthRules !== null) return this._healthRules;
863
+ try {
864
+ this._healthRules = loadRules(this.projectRoot);
865
+ } catch (e) {
866
+ if (this.logger) {
867
+ this.logger.warn(`Failed to load agent-health-rules: ${e.message}`, 'CLI');
868
+ }
869
+ this._healthRules = { common: [], agents: new Map() };
870
+ }
871
+ return this._healthRules;
849
872
  }
850
873
 
851
874
  /**
@@ -875,8 +898,13 @@ class StageExecutor {
875
898
  * 4. Скрипт-агенты (stage.agent: script-*) обрабатываются в отдельной ветке
876
899
  * execute() — сюда не попадают.
877
900
  */
878
- resolveAgent(stage, stageId) {
879
- const attempt = (stage.counter && this.counters[stage.counter]) || 1;
901
+ resolveAgent(stage, stageId, options = {}) {
902
+ const excludeAgents = options.excludeAgents || [];
903
+ // Семантика: counter = число УЖЕ ИСЧЕРПАННЫХ попыток (0 на старте, инкрементируется
904
+ // стадией `increment-*-attempts` ПОСЛЕ каждой неудачи). attempt — номер текущей
905
+ // (1-based). Читаем counter через ?? 0, чтобы отличать «ещё не запускались»
906
+ // от «была 1 попытка» — иначе оффсет-by-one и ротация застревает на первом агенте.
907
+ const attempt = (stage.counter ? (this.counters[stage.counter] ?? 0) : 0) + 1;
880
908
 
881
909
  // Task type: явно из context либо из префикса ticket_id
882
910
  const taskType = this.context.task_type
@@ -921,9 +949,28 @@ class StageExecutor {
921
949
  const caps = Array.isArray(agent.capabilities) ? agent.capabilities : [];
922
950
  return required.every(r => caps.includes(r));
923
951
  };
924
- const compatible = agentIds.filter(covers);
925
-
926
- if (compatible.length === 0) {
952
+ const afterCapabilities = agentIds.filter(covers);
953
+
954
+ // Фильтр по health-реестру: unhealthy-агенты с неистёкшим TTL пропускаются.
955
+ // Реестр персистентный между attempt'ами (план rev.3, решение 6.5).
956
+ const afterHealth = afterCapabilities.filter(id => isHealthy(this.projectRoot, id));
957
+
958
+ // Фильтр по excludeAgents (для in-stage fallback в рамках одной attempt)
959
+ const afterExclude = excludeAgents.length > 0
960
+ ? afterHealth.filter(id => !excludeAgents.includes(id))
961
+ : afterHealth;
962
+
963
+ if (afterExclude.length === 0) {
964
+ // Все capability-совместимые агенты либо unhealthy в реестре, либо уже пробованы в этой attempt.
965
+ if (afterCapabilities.length > 0) {
966
+ return {
967
+ blocked: 'all_unhealthy',
968
+ reason: excludeAgents.length > 0
969
+ ? `All agents tried in fallback`
970
+ : `All capable agents are unhealthy in registry`,
971
+ attempt
972
+ };
973
+ }
927
974
  return {
928
975
  blocked: 'no_capable_agent',
929
976
  reason: `No agent in [${agentIds.join(', ')}] covers required_capabilities [${required.join(', ')}]`,
@@ -932,12 +979,111 @@ class StageExecutor {
932
979
  }
933
980
 
934
981
  // Курсор = (attempt - 1) % length — ротация по кругу
935
- const cursor = (attempt - 1) % compatible.length;
982
+ const cursor = (attempt - 1) % afterExclude.length;
936
983
 
937
- const agentId = compatible[cursor];
984
+ const agentId = afterExclude[cursor];
938
985
  // Клонируем stage с подменой instructions (для agents_by_type override)
939
986
  const effectiveStage = { ...stage, instructions };
940
- return { agentId, effectiveStage, attempt, compatible };
987
+ return { agentId, effectiveStage, attempt, compatible: afterExclude };
988
+ }
989
+
990
+ /**
991
+ * Выполняет stage с fallback-логикой: при пустом artifact diff делает retry с другим агентом.
992
+ * @param {string} stageId - ID stage из конфигурации
993
+ * @param {object} [stageOverride] - явный stage (для тестов и промежуточных вызовов); по умолчанию берётся из pipeline.stages
994
+ * @returns {Promise<{status: string, output: string, result?: object}>}
995
+ */
996
+ async executeWithFallback(stageId, stageOverride) {
997
+ const stage = stageOverride ?? this.pipeline.stages[stageId];
998
+ if (!stage) {
999
+ throw new Error(`Stage not found: ${stageId}`);
1000
+ }
1001
+
1002
+ const triedInThisAttempt = [];
1003
+ let lastErr = null;
1004
+
1005
+ const snapshotEnabled = this.pipeline.execution?.artifact_snapshot_enabled !== false;
1006
+ const snapshotOpts = {
1007
+ includePaths: this.pipeline.execution?.snapshot_paths ?? ['src', 'configs'],
1008
+ snapshotMaxFileSize: this.pipeline.execution?.snapshot_max_file_size ?? 524288,
1009
+ };
1010
+
1011
+ while (true) {
1012
+ const resolved = this.resolveAgent(stage, stageId, { excludeAgents: triedInThisAttempt });
1013
+
1014
+ if (resolved.blocked) {
1015
+ // all_unhealthy после исчерпания списка в текущей attempt (lastErr есть) —
1016
+ // re-throw, чтобы стадия ушла в goto.error и inc-counter. Без lastErr —
1017
+ // первая итерация while, агентов сразу нет (persistence из прошлой attempt)
1018
+ // → возвращаем blocked, чтобы конфиг мог развести goto.blocked vs goto.error.
1019
+ if (resolved.blocked === 'all_unhealthy' && lastErr) {
1020
+ throw lastErr;
1021
+ }
1022
+ return { status: 'blocked', blocked_reason: resolved.blocked, reason: resolved.reason };
1023
+ }
1024
+
1025
+ const { agentId, effectiveStage } = resolved;
1026
+ const agent = this.pipeline.agents[agentId];
1027
+ const prompt = this.promptBuilder.build(effectiveStage, stageId);
1028
+
1029
+ const before = snapshotEnabled ? await snapshot(this.projectRoot, snapshotOpts) : null;
1030
+
1031
+ try {
1032
+ if (this.logger) {
1033
+ this.logger.info(
1034
+ `Agent selected: ${agentId} (attempt ${resolved.attempt}, compatible=[${resolved.compatible.join(', ')}])`,
1035
+ stageId
1036
+ );
1037
+ this.logger.stageStart(stageId, agentId, effectiveStage.skill);
1038
+ }
1039
+
1040
+ const result = await this.callAgent(agent, prompt, stageId, effectiveStage.skill, agentId);
1041
+
1042
+ if (this.logger) this.logger.stageComplete(stageId, result.status, result.exitCode);
1043
+ return result;
1044
+ } catch (err) {
1045
+ if (!err.exitCode && !err.code) throw err;
1046
+
1047
+ const exitCode = err.exitCode ?? err.code;
1048
+ const stderr = err.stderr || '';
1049
+
1050
+ const after = snapshotEnabled ? await snapshot(this.projectRoot, snapshotOpts) : null;
1051
+ const diffResult = snapshotEnabled ? diff(before, after) : null;
1052
+ const diffEmpty = snapshotEnabled && isEmpty(diffResult);
1053
+
1054
+ const classification = await classify(this.rules, agentId, { exitCode, stderr });
1055
+ if (classification) {
1056
+ markUnhealthy(this.projectRoot, agentId, classification);
1057
+ if (this.logger) {
1058
+ this.logger.info(
1059
+ `agent ${agentId} marked unhealthy: class=${classification.class}, excluded (fallback triggered)`,
1060
+ stageId
1061
+ );
1062
+ }
1063
+ }
1064
+
1065
+ if (!diffEmpty) {
1066
+ const changedPaths = diffResult ? Object.keys(diffResult).join(', ') : 'unknown';
1067
+ if (this.logger) {
1068
+ this.logger.warn(
1069
+ `agent ${agentId} exited ${exitCode}, artifacts modified [${changedPaths}] — fallback blocked`,
1070
+ stageId
1071
+ );
1072
+ }
1073
+ throw err;
1074
+ }
1075
+
1076
+ if (this.logger) {
1077
+ this.logger.info(
1078
+ `agent ${agentId} exited ${exitCode}, artifact diff empty — falling back in-stage (class=${classification?.class ?? 'unmatched'})`,
1079
+ stageId
1080
+ );
1081
+ }
1082
+
1083
+ triedInThisAttempt.push(agentId);
1084
+ lastErr = err;
1085
+ }
1086
+ }
941
1087
  }
942
1088
 
943
1089
  /**
@@ -961,7 +1107,7 @@ class StageExecutor {
961
1107
  const skipGuard = this.fileGuard && this.fileGuard.isTrusted(stage.agent, stageId);
962
1108
  if (this.fileGuard && !skipGuard) this.fileGuard.takeSnapshot();
963
1109
 
964
- const result = await this.callAgent(agent, prompt, stageId, stage.skill);
1110
+ const result = await this.callAgent(agent, prompt, stageId, stage.skill, stage.agent);
965
1111
 
966
1112
  if (this.logger) this.logger.stageComplete(stageId, result.status, result.exitCode);
967
1113
  if (this.fileGuard && !skipGuard) {
@@ -971,56 +1117,20 @@ class StageExecutor {
971
1117
  return result;
972
1118
  }
973
1119
 
974
- // Новая ветка: список кандидатов с фильтром по capabilities
975
- const resolved = this.resolveAgent(stage, stageId);
976
- if (resolved.blocked) {
977
- if (this.logger) {
978
- this.logger.error(
979
- `Stage "${stageId}" blocked: ${resolved.blocked} — ${resolved.reason}`,
980
- stageId
981
- );
982
- }
983
- return {
984
- status: 'blocked',
985
- blocked_reason: resolved.blocked,
986
- output: resolved.reason,
987
- result: { blocked: resolved.blocked, reason: resolved.reason },
988
- exitCode: 0,
989
- parsed: false
990
- };
991
- }
992
-
993
- const { agentId, effectiveStage } = resolved;
994
- const agent = this.pipeline.agents[agentId];
995
- const prompt = this.promptBuilder.build(effectiveStage, stageId);
996
-
997
- if (this.logger) {
998
- this.logger.info(
999
- `Agent selected: ${agentId} (attempt ${resolved.attempt}, compatible=[${resolved.compatible.join(', ')}])`,
1000
- stageId
1001
- );
1002
- this.logger.stageStart(stageId, agentId, effectiveStage.skill);
1003
- }
1004
-
1005
- const skipGuard = this.fileGuard && this.fileGuard.isTrusted(agentId, stageId);
1006
- if (this.fileGuard && !skipGuard) this.fileGuard.takeSnapshot();
1007
-
1008
- const result = await this.callAgent(agent, prompt, stageId, effectiveStage.skill);
1009
-
1010
- if (this.logger) this.logger.stageComplete(stageId, result.status, result.exitCode);
1011
- if (this.fileGuard && !skipGuard) {
1012
- const violations = this.fileGuard.checkAndRollback();
1013
- if (violations.length > 0) result.violations = violations;
1014
- }
1015
- return result;
1120
+ // Новая ветка: список кандидатов с фильтром по capabilities → executeWithFallback
1121
+ return this.executeWithFallback(stageId);
1016
1122
  }
1017
1123
 
1018
1124
  /**
1019
1125
  * Вызывает CLI-агента через child_process
1020
1126
  */
1021
- callAgent(agent, prompt, stageId, skillId) {
1127
+ callAgent(agent, prompt, stageId, skillId, agentId = null) {
1022
1128
  return new Promise((resolve, reject) => {
1023
1129
  const timeout = this.pipeline.execution?.timeout_per_stage || 300;
1130
+ const healthRules = agentId ? this._getHealthRules() : null;
1131
+ const hasAgentRules = Boolean(
1132
+ healthRules && agentId && healthRules.agents.get(agentId)?.length
1133
+ );
1024
1134
  const args = [...agent.args];
1025
1135
  const finalPrompt = prompt;
1026
1136
 
@@ -1065,16 +1175,23 @@ class StageExecutor {
1065
1175
  let stdout = '';
1066
1176
  let stderr = '';
1067
1177
  let timedOut = false;
1178
+ let earlyKilled = false;
1179
+ let earlyKillRule = null;
1180
+ let lastScanSize = 0;
1068
1181
 
1069
- // Таймаут
1070
- const timeoutId = setTimeout(() => {
1071
- timedOut = true;
1072
- // На Windows SIGTERM игнорируется — используем taskkill /T /F для убийства дерева
1182
+ const killChild = () => {
1073
1183
  if (process.platform === 'win32' && child.pid) {
1074
1184
  try { execSync(`taskkill /pid ${child.pid} /T /F`, { stdio: 'pipe' }); } catch {}
1075
1185
  } else {
1076
- child.kill('SIGTERM');
1186
+ try { child.kill('SIGTERM'); } catch {}
1077
1187
  }
1188
+ };
1189
+
1190
+ // Таймаут
1191
+ const timeoutId = setTimeout(() => {
1192
+ timedOut = true;
1193
+ // На Windows SIGTERM игнорируется — используем taskkill /T /F для убийства дерева
1194
+ killChild();
1078
1195
  if (this.logger) {
1079
1196
  this.logger.timeout(stageId, timeout);
1080
1197
  }
@@ -1120,6 +1237,36 @@ class StageExecutor {
1120
1237
  child.stderr.on('data', (data) => {
1121
1238
  stderr += data.toString();
1122
1239
  process.stderr.write(data);
1240
+
1241
+ // Online-детекция фатальных паттернов (quota/429/usage-limit и т.п.).
1242
+ // Нужна чтобы не ждать timeout_per_stage (1800s), когда агентский CLI
1243
+ // уходит в молчаливый retry-цикл после HTTP 429.
1244
+ if (!hasAgentRules || earlyKilled || timedOut) return;
1245
+ // Throttle: первый скан всегда, последующие — только после 200+ новых байт.
1246
+ if (lastScanSize > 0 && stderr.length - lastScanSize < 200) return;
1247
+ lastScanSize = stderr.length;
1248
+ const match = scanStderrForFatalRule(healthRules, agentId, stderr);
1249
+ if (!match) return;
1250
+
1251
+ earlyKilled = true;
1252
+ earlyKillRule = match;
1253
+ clearTimeout(timeoutId);
1254
+ if (this.logger) {
1255
+ this.logger.error(
1256
+ `Fatal stderr pattern matched for ${agentId} (rule=${match.rule_id}, class=${match.class}). Killing process.`,
1257
+ stageId
1258
+ );
1259
+ }
1260
+ killChild();
1261
+ const err = new Error(
1262
+ `Agent "${agentId}" killed early: ${match.rule_id} (class=${match.class})`
1263
+ );
1264
+ err.code = 'EARLY_KILL';
1265
+ err.exitCode = -1;
1266
+ err.stderr = stderr;
1267
+ err.earlyKill = true;
1268
+ err.rule = match;
1269
+ reject(err);
1123
1270
  });
1124
1271
 
1125
1272
  child.on('close', (code) => {
@@ -1139,6 +1286,16 @@ class StageExecutor {
1139
1286
  process.stdout.write('\n');
1140
1287
 
1141
1288
  if (timedOut) return;
1289
+ if (earlyKilled) {
1290
+ if (this.logger && stderr.trim()) {
1291
+ this.logger.warn(`STDERR ↓`, stageId);
1292
+ for (const line of stderr.trim().split('\n')) {
1293
+ this.logger.warn(` ${line}`, stageId);
1294
+ }
1295
+ this.logger.warn(`STDERR ↑`, stageId);
1296
+ }
1297
+ return;
1298
+ }
1142
1299
 
1143
1300
  // Логгируем CLI вызов
1144
1301
  if (this.logger) {
@@ -1193,6 +1350,32 @@ class StageExecutor {
1193
1350
  return;
1194
1351
  }
1195
1352
 
1353
+ // Детекция silent-failure: CLI-агент (kilo и т.п.) auto-rejected permission-
1354
+ // запросы, exit=0, структурированного RESULT нет. Без этой проверки pipeline
1355
+ // получает status=default и идёт дальше, а стейдж фактически не выполнен
1356
+ // (см. incident 2026-04-22: create-report/analyze-report в PulseProxy).
1357
+ // Маппим в ошибку, чтобы executeWithFallback переключился на следующего агента.
1358
+ if (code === 0 && !result.parsed && stderr) {
1359
+ const rejectMatches = stderr.match(/(?:auto-rejecting|rejected permission|permission denied)/gi) || [];
1360
+ if (rejectMatches.length > 0) {
1361
+ const err = new Error(
1362
+ `Agent "${agentId}" exited 0 but auto-rejected ${rejectMatches.length} permission request(s) and produced no RESULT`
1363
+ );
1364
+ err.code = 'PERMISSION_REJECTED';
1365
+ err.exitCode = -1;
1366
+ err.stderr = stderr;
1367
+ err.rejectCount = rejectMatches.length;
1368
+ if (this.logger) {
1369
+ this.logger.error(
1370
+ `Agent "${agentId}" silent-failure: ${rejectMatches.length} auto-rejected permission(s), no RESULT — mapping to status=error`,
1371
+ stageId
1372
+ );
1373
+ }
1374
+ reject(err);
1375
+ return;
1376
+ }
1377
+ }
1378
+
1196
1379
  resolve({
1197
1380
  status: result.status || 'default',
1198
1381
  output: stdout,
@@ -1205,7 +1388,7 @@ class StageExecutor {
1205
1388
 
1206
1389
  child.on('error', (err) => {
1207
1390
  clearTimeout(timeoutId);
1208
- if (!timedOut) {
1391
+ if (!timedOut && !earlyKilled) {
1209
1392
  if (this.logger) {
1210
1393
  this.logger.error(`CLI error: ${err.message}`, stageId);
1211
1394
  }
@@ -1,94 +1,94 @@
1
- {
2
- "date": "2026-04-21T08:57:32.459Z",
3
- "skill_sha": "6df42d0",
4
- "status": "passed",
5
- "duration_ms": 1,
6
- "per_model": {
7
- "claude-sonnet": {
8
- "passed": true,
9
- "pass_count": 3,
10
- "total": 3,
11
- "threshold": 2
12
- },
13
- "kilo-deepseek": {
14
- "passed": true,
15
- "pass_count": 2,
16
- "total": 3,
17
- "threshold": 2
18
- },
19
- "kilo-minimax": {
20
- "passed": true,
21
- "pass_count": 2,
22
- "total": 3,
23
- "threshold": 2
24
- },
25
- "kilo-glm": {
26
- "passed": true,
27
- "pass_count": 3,
28
- "total": 3,
29
- "threshold": 2
30
- }
31
- },
32
- "rubric_scores": [
33
- {
34
- "agentId": "claude-sonnet",
35
- "trial": 1,
36
- "score": 5
37
- },
38
- {
39
- "agentId": "claude-sonnet",
40
- "trial": 2,
41
- "score": 5
42
- },
43
- {
44
- "agentId": "claude-sonnet",
45
- "trial": 3,
46
- "score": 5
47
- },
48
- {
49
- "agentId": "kilo-deepseek",
50
- "trial": 1,
51
- "score": 5
52
- },
53
- {
54
- "agentId": "kilo-deepseek",
55
- "trial": 2,
56
- "score": 1
57
- },
58
- {
59
- "agentId": "kilo-deepseek",
60
- "trial": 3,
61
- "score": 5
62
- },
63
- {
64
- "agentId": "kilo-glm",
65
- "trial": 1,
66
- "score": 5
67
- },
68
- {
69
- "agentId": "kilo-glm",
70
- "trial": 2,
71
- "score": 5
72
- },
73
- {
74
- "agentId": "kilo-glm",
75
- "trial": 3,
76
- "score": 5
77
- },
78
- {
79
- "agentId": "kilo-minimax",
80
- "trial": 1,
81
- "score": 3
82
- },
83
- {
84
- "agentId": "kilo-minimax",
85
- "trial": 2,
86
- "score": 5
87
- },
88
- {
89
- "agentId": "kilo-minimax",
90
- "trial": 3,
91
- "score": 5
92
- }
93
- ]
1
+ {
2
+ "date": "2026-04-23T08:08:11.031Z",
3
+ "skill_sha": "6df42d0",
4
+ "status": "passed",
5
+ "duration_ms": 2,
6
+ "per_model": {
7
+ "claude-sonnet": {
8
+ "passed": true,
9
+ "pass_count": 3,
10
+ "total": 3,
11
+ "threshold": 2
12
+ },
13
+ "kilo-deepseek": {
14
+ "passed": true,
15
+ "pass_count": 2,
16
+ "total": 3,
17
+ "threshold": 2
18
+ },
19
+ "kilo-minimax": {
20
+ "passed": true,
21
+ "pass_count": 2,
22
+ "total": 3,
23
+ "threshold": 2
24
+ },
25
+ "kilo-glm": {
26
+ "passed": true,
27
+ "pass_count": 3,
28
+ "total": 3,
29
+ "threshold": 2
30
+ }
31
+ },
32
+ "rubric_scores": [
33
+ {
34
+ "agentId": "claude-sonnet",
35
+ "trial": 1,
36
+ "score": 5
37
+ },
38
+ {
39
+ "agentId": "claude-sonnet",
40
+ "trial": 2,
41
+ "score": 5
42
+ },
43
+ {
44
+ "agentId": "claude-sonnet",
45
+ "trial": 3,
46
+ "score": 5
47
+ },
48
+ {
49
+ "agentId": "kilo-deepseek",
50
+ "trial": 1,
51
+ "score": 5
52
+ },
53
+ {
54
+ "agentId": "kilo-deepseek",
55
+ "trial": 2,
56
+ "score": 1
57
+ },
58
+ {
59
+ "agentId": "kilo-deepseek",
60
+ "trial": 3,
61
+ "score": 5
62
+ },
63
+ {
64
+ "agentId": "kilo-glm",
65
+ "trial": 1,
66
+ "score": 5
67
+ },
68
+ {
69
+ "agentId": "kilo-glm",
70
+ "trial": 2,
71
+ "score": 5
72
+ },
73
+ {
74
+ "agentId": "kilo-glm",
75
+ "trial": 3,
76
+ "score": 5
77
+ },
78
+ {
79
+ "agentId": "kilo-minimax",
80
+ "trial": 1,
81
+ "score": 3
82
+ },
83
+ {
84
+ "agentId": "kilo-minimax",
85
+ "trial": 2,
86
+ "score": 5
87
+ },
88
+ {
89
+ "agentId": "kilo-minimax",
90
+ "trial": 3,
91
+ "score": 5
92
+ }
93
+ ]
94
94
  }