workflow-ai 1.0.65 → 1.0.66
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +377 -371
- package/configs/agent-health-rules.yaml +12 -1
- package/configs/pipeline.yaml +6 -6
- package/package.json +1 -1
- package/src/lib/agent-spawner.mjs +47 -6
- package/src/lib/error-classifier.mjs +311 -274
- package/src/runner.mjs +215 -58
- package/src/skills/coach/tests/cases/TC-COACH-001/current/meta.json +93 -93
- package/src/skills/coach/tests/cases/TC-COACH-002/current/meta.json +93 -93
- package/src/skills/create-plan/SKILL.md +1 -0
- package/src/skills/create-plan/knowledge/test-hygiene.md +47 -0
- package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-005/current/meta.json +113 -113
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/meta.json +87 -87
- package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/meta.json +87 -87
- package/src/skills/review-result/SKILL.md +1 -0
- package/src/skills/review-result/knowledge/test-hygiene.md +44 -0
- package/src/skills/review-result/scripts/verify-artifacts.js +115 -2
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-003/current/claude-sonnet/trial-1.md +7 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-003/current/claude-sonnet/trial-2.md +7 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-003/current/claude-sonnet/trial-3.md +7 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-003/current/judge.json +163 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-003/current/kilo-deepseek/trial-1.md +5 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-003/current/kilo-deepseek/trial-2.md +5 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-003/current/kilo-deepseek/trial-3.md +11 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-003/current/kilo-glm/trial-1.md +16 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-003/current/kilo-glm/trial-2.md +18 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-003/current/kilo-glm/trial-3.md +17 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-003/current/kilo-minimax/trial-1.md +17 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-003/current/kilo-minimax/trial-2.md +31 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-003/current/kilo-minimax/trial-3.md +5 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-003/current/meta.json +115 -0
- package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-003-test-isolation.yaml +50 -0
- package/src/skills/review-result/tests/fixtures/QA-904-test-isolation-violation/QA-904.md +51 -0
- package/src/skills/review-result/tests/fixtures/QA-904-test-isolation-violation/example-test.mjs +36 -0
- package/src/skills/review-result/tests/index.yaml +5 -0
- package/src/skills/review-result/tests/rubrics/test-isolation.md +20 -0
package/src/runner.mjs
CHANGED
|
@@ -6,6 +6,9 @@ import { spawn, execSync } from 'child_process';
|
|
|
6
6
|
import crypto from 'crypto';
|
|
7
7
|
import yaml from './lib/js-yaml.mjs';
|
|
8
8
|
import { findProjectRoot } from './lib/find-root.mjs';
|
|
9
|
+
import { loadRules, scanStderrForFatalRule, classify } from './lib/error-classifier.mjs';
|
|
10
|
+
import { snapshot, diff, isEmpty } from './lib/artifact-snapshot.mjs';
|
|
11
|
+
import { markUnhealthy, isHealthy } from './lib/agent-health-registry.mjs';
|
|
9
12
|
|
|
10
13
|
// ============================================================================
|
|
11
14
|
// Logger — система логирования с уровнями DEBUG/INFO/WARN/ERROR
|
|
@@ -846,6 +849,26 @@ class StageExecutor {
|
|
|
846
849
|
|
|
847
850
|
// Текущий дочерний процесс агента (для kill при shutdown)
|
|
848
851
|
this.currentChild = null;
|
|
852
|
+
|
|
853
|
+
// Правила health-классификатора (инициализируются один раз в конструкторе)
|
|
854
|
+
this.rules = loadRules(projectRoot);
|
|
855
|
+
|
|
856
|
+
// Лениво загружаемые правила health-классификатора для онлайн-сканирования stderr
|
|
857
|
+
this._healthRules = null;
|
|
858
|
+
}
|
|
859
|
+
|
|
860
|
+
/** Возвращает правила health-классификатора, загружая их при первом обращении. */
|
|
861
|
+
_getHealthRules() {
|
|
862
|
+
if (this._healthRules !== null) return this._healthRules;
|
|
863
|
+
try {
|
|
864
|
+
this._healthRules = loadRules(this.projectRoot);
|
|
865
|
+
} catch (e) {
|
|
866
|
+
if (this.logger) {
|
|
867
|
+
this.logger.warn(`Failed to load agent-health-rules: ${e.message}`, 'CLI');
|
|
868
|
+
}
|
|
869
|
+
this._healthRules = { common: [], agents: new Map() };
|
|
870
|
+
}
|
|
871
|
+
return this._healthRules;
|
|
849
872
|
}
|
|
850
873
|
|
|
851
874
|
/**
|
|
@@ -875,8 +898,13 @@ class StageExecutor {
|
|
|
875
898
|
* 4. Скрипт-агенты (stage.agent: script-*) обрабатываются в отдельной ветке
|
|
876
899
|
* execute() — сюда не попадают.
|
|
877
900
|
*/
|
|
878
|
-
resolveAgent(stage, stageId) {
|
|
879
|
-
const
|
|
901
|
+
resolveAgent(stage, stageId, options = {}) {
|
|
902
|
+
const excludeAgents = options.excludeAgents || [];
|
|
903
|
+
// Семантика: counter = число УЖЕ ИСЧЕРПАННЫХ попыток (0 на старте, инкрементируется
|
|
904
|
+
// стадией `increment-*-attempts` ПОСЛЕ каждой неудачи). attempt — номер текущей
|
|
905
|
+
// (1-based). Читаем counter через ?? 0, чтобы отличать «ещё не запускались»
|
|
906
|
+
// от «была 1 попытка» — иначе оффсет-by-one и ротация застревает на первом агенте.
|
|
907
|
+
const attempt = (stage.counter ? (this.counters[stage.counter] ?? 0) : 0) + 1;
|
|
880
908
|
|
|
881
909
|
// Task type: явно из context либо из префикса ticket_id
|
|
882
910
|
const taskType = this.context.task_type
|
|
@@ -921,9 +949,28 @@ class StageExecutor {
|
|
|
921
949
|
const caps = Array.isArray(agent.capabilities) ? agent.capabilities : [];
|
|
922
950
|
return required.every(r => caps.includes(r));
|
|
923
951
|
};
|
|
924
|
-
const
|
|
925
|
-
|
|
926
|
-
|
|
952
|
+
const afterCapabilities = agentIds.filter(covers);
|
|
953
|
+
|
|
954
|
+
// Фильтр по health-реестру: unhealthy-агенты с неистёкшим TTL пропускаются.
|
|
955
|
+
// Реестр персистентный между attempt'ами (план rev.3, решение 6.5).
|
|
956
|
+
const afterHealth = afterCapabilities.filter(id => isHealthy(this.projectRoot, id));
|
|
957
|
+
|
|
958
|
+
// Фильтр по excludeAgents (для in-stage fallback в рамках одной attempt)
|
|
959
|
+
const afterExclude = excludeAgents.length > 0
|
|
960
|
+
? afterHealth.filter(id => !excludeAgents.includes(id))
|
|
961
|
+
: afterHealth;
|
|
962
|
+
|
|
963
|
+
if (afterExclude.length === 0) {
|
|
964
|
+
// Все capability-совместимые агенты либо unhealthy в реестре, либо уже пробованы в этой attempt.
|
|
965
|
+
if (afterCapabilities.length > 0) {
|
|
966
|
+
return {
|
|
967
|
+
blocked: 'all_unhealthy',
|
|
968
|
+
reason: excludeAgents.length > 0
|
|
969
|
+
? `All agents tried in fallback`
|
|
970
|
+
: `All capable agents are unhealthy in registry`,
|
|
971
|
+
attempt
|
|
972
|
+
};
|
|
973
|
+
}
|
|
927
974
|
return {
|
|
928
975
|
blocked: 'no_capable_agent',
|
|
929
976
|
reason: `No agent in [${agentIds.join(', ')}] covers required_capabilities [${required.join(', ')}]`,
|
|
@@ -932,12 +979,111 @@ class StageExecutor {
|
|
|
932
979
|
}
|
|
933
980
|
|
|
934
981
|
// Курсор = (attempt - 1) % length — ротация по кругу
|
|
935
|
-
const cursor = (attempt - 1) %
|
|
982
|
+
const cursor = (attempt - 1) % afterExclude.length;
|
|
936
983
|
|
|
937
|
-
const agentId =
|
|
984
|
+
const agentId = afterExclude[cursor];
|
|
938
985
|
// Клонируем stage с подменой instructions (для agents_by_type override)
|
|
939
986
|
const effectiveStage = { ...stage, instructions };
|
|
940
|
-
return { agentId, effectiveStage, attempt, compatible };
|
|
987
|
+
return { agentId, effectiveStage, attempt, compatible: afterExclude };
|
|
988
|
+
}
|
|
989
|
+
|
|
990
|
+
/**
|
|
991
|
+
* Выполняет stage с fallback-логикой: при пустом artifact diff делает retry с другим агентом.
|
|
992
|
+
* @param {string} stageId - ID stage из конфигурации
|
|
993
|
+
* @param {object} [stageOverride] - явный stage (для тестов и промежуточных вызовов); по умолчанию берётся из pipeline.stages
|
|
994
|
+
* @returns {Promise<{status: string, output: string, result?: object}>}
|
|
995
|
+
*/
|
|
996
|
+
async executeWithFallback(stageId, stageOverride) {
|
|
997
|
+
const stage = stageOverride ?? this.pipeline.stages[stageId];
|
|
998
|
+
if (!stage) {
|
|
999
|
+
throw new Error(`Stage not found: ${stageId}`);
|
|
1000
|
+
}
|
|
1001
|
+
|
|
1002
|
+
const triedInThisAttempt = [];
|
|
1003
|
+
let lastErr = null;
|
|
1004
|
+
|
|
1005
|
+
const snapshotEnabled = this.pipeline.execution?.artifact_snapshot_enabled !== false;
|
|
1006
|
+
const snapshotOpts = {
|
|
1007
|
+
includePaths: this.pipeline.execution?.snapshot_paths ?? ['src', 'configs'],
|
|
1008
|
+
snapshotMaxFileSize: this.pipeline.execution?.snapshot_max_file_size ?? 524288,
|
|
1009
|
+
};
|
|
1010
|
+
|
|
1011
|
+
while (true) {
|
|
1012
|
+
const resolved = this.resolveAgent(stage, stageId, { excludeAgents: triedInThisAttempt });
|
|
1013
|
+
|
|
1014
|
+
if (resolved.blocked) {
|
|
1015
|
+
// all_unhealthy после исчерпания списка в текущей attempt (lastErr есть) —
|
|
1016
|
+
// re-throw, чтобы стадия ушла в goto.error и inc-counter. Без lastErr —
|
|
1017
|
+
// первая итерация while, агентов сразу нет (persistence из прошлой attempt)
|
|
1018
|
+
// → возвращаем blocked, чтобы конфиг мог развести goto.blocked vs goto.error.
|
|
1019
|
+
if (resolved.blocked === 'all_unhealthy' && lastErr) {
|
|
1020
|
+
throw lastErr;
|
|
1021
|
+
}
|
|
1022
|
+
return { status: 'blocked', blocked_reason: resolved.blocked, reason: resolved.reason };
|
|
1023
|
+
}
|
|
1024
|
+
|
|
1025
|
+
const { agentId, effectiveStage } = resolved;
|
|
1026
|
+
const agent = this.pipeline.agents[agentId];
|
|
1027
|
+
const prompt = this.promptBuilder.build(effectiveStage, stageId);
|
|
1028
|
+
|
|
1029
|
+
const before = snapshotEnabled ? await snapshot(this.projectRoot, snapshotOpts) : null;
|
|
1030
|
+
|
|
1031
|
+
try {
|
|
1032
|
+
if (this.logger) {
|
|
1033
|
+
this.logger.info(
|
|
1034
|
+
`Agent selected: ${agentId} (attempt ${resolved.attempt}, compatible=[${resolved.compatible.join(', ')}])`,
|
|
1035
|
+
stageId
|
|
1036
|
+
);
|
|
1037
|
+
this.logger.stageStart(stageId, agentId, effectiveStage.skill);
|
|
1038
|
+
}
|
|
1039
|
+
|
|
1040
|
+
const result = await this.callAgent(agent, prompt, stageId, effectiveStage.skill, agentId);
|
|
1041
|
+
|
|
1042
|
+
if (this.logger) this.logger.stageComplete(stageId, result.status, result.exitCode);
|
|
1043
|
+
return result;
|
|
1044
|
+
} catch (err) {
|
|
1045
|
+
if (!err.exitCode && !err.code) throw err;
|
|
1046
|
+
|
|
1047
|
+
const exitCode = err.exitCode ?? err.code;
|
|
1048
|
+
const stderr = err.stderr || '';
|
|
1049
|
+
|
|
1050
|
+
const after = snapshotEnabled ? await snapshot(this.projectRoot, snapshotOpts) : null;
|
|
1051
|
+
const diffResult = snapshotEnabled ? diff(before, after) : null;
|
|
1052
|
+
const diffEmpty = snapshotEnabled && isEmpty(diffResult);
|
|
1053
|
+
|
|
1054
|
+
const classification = await classify(this.rules, agentId, { exitCode, stderr });
|
|
1055
|
+
if (classification) {
|
|
1056
|
+
markUnhealthy(this.projectRoot, agentId, classification);
|
|
1057
|
+
if (this.logger) {
|
|
1058
|
+
this.logger.info(
|
|
1059
|
+
`agent ${agentId} marked unhealthy: class=${classification.class}, excluded (fallback triggered)`,
|
|
1060
|
+
stageId
|
|
1061
|
+
);
|
|
1062
|
+
}
|
|
1063
|
+
}
|
|
1064
|
+
|
|
1065
|
+
if (!diffEmpty) {
|
|
1066
|
+
const changedPaths = diffResult ? Object.keys(diffResult).join(', ') : 'unknown';
|
|
1067
|
+
if (this.logger) {
|
|
1068
|
+
this.logger.warn(
|
|
1069
|
+
`agent ${agentId} exited ${exitCode}, artifacts modified [${changedPaths}] — fallback blocked`,
|
|
1070
|
+
stageId
|
|
1071
|
+
);
|
|
1072
|
+
}
|
|
1073
|
+
throw err;
|
|
1074
|
+
}
|
|
1075
|
+
|
|
1076
|
+
if (this.logger) {
|
|
1077
|
+
this.logger.info(
|
|
1078
|
+
`agent ${agentId} exited ${exitCode}, artifact diff empty — falling back in-stage (class=${classification?.class ?? 'unmatched'})`,
|
|
1079
|
+
stageId
|
|
1080
|
+
);
|
|
1081
|
+
}
|
|
1082
|
+
|
|
1083
|
+
triedInThisAttempt.push(agentId);
|
|
1084
|
+
lastErr = err;
|
|
1085
|
+
}
|
|
1086
|
+
}
|
|
941
1087
|
}
|
|
942
1088
|
|
|
943
1089
|
/**
|
|
@@ -961,7 +1107,7 @@ class StageExecutor {
|
|
|
961
1107
|
const skipGuard = this.fileGuard && this.fileGuard.isTrusted(stage.agent, stageId);
|
|
962
1108
|
if (this.fileGuard && !skipGuard) this.fileGuard.takeSnapshot();
|
|
963
1109
|
|
|
964
|
-
const result = await this.callAgent(agent, prompt, stageId, stage.skill);
|
|
1110
|
+
const result = await this.callAgent(agent, prompt, stageId, stage.skill, stage.agent);
|
|
965
1111
|
|
|
966
1112
|
if (this.logger) this.logger.stageComplete(stageId, result.status, result.exitCode);
|
|
967
1113
|
if (this.fileGuard && !skipGuard) {
|
|
@@ -971,56 +1117,20 @@ class StageExecutor {
|
|
|
971
1117
|
return result;
|
|
972
1118
|
}
|
|
973
1119
|
|
|
974
|
-
// Новая ветка: список кандидатов с фильтром по capabilities
|
|
975
|
-
|
|
976
|
-
if (resolved.blocked) {
|
|
977
|
-
if (this.logger) {
|
|
978
|
-
this.logger.error(
|
|
979
|
-
`Stage "${stageId}" blocked: ${resolved.blocked} — ${resolved.reason}`,
|
|
980
|
-
stageId
|
|
981
|
-
);
|
|
982
|
-
}
|
|
983
|
-
return {
|
|
984
|
-
status: 'blocked',
|
|
985
|
-
blocked_reason: resolved.blocked,
|
|
986
|
-
output: resolved.reason,
|
|
987
|
-
result: { blocked: resolved.blocked, reason: resolved.reason },
|
|
988
|
-
exitCode: 0,
|
|
989
|
-
parsed: false
|
|
990
|
-
};
|
|
991
|
-
}
|
|
992
|
-
|
|
993
|
-
const { agentId, effectiveStage } = resolved;
|
|
994
|
-
const agent = this.pipeline.agents[agentId];
|
|
995
|
-
const prompt = this.promptBuilder.build(effectiveStage, stageId);
|
|
996
|
-
|
|
997
|
-
if (this.logger) {
|
|
998
|
-
this.logger.info(
|
|
999
|
-
`Agent selected: ${agentId} (attempt ${resolved.attempt}, compatible=[${resolved.compatible.join(', ')}])`,
|
|
1000
|
-
stageId
|
|
1001
|
-
);
|
|
1002
|
-
this.logger.stageStart(stageId, agentId, effectiveStage.skill);
|
|
1003
|
-
}
|
|
1004
|
-
|
|
1005
|
-
const skipGuard = this.fileGuard && this.fileGuard.isTrusted(agentId, stageId);
|
|
1006
|
-
if (this.fileGuard && !skipGuard) this.fileGuard.takeSnapshot();
|
|
1007
|
-
|
|
1008
|
-
const result = await this.callAgent(agent, prompt, stageId, effectiveStage.skill);
|
|
1009
|
-
|
|
1010
|
-
if (this.logger) this.logger.stageComplete(stageId, result.status, result.exitCode);
|
|
1011
|
-
if (this.fileGuard && !skipGuard) {
|
|
1012
|
-
const violations = this.fileGuard.checkAndRollback();
|
|
1013
|
-
if (violations.length > 0) result.violations = violations;
|
|
1014
|
-
}
|
|
1015
|
-
return result;
|
|
1120
|
+
// Новая ветка: список кандидатов с фильтром по capabilities → executeWithFallback
|
|
1121
|
+
return this.executeWithFallback(stageId);
|
|
1016
1122
|
}
|
|
1017
1123
|
|
|
1018
1124
|
/**
|
|
1019
1125
|
* Вызывает CLI-агента через child_process
|
|
1020
1126
|
*/
|
|
1021
|
-
callAgent(agent, prompt, stageId, skillId) {
|
|
1127
|
+
callAgent(agent, prompt, stageId, skillId, agentId = null) {
|
|
1022
1128
|
return new Promise((resolve, reject) => {
|
|
1023
1129
|
const timeout = this.pipeline.execution?.timeout_per_stage || 300;
|
|
1130
|
+
const healthRules = agentId ? this._getHealthRules() : null;
|
|
1131
|
+
const hasAgentRules = Boolean(
|
|
1132
|
+
healthRules && agentId && healthRules.agents.get(agentId)?.length
|
|
1133
|
+
);
|
|
1024
1134
|
const args = [...agent.args];
|
|
1025
1135
|
const finalPrompt = prompt;
|
|
1026
1136
|
|
|
@@ -1065,16 +1175,23 @@ class StageExecutor {
|
|
|
1065
1175
|
let stdout = '';
|
|
1066
1176
|
let stderr = '';
|
|
1067
1177
|
let timedOut = false;
|
|
1178
|
+
let earlyKilled = false;
|
|
1179
|
+
let earlyKillRule = null;
|
|
1180
|
+
let lastScanSize = 0;
|
|
1068
1181
|
|
|
1069
|
-
|
|
1070
|
-
const timeoutId = setTimeout(() => {
|
|
1071
|
-
timedOut = true;
|
|
1072
|
-
// На Windows SIGTERM игнорируется — используем taskkill /T /F для убийства дерева
|
|
1182
|
+
const killChild = () => {
|
|
1073
1183
|
if (process.platform === 'win32' && child.pid) {
|
|
1074
1184
|
try { execSync(`taskkill /pid ${child.pid} /T /F`, { stdio: 'pipe' }); } catch {}
|
|
1075
1185
|
} else {
|
|
1076
|
-
child.kill('SIGTERM');
|
|
1186
|
+
try { child.kill('SIGTERM'); } catch {}
|
|
1077
1187
|
}
|
|
1188
|
+
};
|
|
1189
|
+
|
|
1190
|
+
// Таймаут
|
|
1191
|
+
const timeoutId = setTimeout(() => {
|
|
1192
|
+
timedOut = true;
|
|
1193
|
+
// На Windows SIGTERM игнорируется — используем taskkill /T /F для убийства дерева
|
|
1194
|
+
killChild();
|
|
1078
1195
|
if (this.logger) {
|
|
1079
1196
|
this.logger.timeout(stageId, timeout);
|
|
1080
1197
|
}
|
|
@@ -1120,6 +1237,36 @@ class StageExecutor {
|
|
|
1120
1237
|
child.stderr.on('data', (data) => {
|
|
1121
1238
|
stderr += data.toString();
|
|
1122
1239
|
process.stderr.write(data);
|
|
1240
|
+
|
|
1241
|
+
// Online-детекция фатальных паттернов (quota/429/usage-limit и т.п.).
|
|
1242
|
+
// Нужна чтобы не ждать timeout_per_stage (1800s), когда агентский CLI
|
|
1243
|
+
// уходит в молчаливый retry-цикл после HTTP 429.
|
|
1244
|
+
if (!hasAgentRules || earlyKilled || timedOut) return;
|
|
1245
|
+
// Throttle: первый скан всегда, последующие — только после 200+ новых байт.
|
|
1246
|
+
if (lastScanSize > 0 && stderr.length - lastScanSize < 200) return;
|
|
1247
|
+
lastScanSize = stderr.length;
|
|
1248
|
+
const match = scanStderrForFatalRule(healthRules, agentId, stderr);
|
|
1249
|
+
if (!match) return;
|
|
1250
|
+
|
|
1251
|
+
earlyKilled = true;
|
|
1252
|
+
earlyKillRule = match;
|
|
1253
|
+
clearTimeout(timeoutId);
|
|
1254
|
+
if (this.logger) {
|
|
1255
|
+
this.logger.error(
|
|
1256
|
+
`Fatal stderr pattern matched for ${agentId} (rule=${match.rule_id}, class=${match.class}). Killing process.`,
|
|
1257
|
+
stageId
|
|
1258
|
+
);
|
|
1259
|
+
}
|
|
1260
|
+
killChild();
|
|
1261
|
+
const err = new Error(
|
|
1262
|
+
`Agent "${agentId}" killed early: ${match.rule_id} (class=${match.class})`
|
|
1263
|
+
);
|
|
1264
|
+
err.code = 'EARLY_KILL';
|
|
1265
|
+
err.exitCode = -1;
|
|
1266
|
+
err.stderr = stderr;
|
|
1267
|
+
err.earlyKill = true;
|
|
1268
|
+
err.rule = match;
|
|
1269
|
+
reject(err);
|
|
1123
1270
|
});
|
|
1124
1271
|
|
|
1125
1272
|
child.on('close', (code) => {
|
|
@@ -1139,6 +1286,16 @@ class StageExecutor {
|
|
|
1139
1286
|
process.stdout.write('\n');
|
|
1140
1287
|
|
|
1141
1288
|
if (timedOut) return;
|
|
1289
|
+
if (earlyKilled) {
|
|
1290
|
+
if (this.logger && stderr.trim()) {
|
|
1291
|
+
this.logger.warn(`STDERR ↓`, stageId);
|
|
1292
|
+
for (const line of stderr.trim().split('\n')) {
|
|
1293
|
+
this.logger.warn(` ${line}`, stageId);
|
|
1294
|
+
}
|
|
1295
|
+
this.logger.warn(`STDERR ↑`, stageId);
|
|
1296
|
+
}
|
|
1297
|
+
return;
|
|
1298
|
+
}
|
|
1142
1299
|
|
|
1143
1300
|
// Логгируем CLI вызов
|
|
1144
1301
|
if (this.logger) {
|
|
@@ -1205,7 +1362,7 @@ class StageExecutor {
|
|
|
1205
1362
|
|
|
1206
1363
|
child.on('error', (err) => {
|
|
1207
1364
|
clearTimeout(timeoutId);
|
|
1208
|
-
if (!timedOut) {
|
|
1365
|
+
if (!timedOut && !earlyKilled) {
|
|
1209
1366
|
if (this.logger) {
|
|
1210
1367
|
this.logger.error(`CLI error: ${err.message}`, stageId);
|
|
1211
1368
|
}
|
|
@@ -1,94 +1,94 @@
|
|
|
1
|
-
{
|
|
2
|
-
"date": "2026-04-
|
|
3
|
-
"skill_sha": "6df42d0",
|
|
4
|
-
"status": "passed",
|
|
5
|
-
"duration_ms":
|
|
6
|
-
"per_model": {
|
|
7
|
-
"claude-sonnet": {
|
|
8
|
-
"passed": true,
|
|
9
|
-
"pass_count": 3,
|
|
10
|
-
"total": 3,
|
|
11
|
-
"threshold": 2
|
|
12
|
-
},
|
|
13
|
-
"kilo-deepseek": {
|
|
14
|
-
"passed": true,
|
|
15
|
-
"pass_count": 2,
|
|
16
|
-
"total": 3,
|
|
17
|
-
"threshold": 2
|
|
18
|
-
},
|
|
19
|
-
"kilo-minimax": {
|
|
20
|
-
"passed": true,
|
|
21
|
-
"pass_count": 2,
|
|
22
|
-
"total": 3,
|
|
23
|
-
"threshold": 2
|
|
24
|
-
},
|
|
25
|
-
"kilo-glm": {
|
|
26
|
-
"passed": true,
|
|
27
|
-
"pass_count": 3,
|
|
28
|
-
"total": 3,
|
|
29
|
-
"threshold": 2
|
|
30
|
-
}
|
|
31
|
-
},
|
|
32
|
-
"rubric_scores": [
|
|
33
|
-
{
|
|
34
|
-
"agentId": "claude-sonnet",
|
|
35
|
-
"trial": 1,
|
|
36
|
-
"score": 5
|
|
37
|
-
},
|
|
38
|
-
{
|
|
39
|
-
"agentId": "claude-sonnet",
|
|
40
|
-
"trial": 2,
|
|
41
|
-
"score": 5
|
|
42
|
-
},
|
|
43
|
-
{
|
|
44
|
-
"agentId": "claude-sonnet",
|
|
45
|
-
"trial": 3,
|
|
46
|
-
"score": 5
|
|
47
|
-
},
|
|
48
|
-
{
|
|
49
|
-
"agentId": "kilo-deepseek",
|
|
50
|
-
"trial": 1,
|
|
51
|
-
"score": 5
|
|
52
|
-
},
|
|
53
|
-
{
|
|
54
|
-
"agentId": "kilo-deepseek",
|
|
55
|
-
"trial": 2,
|
|
56
|
-
"score": 1
|
|
57
|
-
},
|
|
58
|
-
{
|
|
59
|
-
"agentId": "kilo-deepseek",
|
|
60
|
-
"trial": 3,
|
|
61
|
-
"score": 5
|
|
62
|
-
},
|
|
63
|
-
{
|
|
64
|
-
"agentId": "kilo-glm",
|
|
65
|
-
"trial": 1,
|
|
66
|
-
"score": 5
|
|
67
|
-
},
|
|
68
|
-
{
|
|
69
|
-
"agentId": "kilo-glm",
|
|
70
|
-
"trial": 2,
|
|
71
|
-
"score": 5
|
|
72
|
-
},
|
|
73
|
-
{
|
|
74
|
-
"agentId": "kilo-glm",
|
|
75
|
-
"trial": 3,
|
|
76
|
-
"score": 5
|
|
77
|
-
},
|
|
78
|
-
{
|
|
79
|
-
"agentId": "kilo-minimax",
|
|
80
|
-
"trial": 1,
|
|
81
|
-
"score": 3
|
|
82
|
-
},
|
|
83
|
-
{
|
|
84
|
-
"agentId": "kilo-minimax",
|
|
85
|
-
"trial": 2,
|
|
86
|
-
"score": 5
|
|
87
|
-
},
|
|
88
|
-
{
|
|
89
|
-
"agentId": "kilo-minimax",
|
|
90
|
-
"trial": 3,
|
|
91
|
-
"score": 5
|
|
92
|
-
}
|
|
93
|
-
]
|
|
1
|
+
{
|
|
2
|
+
"date": "2026-04-21T16:43:17.710Z",
|
|
3
|
+
"skill_sha": "6df42d0",
|
|
4
|
+
"status": "passed",
|
|
5
|
+
"duration_ms": 2,
|
|
6
|
+
"per_model": {
|
|
7
|
+
"claude-sonnet": {
|
|
8
|
+
"passed": true,
|
|
9
|
+
"pass_count": 3,
|
|
10
|
+
"total": 3,
|
|
11
|
+
"threshold": 2
|
|
12
|
+
},
|
|
13
|
+
"kilo-deepseek": {
|
|
14
|
+
"passed": true,
|
|
15
|
+
"pass_count": 2,
|
|
16
|
+
"total": 3,
|
|
17
|
+
"threshold": 2
|
|
18
|
+
},
|
|
19
|
+
"kilo-minimax": {
|
|
20
|
+
"passed": true,
|
|
21
|
+
"pass_count": 2,
|
|
22
|
+
"total": 3,
|
|
23
|
+
"threshold": 2
|
|
24
|
+
},
|
|
25
|
+
"kilo-glm": {
|
|
26
|
+
"passed": true,
|
|
27
|
+
"pass_count": 3,
|
|
28
|
+
"total": 3,
|
|
29
|
+
"threshold": 2
|
|
30
|
+
}
|
|
31
|
+
},
|
|
32
|
+
"rubric_scores": [
|
|
33
|
+
{
|
|
34
|
+
"agentId": "claude-sonnet",
|
|
35
|
+
"trial": 1,
|
|
36
|
+
"score": 5
|
|
37
|
+
},
|
|
38
|
+
{
|
|
39
|
+
"agentId": "claude-sonnet",
|
|
40
|
+
"trial": 2,
|
|
41
|
+
"score": 5
|
|
42
|
+
},
|
|
43
|
+
{
|
|
44
|
+
"agentId": "claude-sonnet",
|
|
45
|
+
"trial": 3,
|
|
46
|
+
"score": 5
|
|
47
|
+
},
|
|
48
|
+
{
|
|
49
|
+
"agentId": "kilo-deepseek",
|
|
50
|
+
"trial": 1,
|
|
51
|
+
"score": 5
|
|
52
|
+
},
|
|
53
|
+
{
|
|
54
|
+
"agentId": "kilo-deepseek",
|
|
55
|
+
"trial": 2,
|
|
56
|
+
"score": 1
|
|
57
|
+
},
|
|
58
|
+
{
|
|
59
|
+
"agentId": "kilo-deepseek",
|
|
60
|
+
"trial": 3,
|
|
61
|
+
"score": 5
|
|
62
|
+
},
|
|
63
|
+
{
|
|
64
|
+
"agentId": "kilo-glm",
|
|
65
|
+
"trial": 1,
|
|
66
|
+
"score": 5
|
|
67
|
+
},
|
|
68
|
+
{
|
|
69
|
+
"agentId": "kilo-glm",
|
|
70
|
+
"trial": 2,
|
|
71
|
+
"score": 5
|
|
72
|
+
},
|
|
73
|
+
{
|
|
74
|
+
"agentId": "kilo-glm",
|
|
75
|
+
"trial": 3,
|
|
76
|
+
"score": 5
|
|
77
|
+
},
|
|
78
|
+
{
|
|
79
|
+
"agentId": "kilo-minimax",
|
|
80
|
+
"trial": 1,
|
|
81
|
+
"score": 3
|
|
82
|
+
},
|
|
83
|
+
{
|
|
84
|
+
"agentId": "kilo-minimax",
|
|
85
|
+
"trial": 2,
|
|
86
|
+
"score": 5
|
|
87
|
+
},
|
|
88
|
+
{
|
|
89
|
+
"agentId": "kilo-minimax",
|
|
90
|
+
"trial": 3,
|
|
91
|
+
"score": 5
|
|
92
|
+
}
|
|
93
|
+
]
|
|
94
94
|
}
|