@forwardimpact/libeval 0.1.62 → 0.1.63
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/agent-runner.js +44 -1
- package/src/commands/run.js +3 -1
package/package.json
CHANGED
package/src/agent-runner.js
CHANGED
|
@@ -10,6 +10,32 @@ import { AGENT_MODEL } from "@forwardimpact/libutil/models";
|
|
|
10
10
|
|
|
11
11
|
const DEFAULT_ALLOWED_TOOLS = ["Bash", "Read", "Glob", "Grep", "Write", "Edit"];
|
|
12
12
|
|
|
13
|
+
/**
 * Report whether a finished session shows evidence that the model was
 * actually invoked.
 *
 * Evidence is either a positive token total (input, output, cache-creation
 * and cache-read counts summed) or a positive `total_cost_usd`. A `result`
 * message that carries accounting data but shows zero tokens and zero cost
 * is treated as "no model work" — per this file's own notes, that is how a
 * Claude Code init/auth failure (e.g. an invalid `ANTHROPIC_API_KEY`) looks
 * when the SDK still reports `subtype: "success"`.
 *
 * When the message has neither a `usage` object nor a `total_cost_usd`
 * value there is nothing to contradict the reported subtype, so the
 * function answers `true`.
 *
 * @param {object|null} result - The SDK `result` message, or null.
 * @returns {boolean} `false` for a missing message or for zero tokens and
 *   zero cost; `true` otherwise.
 */
function modelDidWork(result) {
  if (!result) {
    return false;
  }

  const usage = result.usage;
  const cost = result.total_cost_usd;

  // No accounting data at all: do not second-guess the reported subtype.
  const hasUsage = usage != null;
  const hasCost = cost != null;
  if (!hasUsage && !hasCost) {
    return true;
  }

  // Missing counters count as zero; fields are read and added in a fixed
  // order (input, output, cache creation, cache read).
  let tokens = 0;
  if (usage) {
    const inputTokens = usage.input_tokens ?? 0;
    const outputTokens = usage.output_tokens ?? 0;
    const cacheCreationTokens = usage.cache_creation_input_tokens ?? 0;
    const cacheReadTokens = usage.cache_read_input_tokens ?? 0;
    tokens = inputTokens + outputTokens + cacheCreationTokens + cacheReadTokens;
  }

  if (tokens > 0) {
    return true;
  }
  return (cost ?? 0) > 0;
}
|
|
38
|
+
|
|
13
39
|
// fit-eval and kata-action run headless in CI/CD with no human to answer
|
|
14
40
|
// permission prompts. The SDK is always launched in bypass mode — not
|
|
15
41
|
// overridable — so a future caller can't accidentally reduce permissions.
|
|
@@ -148,6 +174,7 @@ export class AgentRunner {
|
|
|
148
174
|
async #consumeQuery(iterator) {
|
|
149
175
|
let text = "";
|
|
150
176
|
let stopReason = null;
|
|
177
|
+
let resultMessage = null;
|
|
151
178
|
let error = null;
|
|
152
179
|
let aborted = false;
|
|
153
180
|
|
|
@@ -157,6 +184,7 @@ export class AgentRunner {
|
|
|
157
184
|
if (message.type === "result") {
|
|
158
185
|
text = message.result ?? "";
|
|
159
186
|
stopReason = message.subtype;
|
|
187
|
+
resultMessage = message;
|
|
160
188
|
}
|
|
161
189
|
}
|
|
162
190
|
} catch (err) {
|
|
@@ -167,8 +195,23 @@ export class AgentRunner {
|
|
|
167
195
|
}
|
|
168
196
|
}
|
|
169
197
|
|
|
198
|
+
// A "success" subtype is necessary but not sufficient: the SDK reports a
|
|
199
|
+
// failed init (e.g. an invalid API key) as success with zero model work.
|
|
200
|
+
// Require evidence the model actually ran, and surface a clear error when
|
|
201
|
+
// it didn't, so the masked failure can't be reported as a green run.
|
|
202
|
+
const reportedSuccess = stopReason === "success";
|
|
203
|
+
const success =
|
|
204
|
+
reportedSuccess &&
|
|
205
|
+
resultMessage?.is_error !== true &&
|
|
206
|
+
modelDidWork(resultMessage);
|
|
207
|
+
if (reportedSuccess && !success && !error) {
|
|
208
|
+
error = new Error(
|
|
209
|
+
"agent reported success but performed no model work (zero token usage) — likely a Claude Code init or authentication failure",
|
|
210
|
+
);
|
|
211
|
+
}
|
|
212
|
+
|
|
170
213
|
return {
|
|
171
|
-
success
|
|
214
|
+
success,
|
|
172
215
|
text,
|
|
173
216
|
sessionId: this.sessionId,
|
|
174
217
|
error,
|
package/src/commands/run.js
CHANGED
|
@@ -140,5 +140,7 @@ export async function runRunCommand(ctx) {
|
|
|
140
140
|
await new Promise((r) => fileStream.end(r));
|
|
141
141
|
}
|
|
142
142
|
|
|
143
|
-
return result.success
|
|
143
|
+
return result.success
|
|
144
|
+
? { ok: true }
|
|
145
|
+
: { ok: false, code: 1, error: result.error?.message ?? "" };
|
|
144
146
|
}
|