judgeval 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +95 -68
- package/dist/cjs/common/tracer.js +235 -143
- package/dist/cjs/common/tracer.js.map +1 -1
- package/dist/cjs/constants.js +8 -5
- package/dist/cjs/constants.js.map +1 -1
- package/dist/cjs/data/datasets/eval-dataset-client.js +349 -0
- package/dist/cjs/data/datasets/eval-dataset-client.js.map +1 -0
- package/dist/cjs/data/datasets/eval-dataset.js +405 -0
- package/dist/cjs/data/datasets/eval-dataset.js.map +1 -0
- package/dist/cjs/data/example.js +22 -1
- package/dist/cjs/data/example.js.map +1 -1
- package/dist/cjs/e2etests/eval-operations.test.js +282 -0
- package/dist/cjs/e2etests/eval-operations.test.js.map +1 -0
- package/dist/cjs/e2etests/judgee-traces.test.js +278 -0
- package/dist/cjs/e2etests/judgee-traces.test.js.map +1 -0
- package/dist/cjs/index.js +1 -3
- package/dist/cjs/index.js.map +1 -1
- package/dist/cjs/judgment-client.js +326 -645
- package/dist/cjs/judgment-client.js.map +1 -1
- package/dist/cjs/scorers/api-scorer.js +56 -48
- package/dist/cjs/scorers/api-scorer.js.map +1 -1
- package/dist/cjs/scorers/base-scorer.js +66 -11
- package/dist/cjs/scorers/base-scorer.js.map +1 -1
- package/dist/esm/common/tracer.js +236 -144
- package/dist/esm/common/tracer.js.map +1 -1
- package/dist/esm/constants.js +7 -4
- package/dist/esm/constants.js.map +1 -1
- package/dist/esm/data/datasets/eval-dataset-client.js +342 -0
- package/dist/esm/data/datasets/eval-dataset-client.js.map +1 -0
- package/dist/esm/data/datasets/eval-dataset.js +375 -0
- package/dist/esm/data/datasets/eval-dataset.js.map +1 -0
- package/dist/esm/data/example.js +22 -1
- package/dist/esm/data/example.js.map +1 -1
- package/dist/esm/e2etests/eval-operations.test.js +254 -0
- package/dist/esm/e2etests/eval-operations.test.js.map +1 -0
- package/dist/esm/e2etests/judgee-traces.test.js +253 -0
- package/dist/esm/e2etests/judgee-traces.test.js.map +1 -0
- package/dist/esm/index.js +0 -1
- package/dist/esm/index.js.map +1 -1
- package/dist/esm/judgment-client.js +328 -647
- package/dist/esm/judgment-client.js.map +1 -1
- package/dist/esm/scorers/api-scorer.js +56 -48
- package/dist/esm/scorers/api-scorer.js.map +1 -1
- package/dist/esm/scorers/base-scorer.js +66 -11
- package/dist/esm/scorers/base-scorer.js.map +1 -1
- package/dist/types/common/tracer.d.ts +27 -14
- package/dist/types/constants.d.ts +4 -4
- package/dist/types/data/datasets/eval-dataset-client.d.ts +39 -0
- package/dist/types/data/datasets/eval-dataset.d.ts +45 -0
- package/dist/types/data/example.d.ts +24 -12
- package/dist/types/e2etests/eval-operations.test.d.ts +5 -0
- package/dist/types/e2etests/judgee-traces.test.d.ts +5 -0
- package/dist/types/index.d.ts +0 -1
- package/dist/types/judgment-client.d.ts +3 -47
- package/dist/types/scorers/api-scorer.d.ts +15 -15
- package/dist/types/scorers/base-scorer.d.ts +53 -10
- package/package.json +2 -1
- package/dist/cjs/scorers/exact-match-scorer.js +0 -84
- package/dist/cjs/scorers/exact-match-scorer.js.map +0 -1
- package/dist/esm/scorers/exact-match-scorer.js +0 -80
- package/dist/esm/scorers/exact-match-scorer.js.map +0 -1
- package/dist/types/scorers/exact-match-scorer.d.ts +0 -10
|
@@ -74,16 +74,13 @@ class TraceManagerClient {
|
|
|
74
74
|
try {
|
|
75
75
|
// Use isomorphic fetch (available globally in modern Node.js and browsers)
|
|
76
76
|
const response = yield fetch(url, Object.assign(Object.assign({}, options), { headers: headers }));
|
|
77
|
-
if
|
|
78
|
-
|
|
79
|
-
console.error(`API Error (${response.status}) for ${options.method || 'GET'} ${url}: ${errorBody}`);
|
|
80
|
-
throw new Error(`Judgment API request failed: ${response.status} ${response.statusText} - ${errorBody}`);
|
|
81
|
-
}
|
|
77
|
+
// We will return the response object even if !response.ok
|
|
78
|
+
// The caller (e.g., saveTrace) is responsible for checking response.ok or response.status
|
|
82
79
|
// Handle cases where the response might be empty (e.g., 204 No Content on DELETE)
|
|
83
80
|
if (response.status === 204) {
|
|
84
81
|
return null; // Indicate success with no content
|
|
85
82
|
}
|
|
86
|
-
return
|
|
83
|
+
return response;
|
|
87
84
|
}
|
|
88
85
|
catch (error) {
|
|
89
86
|
console.error(`Network or fetch error during ${options.method || 'GET'} ${url}:`, error);
|
|
@@ -100,21 +97,52 @@ class TraceManagerClient {
|
|
|
100
97
|
});
|
|
101
98
|
});
|
|
102
99
|
}
|
|
103
|
-
saveTrace(traceData
|
|
100
|
+
saveTrace(traceData) {
|
|
104
101
|
return __awaiter(this, void 0, void 0, function* () {
|
|
102
|
+
// _fetch now returns the raw response object or throws on network error
|
|
105
103
|
const response = yield this._fetch(constants_js_1.JUDGMENT_TRACES_SAVE_API_URL, {
|
|
106
104
|
method: 'POST',
|
|
107
|
-
body: JSON.stringify(traceData),
|
|
105
|
+
body: JSON.stringify(traceData), // Stringify directly here again
|
|
108
106
|
});
|
|
109
|
-
//
|
|
110
|
-
if (!
|
|
111
|
-
//
|
|
112
|
-
|
|
107
|
+
// Check if _fetch threw a network error (caught below) or returned an invalid object
|
|
108
|
+
if (!response) {
|
|
109
|
+
// This case should ideally be caught by _fetch's catch block, but double-check
|
|
110
|
+
throw new Error('Failed to save trace data: No response received from API.');
|
|
111
|
+
}
|
|
112
|
+
// Now, check the status code on the received response object
|
|
113
|
+
if (response.status === 400) {
|
|
114
|
+
// Attempt to get error body for more info
|
|
115
|
+
const errorBody = yield response.text();
|
|
116
|
+
throw new Error(`Failed to save trace data: Check your Trace name for conflicts, set overwrite=True to overwrite existing traces: ${response.status} ${response.statusText || ''} - ${errorBody}`);
|
|
117
|
+
}
|
|
118
|
+
else if (!response.ok) { // Handles other errors (5xx, 4xx except 400)
|
|
119
|
+
const errorBody = yield response.text();
|
|
120
|
+
throw new Error(`Failed to save trace data: Status ${response.status} ${response.statusText || '(No status text)'} - ${errorBody}`);
|
|
121
|
+
}
|
|
122
|
+
// --- Success Path ---
|
|
123
|
+
// Optionally log the UI URL (needs JSON parsing)
|
|
124
|
+
let responseData = null;
|
|
125
|
+
try {
|
|
126
|
+
// Handle 204 No Content specifically
|
|
127
|
+
if (response.status === 204) {
|
|
128
|
+
responseData = null; // Or maybe { success: true }?
|
|
129
|
+
}
|
|
130
|
+
else {
|
|
131
|
+
responseData = yield response.json(); // Parse JSON only on success
|
|
132
|
+
}
|
|
133
|
+
}
|
|
134
|
+
catch (parseError) {
|
|
135
|
+
logger_instance_js_1.default.warn("Failed to parse successful API response JSON.", { error: parseError });
|
|
136
|
+
// Depending on requirements, maybe throw, maybe return a default success object
|
|
137
|
+
throw new Error(`API request succeeded (${response.status}), but failed to parse JSON response.`);
|
|
138
|
+
}
|
|
139
|
+
if (responseData === null || responseData === void 0 ? void 0 : responseData.ui_results_url) {
|
|
113
140
|
console.info(`
|
|
114
|
-
🔍 View trace: ${
|
|
141
|
+
🔍 View trace: ${responseData.ui_results_url}
|
|
115
142
|
`);
|
|
116
143
|
}
|
|
117
|
-
|
|
144
|
+
// Return the parsed data (or null for 204)
|
|
145
|
+
return responseData;
|
|
118
146
|
});
|
|
119
147
|
}
|
|
120
148
|
deleteTrace(traceId) {
|
|
@@ -143,6 +171,50 @@ class TraceManagerClient {
|
|
|
143
171
|
});
|
|
144
172
|
});
|
|
145
173
|
}
|
|
174
|
+
/**
|
|
175
|
+
* Calculate token costs directly using the API endpoint.
|
|
176
|
+
* This is more accurate than client-side calculation as it uses the most up-to-date pricing.
|
|
177
|
+
*
|
|
178
|
+
* @param model The model name (e.g. 'gpt-4', 'claude-3-opus-20240229')
|
|
179
|
+
* @param promptTokens Number of tokens in the prompt/input
|
|
180
|
+
* @param completionTokens Number of tokens in the completion/output
|
|
181
|
+
* @returns Object containing token counts and calculated costs in USD
|
|
182
|
+
*/
|
|
183
|
+
calculateTokenCosts(model, promptTokens, completionTokens) {
|
|
184
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
185
|
+
try {
|
|
186
|
+
// Use the new calculation endpoint
|
|
187
|
+
const response = yield this._fetch(constants_js_1.JUDGMENT_CALCULATE_TOKEN_COSTS_API_URL, {
|
|
188
|
+
method: 'POST',
|
|
189
|
+
body: JSON.stringify({
|
|
190
|
+
model,
|
|
191
|
+
prompt_tokens: promptTokens,
|
|
192
|
+
completion_tokens: completionTokens
|
|
193
|
+
})
|
|
194
|
+
});
|
|
195
|
+
// Check if the response is okay and parse JSON
|
|
196
|
+
if (response && response.ok) {
|
|
197
|
+
const data = yield response.json();
|
|
198
|
+
return data;
|
|
199
|
+
}
|
|
200
|
+
else if (response) {
|
|
201
|
+
// Log error if response was not ok
|
|
202
|
+
const errorBody = yield response.text();
|
|
203
|
+
logger_instance_js_1.default.warn(`API error calculating token costs for model ${model}: ${response.status} ${response.statusText}`, { errorBody });
|
|
204
|
+
return null;
|
|
205
|
+
}
|
|
206
|
+
else {
|
|
207
|
+
// Handle cases where _fetch might return null or undefined (though it shouldn't with current implementation)
|
|
208
|
+
logger_instance_js_1.default.warn(`No response received when calculating token costs for model ${model}.`);
|
|
209
|
+
return null;
|
|
210
|
+
}
|
|
211
|
+
}
|
|
212
|
+
catch (error) {
|
|
213
|
+
logger_instance_js_1.default.warn(`Failed to calculate token costs for model ${model}.`, { error: error instanceof Error ? error.message : String(error) });
|
|
214
|
+
return null;
|
|
215
|
+
}
|
|
216
|
+
});
|
|
217
|
+
}
|
|
146
218
|
}
|
|
147
219
|
exports.TraceManagerClient = TraceManagerClient;
|
|
148
220
|
// --- Helper Functions ---
|
|
@@ -160,6 +232,7 @@ class TraceClient {
|
|
|
160
232
|
constructor(config) {
|
|
161
233
|
var _a, _b, _c, _d, _e;
|
|
162
234
|
this.traceManager = null; // Can be null if monitoring disabled
|
|
235
|
+
this._spanDepths = {}; // Track depth of active spans
|
|
163
236
|
this.traceId = config.traceId || (0, uuid_1.v4)();
|
|
164
237
|
this.originalName = config.name || 'default_trace'; // Store original
|
|
165
238
|
this.name = sanitizeName(this.originalName); // Use sanitized name internally
|
|
@@ -198,7 +271,7 @@ class TraceClient {
|
|
|
198
271
|
recordInput(inputs) {
|
|
199
272
|
const traceClientContext = getTraceClientContext();
|
|
200
273
|
const currentEntry = traceClientContext.entryStack.at(-1);
|
|
201
|
-
if (!currentEntry) {
|
|
274
|
+
if (!currentEntry || !currentEntry.span_id) {
|
|
202
275
|
console.warn(`No current entry to record input to\nStack trace: ${new Error().stack}`);
|
|
203
276
|
return;
|
|
204
277
|
}
|
|
@@ -207,14 +280,16 @@ class TraceClient {
|
|
|
207
280
|
span_id: currentEntry.span_id,
|
|
208
281
|
inputs,
|
|
209
282
|
function: currentEntry.function,
|
|
210
|
-
depth: currentEntry.
|
|
211
|
-
|
|
283
|
+
depth: this._spanDepths[currentEntry.span_id],
|
|
284
|
+
created_at: Date.now() / 1000,
|
|
285
|
+
span_type: currentEntry.span_type,
|
|
286
|
+
message: `Inputs to ${currentEntry.function}`
|
|
212
287
|
});
|
|
213
288
|
}
|
|
214
289
|
recordOutput(output) {
|
|
215
290
|
const traceClientContext = getTraceClientContext();
|
|
216
291
|
const currentEntry = traceClientContext.entryStack.at(-1);
|
|
217
|
-
if (!currentEntry) {
|
|
292
|
+
if (!currentEntry || !currentEntry.span_id) {
|
|
218
293
|
console.warn(`No current entry to record output to\nStack trace: ${new Error().stack}`);
|
|
219
294
|
return;
|
|
220
295
|
}
|
|
@@ -223,33 +298,28 @@ class TraceClient {
|
|
|
223
298
|
span_id: currentEntry.span_id,
|
|
224
299
|
output,
|
|
225
300
|
function: currentEntry.function,
|
|
226
|
-
depth: currentEntry.
|
|
227
|
-
|
|
301
|
+
depth: this._spanDepths[currentEntry.span_id],
|
|
302
|
+
created_at: Date.now() / 1000,
|
|
303
|
+
span_type: currentEntry.span_type,
|
|
304
|
+
message: `Output from ${currentEntry.function}`
|
|
228
305
|
});
|
|
229
306
|
}
|
|
230
307
|
recordError(error) {
|
|
231
|
-
var _a;
|
|
232
308
|
const traceClientContext = getTraceClientContext();
|
|
233
309
|
const currentEntry = traceClientContext.entryStack.at(-1);
|
|
234
|
-
if (!currentEntry) {
|
|
310
|
+
if (!currentEntry || !currentEntry.span_id) {
|
|
235
311
|
console.warn(`No current entry to record error to\nStack trace: ${new Error().stack}`);
|
|
236
312
|
return;
|
|
237
313
|
}
|
|
238
|
-
let output = error;
|
|
239
|
-
if (error instanceof Error) {
|
|
240
|
-
output = {
|
|
241
|
-
name: error.name,
|
|
242
|
-
message: error.message,
|
|
243
|
-
stack: (_a = error.stack) === null || _a === void 0 ? void 0 : _a.substring(0, 1000)
|
|
244
|
-
};
|
|
245
|
-
}
|
|
246
314
|
this.addEntry({
|
|
247
315
|
type: 'error',
|
|
248
316
|
span_id: currentEntry.span_id,
|
|
249
|
-
output,
|
|
317
|
+
output: error,
|
|
250
318
|
function: currentEntry.function,
|
|
251
|
-
depth: currentEntry.
|
|
252
|
-
|
|
319
|
+
depth: this._spanDepths[currentEntry.span_id],
|
|
320
|
+
created_at: Date.now() / 1000,
|
|
321
|
+
span_type: currentEntry.span_type,
|
|
322
|
+
message: `Error from ${currentEntry.function}`
|
|
253
323
|
});
|
|
254
324
|
}
|
|
255
325
|
startSpan(name, options = {}) {
|
|
@@ -260,18 +330,20 @@ class TraceClient {
|
|
|
260
330
|
const spanType = (_a = options.spanType) !== null && _a !== void 0 ? _a : 'span';
|
|
261
331
|
const startTime = Date.now() / 1000;
|
|
262
332
|
let depth = 0, parentSpanId = undefined;
|
|
263
|
-
if (parentEntry) {
|
|
264
|
-
depth = parentEntry.
|
|
333
|
+
if (parentEntry && parentEntry.span_id) {
|
|
334
|
+
depth = this._spanDepths[parentEntry.span_id] + 1;
|
|
265
335
|
parentSpanId = parentEntry.span_id;
|
|
266
336
|
}
|
|
337
|
+
this._spanDepths[spanId] = depth;
|
|
267
338
|
const entry = {
|
|
268
339
|
type: 'enter',
|
|
269
340
|
function: name,
|
|
270
341
|
span_id: spanId,
|
|
271
342
|
depth: depth,
|
|
272
|
-
|
|
343
|
+
created_at: startTime,
|
|
273
344
|
span_type: spanType,
|
|
274
|
-
parent_span_id: parentSpanId
|
|
345
|
+
parent_span_id: parentSpanId,
|
|
346
|
+
message: name
|
|
275
347
|
};
|
|
276
348
|
this.addEntry(entry);
|
|
277
349
|
traceClientContext.entryStack.push(entry);
|
|
@@ -279,21 +351,24 @@ class TraceClient {
|
|
|
279
351
|
endSpan() {
|
|
280
352
|
const traceClientContext = getTraceClientContext();
|
|
281
353
|
const enterEntry = traceClientContext.entryStack.pop();
|
|
282
|
-
if (!enterEntry) {
|
|
354
|
+
if (!enterEntry || !enterEntry.span_id) {
|
|
283
355
|
console.warn("No enter entry to end");
|
|
284
356
|
return;
|
|
285
357
|
}
|
|
286
358
|
const endTime = Date.now() / 1000;
|
|
287
|
-
const duration = endTime - enterEntry.
|
|
359
|
+
const duration = endTime - enterEntry.created_at;
|
|
288
360
|
this.addEntry({
|
|
289
361
|
type: 'exit',
|
|
290
362
|
function: enterEntry.function,
|
|
291
363
|
span_id: enterEntry.span_id,
|
|
292
|
-
depth: enterEntry.
|
|
293
|
-
|
|
364
|
+
depth: this._spanDepths[enterEntry.span_id],
|
|
365
|
+
created_at: endTime,
|
|
294
366
|
duration: duration,
|
|
295
|
-
span_type: enterEntry.span_type
|
|
367
|
+
span_type: enterEntry.span_type,
|
|
368
|
+
message: `← ${enterEntry.function}`
|
|
296
369
|
});
|
|
370
|
+
// Clean up depth tracking
|
|
371
|
+
delete this._spanDepths[enterEntry.span_id];
|
|
297
372
|
}
|
|
298
373
|
*span(name, options = {}) {
|
|
299
374
|
if (!this.enableMonitoring) {
|
|
@@ -311,6 +386,7 @@ class TraceClient {
|
|
|
311
386
|
condenseTrace(rawEntries) {
|
|
312
387
|
var _a, _b, _c, _d, _e;
|
|
313
388
|
const spansById = {};
|
|
389
|
+
const allEvaluationRuns = [];
|
|
314
390
|
for (const entry of rawEntries) {
|
|
315
391
|
const spanId = entry.span_id;
|
|
316
392
|
if (!spanId)
|
|
@@ -320,12 +396,12 @@ class TraceClient {
|
|
|
320
396
|
span_id: spanId,
|
|
321
397
|
function: entry.function || 'unknown',
|
|
322
398
|
depth: (_a = entry.depth) !== null && _a !== void 0 ? _a : 0,
|
|
323
|
-
|
|
399
|
+
created_at: new Date(((_b = entry.created_at) !== null && _b !== void 0 ? _b : 0) * 1000).toISOString(), // Convert number to ISO string
|
|
400
|
+
trace_id: this.traceId, // Add trace_id
|
|
324
401
|
parent_span_id: entry.parent_span_id,
|
|
325
402
|
span_type: entry.span_type || 'span',
|
|
326
403
|
inputs: null,
|
|
327
404
|
output: null,
|
|
328
|
-
evaluation_runs: [],
|
|
329
405
|
duration: null,
|
|
330
406
|
children: []
|
|
331
407
|
};
|
|
@@ -335,14 +411,14 @@ class TraceClient {
|
|
|
335
411
|
case 'enter':
|
|
336
412
|
currentSpanData.function = entry.function || currentSpanData.function;
|
|
337
413
|
currentSpanData.depth = (_c = entry.depth) !== null && _c !== void 0 ? _c : currentSpanData.depth;
|
|
338
|
-
currentSpanData.
|
|
414
|
+
currentSpanData.created_at = new Date(((_d = entry.created_at) !== null && _d !== void 0 ? _d : 0) * 1000).toISOString(); // Ensure created_at is string on update
|
|
339
415
|
currentSpanData.parent_span_id = entry.parent_span_id;
|
|
340
416
|
currentSpanData.span_type = entry.span_type || currentSpanData.span_type;
|
|
341
|
-
currentSpanData.start_time = entry.
|
|
417
|
+
currentSpanData.start_time = entry.created_at; // Keep original number for duration calc
|
|
342
418
|
break;
|
|
343
419
|
case 'exit':
|
|
344
420
|
currentSpanData.duration = (_e = entry.duration) !== null && _e !== void 0 ? _e : currentSpanData.duration;
|
|
345
|
-
currentSpanData.end_time = entry.
|
|
421
|
+
currentSpanData.end_time = entry.created_at; // Keep original number for duration calc
|
|
346
422
|
if (currentSpanData.duration === null && currentSpanData.start_time && currentSpanData.end_time) {
|
|
347
423
|
currentSpanData.duration = currentSpanData.end_time - currentSpanData.start_time;
|
|
348
424
|
}
|
|
@@ -358,10 +434,8 @@ class TraceClient {
|
|
|
358
434
|
case 'output':
|
|
359
435
|
case 'error':
|
|
360
436
|
currentSpanData.output = entry.output;
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
if (entry.evaluation_runs) {
|
|
364
|
-
currentSpanData.evaluation_runs.push(...entry.evaluation_runs);
|
|
437
|
+
if (entry.type === 'output' && entry.output && typeof entry.output === 'object' && 'eval_name' in entry.output && 'scorers' in entry.output && 'trace_span_id' in entry.output) {
|
|
438
|
+
allEvaluationRuns.push(entry.output);
|
|
365
439
|
}
|
|
366
440
|
break;
|
|
367
441
|
}
|
|
@@ -392,9 +466,11 @@ class TraceClient {
|
|
|
392
466
|
childrenMap[parentId].push(span);
|
|
393
467
|
}
|
|
394
468
|
}
|
|
395
|
-
|
|
469
|
+
// Sort using parsed dates
|
|
470
|
+
roots.sort((a, b) => Date.parse(a.created_at) - Date.parse(b.created_at));
|
|
396
471
|
for (const parentId in childrenMap) {
|
|
397
|
-
|
|
472
|
+
// Sort using parsed dates
|
|
473
|
+
childrenMap[parentId].sort((a, b) => Date.parse(a.created_at) - Date.parse(b.created_at));
|
|
398
474
|
}
|
|
399
475
|
function buildFlatListDfs(span) {
|
|
400
476
|
if (visited.has(span.span_id))
|
|
@@ -415,26 +491,36 @@ class TraceClient {
|
|
|
415
491
|
buildFlatListDfs(span);
|
|
416
492
|
}
|
|
417
493
|
}
|
|
418
|
-
return sortedCondensedList;
|
|
494
|
+
return [sortedCondensedList, allEvaluationRuns];
|
|
419
495
|
}
|
|
420
496
|
save() {
|
|
421
497
|
return __awaiter(this, arguments, void 0, function* (emptySave = false) {
|
|
498
|
+
var _a, _b, _c, _d, _e;
|
|
422
499
|
if (!this.enableMonitoring || !this.traceManager) {
|
|
423
500
|
return null;
|
|
424
501
|
}
|
|
425
502
|
const traceClientContext = getTraceClientContext();
|
|
426
503
|
const totalDuration = this.getDuration();
|
|
427
|
-
|
|
504
|
+
// Use the tuple returned by condenseTrace
|
|
505
|
+
const [condensedEntries, evaluationRuns] = this.condenseTrace(traceClientContext.entries);
|
|
428
506
|
const tokenCounts = {
|
|
429
|
-
prompt_tokens: 0,
|
|
430
|
-
|
|
507
|
+
prompt_tokens: 0,
|
|
508
|
+
completion_tokens: 0,
|
|
509
|
+
total_tokens: 0,
|
|
510
|
+
prompt_tokens_cost_usd: 0.0,
|
|
511
|
+
completion_tokens_cost_usd: 0.0,
|
|
512
|
+
total_cost_usd: 0.0
|
|
431
513
|
};
|
|
432
|
-
|
|
433
|
-
|
|
514
|
+
// First pass: collect all LLM calls with their token counts
|
|
515
|
+
const llmCalls = [];
|
|
516
|
+
let index = 0;
|
|
517
|
+
for (const entry of condensedEntries) {
|
|
434
518
|
if (entry.span_type === 'llm' && ((_a = entry.output) === null || _a === void 0 ? void 0 : _a.usage)) {
|
|
435
519
|
const usage = entry.output.usage;
|
|
520
|
+
const modelName = ((_b = entry.inputs) === null || _b === void 0 ? void 0 : _b.model) || "";
|
|
436
521
|
let promptTokens = 0;
|
|
437
522
|
let completionTokens = 0;
|
|
523
|
+
// Handle different token naming conventions
|
|
438
524
|
if (usage.prompt_tokens !== undefined || usage.completion_tokens !== undefined) {
|
|
439
525
|
promptTokens = usage.prompt_tokens || 0;
|
|
440
526
|
completionTokens = usage.completion_tokens || 0;
|
|
@@ -442,6 +528,7 @@ class TraceClient {
|
|
|
442
528
|
else if (usage.input_tokens !== undefined || usage.output_tokens !== undefined) {
|
|
443
529
|
promptTokens = usage.input_tokens || 0;
|
|
444
530
|
completionTokens = usage.output_tokens || 0;
|
|
531
|
+
// Standardize naming
|
|
445
532
|
usage.prompt_tokens = promptTokens;
|
|
446
533
|
usage.completion_tokens = completionTokens;
|
|
447
534
|
delete usage.input_tokens;
|
|
@@ -450,33 +537,82 @@ class TraceClient {
|
|
|
450
537
|
tokenCounts.prompt_tokens += promptTokens;
|
|
451
538
|
tokenCounts.completion_tokens += completionTokens;
|
|
452
539
|
tokenCounts.total_tokens += usage.total_tokens || (promptTokens + completionTokens);
|
|
453
|
-
|
|
540
|
+
// Add to list of calls for cost calculation
|
|
454
541
|
if (modelName) {
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
542
|
+
llmCalls.push({
|
|
543
|
+
modelName,
|
|
544
|
+
promptTokens,
|
|
545
|
+
completionTokens,
|
|
546
|
+
entryIndex: index
|
|
547
|
+
});
|
|
548
|
+
}
|
|
549
|
+
}
|
|
550
|
+
index++;
|
|
551
|
+
}
|
|
552
|
+
// Second pass: calculate costs for each LLM call using the API
|
|
553
|
+
if (this.traceManager && llmCalls.length > 0) {
|
|
554
|
+
// Process each LLM call
|
|
555
|
+
for (const call of llmCalls) {
|
|
556
|
+
try {
|
|
557
|
+
// Get costs from the API
|
|
558
|
+
const costs = yield this.traceManager.calculateTokenCosts(call.modelName, call.promptTokens, call.completionTokens);
|
|
559
|
+
if (costs) {
|
|
560
|
+
// Update the entry with the costs
|
|
561
|
+
const entry = condensedEntries[call.entryIndex];
|
|
562
|
+
// Ensure output and usage objects exist before assigning costs
|
|
563
|
+
if (entry.output && entry.output.usage) {
|
|
564
|
+
// --- This part assigns costs to the individual span ---
|
|
565
|
+
entry.output.usage.prompt_tokens_cost_usd = costs.prompt_tokens_cost_usd;
|
|
566
|
+
entry.output.usage.completion_tokens_cost_usd = costs.completion_tokens_cost_usd;
|
|
567
|
+
entry.output.usage.total_cost_usd = costs.total_cost_usd;
|
|
568
|
+
logger_instance_js_1.default.debug(`Assigned costs to span ${entry.span_id} (model: ${call.modelName})`, { costs }); // Added debug log
|
|
569
|
+
// -----------------------------------------------------
|
|
570
|
+
}
|
|
571
|
+
else {
|
|
572
|
+
logger_instance_js_1.default.warn(`Could not assign costs to span ${entry.span_id} (model: ${call.modelName}): Missing 'output' or 'output.usage' object.`, { output: entry.output }); // Log if structure is missing
|
|
573
|
+
}
|
|
574
|
+
// Add to the total costs for the trace
|
|
575
|
+
tokenCounts.prompt_tokens_cost_usd += (_c = costs.prompt_tokens_cost_usd) !== null && _c !== void 0 ? _c : 0.0;
|
|
576
|
+
tokenCounts.completion_tokens_cost_usd += (_d = costs.completion_tokens_cost_usd) !== null && _d !== void 0 ? _d : 0.0;
|
|
577
|
+
tokenCounts.total_cost_usd += (_e = costs.total_cost_usd) !== null && _e !== void 0 ? _e : 0.0;
|
|
465
578
|
}
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
usage
|
|
470
|
-
|
|
579
|
+
else {
|
|
580
|
+
// If calculation failed, set costs to null in the entry (matching Python behavior)
|
|
581
|
+
const entry = condensedEntries[call.entryIndex];
|
|
582
|
+
// Ensure output and usage objects exist before assigning null costs
|
|
583
|
+
if (entry.output && entry.output.usage) {
|
|
584
|
+
// --- Sets null costs on the individual span ---
|
|
585
|
+
entry.output.usage.prompt_tokens_cost_usd = null;
|
|
586
|
+
entry.output.usage.completion_tokens_cost_usd = null;
|
|
587
|
+
entry.output.usage.total_cost_usd = null;
|
|
588
|
+
// ------------------------------------------
|
|
589
|
+
}
|
|
590
|
+
else {
|
|
591
|
+
// Log if we can't even assign null because the structure is missing
|
|
592
|
+
logger_instance_js_1.default.warn(`Could not assign null costs to span ${entry.span_id} (model: ${call.modelName}): Missing 'output' or 'output.usage' object.`, { output: entry.output });
|
|
593
|
+
}
|
|
594
|
+
logger_instance_js_1.default.warn(`Token cost calculation failed for model '${call.modelName}'. Cost information will be null for this span.`); // More specific warning
|
|
471
595
|
}
|
|
472
596
|
}
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
597
|
+
catch (e) {
|
|
598
|
+
logger_instance_js_1.default.warn(`Error during cost calculation loop for model '${call.modelName}':`, e); // Adjusted logging
|
|
599
|
+
// Set costs to null in the entry if an error occurs during the loop iteration
|
|
600
|
+
const entry = condensedEntries[call.entryIndex];
|
|
601
|
+
// Ensure output and usage objects exist before assigning null costs on error
|
|
602
|
+
if (entry.output && entry.output.usage) {
|
|
603
|
+
// --- Sets null costs on the individual span on error ---
|
|
604
|
+
entry.output.usage.prompt_tokens_cost_usd = null;
|
|
605
|
+
entry.output.usage.completion_tokens_cost_usd = null;
|
|
606
|
+
entry.output.usage.total_cost_usd = null;
|
|
607
|
+
// ----------------------------------------------------
|
|
608
|
+
}
|
|
609
|
+
else {
|
|
610
|
+
// Log if we can't assign null on error because the structure is missing
|
|
611
|
+
logger_instance_js_1.default.warn(`Could not assign null costs to span ${entry.span_id} (model: ${call.modelName}) on error: Missing 'output' or 'output.usage' object.`, { output: entry.output });
|
|
612
|
+
}
|
|
477
613
|
}
|
|
478
614
|
}
|
|
479
|
-
}
|
|
615
|
+
}
|
|
480
616
|
// Convert rules array to a dictionary (Record<string, Rule>)
|
|
481
617
|
const rulesDict = {};
|
|
482
618
|
this.rules.forEach(rule => {
|
|
@@ -493,16 +629,15 @@ class TraceClient {
|
|
|
493
629
|
duration: totalDuration,
|
|
494
630
|
token_counts: tokenCounts,
|
|
495
631
|
entries: condensedEntries,
|
|
496
|
-
|
|
497
|
-
empty_save: emptySave,
|
|
632
|
+
evaluation_runs: evaluationRuns,
|
|
498
633
|
overwrite: this.overwrite,
|
|
499
634
|
parent_trace_id: this.parentTraceId,
|
|
500
635
|
parent_name: this.parentName
|
|
501
636
|
};
|
|
502
637
|
try {
|
|
503
|
-
yield this.traceManager.saveTrace(traceData
|
|
638
|
+
yield this.traceManager.saveTrace(traceData);
|
|
504
639
|
logger_instance_js_1.default.info(`Trace ${this.traceId} saved successfully.`);
|
|
505
|
-
if (
|
|
640
|
+
if (this.enableEvaluations) {
|
|
506
641
|
try {
|
|
507
642
|
yield this.traceManager.addTraceToEvalQueue(traceData);
|
|
508
643
|
logger_instance_js_1.default.info(`Trace ${this.traceId} added to evaluation queue.`);
|
|
@@ -536,7 +671,7 @@ class TraceClient {
|
|
|
536
671
|
traceClientContext.entries.forEach(entry => {
|
|
537
672
|
var _a;
|
|
538
673
|
const indent = " ".repeat((_a = entry.depth) !== null && _a !== void 0 ? _a : 0);
|
|
539
|
-
const timeStr = entry.
|
|
674
|
+
const timeStr = entry.created_at ? `@ ${new Date(entry.created_at * 1000).toISOString()}` : '';
|
|
540
675
|
const shortSpanId = entry.span_id ? `(id: ${entry.span_id.substring(0, 8)}...)` : '';
|
|
541
676
|
const shortParentId = entry.parent_span_id ? `(parent: ${entry.parent_span_id.substring(0, 8)}...)` : '';
|
|
542
677
|
try {
|
|
@@ -567,17 +702,6 @@ class TraceClient {
|
|
|
567
702
|
// Keep console.log
|
|
568
703
|
console.log(`${indent} ${prefix} (for ${shortSpanId}): ${outputStr || 'null'}`);
|
|
569
704
|
break;
|
|
570
|
-
case 'evaluation':
|
|
571
|
-
let evalStr = JSON.stringify(entry.evaluation_runs);
|
|
572
|
-
if (evalStr && evalStr.length > 200) {
|
|
573
|
-
evalStr = evalStr.substring(0, 197) + '...';
|
|
574
|
-
}
|
|
575
|
-
// Keep console.log
|
|
576
|
-
console.log(`${indent} Evaluation (for ${shortSpanId}): ${evalStr || '[]'}`);
|
|
577
|
-
break;
|
|
578
|
-
default:
|
|
579
|
-
// Keep console.log
|
|
580
|
-
console.log(`${indent}? Unknown entry type: ${JSON.stringify(entry)}`);
|
|
581
705
|
}
|
|
582
706
|
}
|
|
583
707
|
catch (stringifyError) {
|
|
@@ -617,9 +741,7 @@ class TraceClient {
|
|
|
617
741
|
* @returns Promise that resolves when the evaluation entry has been added to the trace
|
|
618
742
|
*/
|
|
619
743
|
asyncEvaluate(scorers_1) {
|
|
620
|
-
return __awaiter(this, arguments, void 0, function* (
|
|
621
|
-
// Accept general Scorer type, but filter/check for API scorers internally
|
|
622
|
-
scorers, options = {}) {
|
|
744
|
+
return __awaiter(this, arguments, void 0, function* (scorers, options = {}) {
|
|
623
745
|
if (!this.enableEvaluations) {
|
|
624
746
|
logger_instance_js_1.default.warn("Evaluations are disabled. Skipping async evaluation.");
|
|
625
747
|
return;
|
|
@@ -634,7 +756,12 @@ class TraceClient {
|
|
|
634
756
|
logger_instance_js_1.default.warn("No APIJudgmentScorers found in the provided scorers list. Skipping async evaluation as backend requires API scorers.");
|
|
635
757
|
return;
|
|
636
758
|
}
|
|
637
|
-
|
|
759
|
+
// Process rules (currently just using this.rules directly)
|
|
760
|
+
const loadedRules = this.rules; // TODO: Add ScorerWrapper-like processing if needed in TS
|
|
761
|
+
// Prevent using JudgevalScorer with rules - only APIJudgmentScorer allowed with rules
|
|
762
|
+
if (loadedRules && loadedRules.length > 0 && scorers.some(s => !(s instanceof base_scorer_js_1.APIJudgmentScorer))) {
|
|
763
|
+
throw new Error("Cannot use Judgeval scorers, you can only use API scorers when using rules. Please either remove rules or use only APIJudgmentScorer types.");
|
|
764
|
+
}
|
|
638
765
|
// Create example structure matching Python/backend expectations
|
|
639
766
|
const example = {
|
|
640
767
|
input: options.input || "",
|
|
@@ -654,6 +781,7 @@ class TraceClient {
|
|
|
654
781
|
logger_instance_js_1.default.warn(`No current entry to record evaluation to\nStack trace: ${new Error().stack}`);
|
|
655
782
|
return;
|
|
656
783
|
}
|
|
784
|
+
const currentSpanId = currentEntry.span_id; // Get the span ID
|
|
657
785
|
// --- Create evaluation run name (similar to Python) ---
|
|
658
786
|
// Capitalize scorer names
|
|
659
787
|
const scorerNames = apiScorers.map(scorer => {
|
|
@@ -663,11 +791,9 @@ class TraceClient {
|
|
|
663
791
|
return name.charAt(0).toUpperCase() + name.slice(1);
|
|
664
792
|
}).join(',');
|
|
665
793
|
// Use trace name and shortened span ID (or trace ID if no span)
|
|
666
|
-
const idPart =
|
|
794
|
+
const idPart = currentSpanId ? currentSpanId.substring(0, 8) : this.traceId.substring(0, 8);
|
|
667
795
|
const evalName = `${this.name.charAt(0).toUpperCase() + this.name.slice(1)}-${idPart}-[${scorerNames}]`;
|
|
668
796
|
// --- End eval name creation ---
|
|
669
|
-
// Process rules (currently just using this.rules directly)
|
|
670
|
-
const loadedRules = this.rules; // TODO: Add ScorerWrapper-like processing if needed in TS
|
|
671
797
|
// Construct the evaluation payload
|
|
672
798
|
const evalRunPayload = {
|
|
673
799
|
organization_id: this.organizationId,
|
|
@@ -680,47 +806,18 @@ class TraceClient {
|
|
|
680
806
|
metadata: {}, // Matches Python tracer
|
|
681
807
|
judgment_api_key: this.apiKey,
|
|
682
808
|
override: this.overwrite, // Use trace's overwrite setting
|
|
683
|
-
rules: loadedRules // Pass the processed rules
|
|
809
|
+
rules: loadedRules, // Pass the processed rules
|
|
810
|
+
trace_span_id: currentSpanId // <<< RENAMED: Assign the current span ID (matching backend)
|
|
684
811
|
};
|
|
685
|
-
// Add evaluation entry
|
|
686
|
-
this.
|
|
812
|
+
// Add evaluation entry to the trace
|
|
813
|
+
this.recordOutput(evalRunPayload);
|
|
687
814
|
}
|
|
688
815
|
catch (error) {
|
|
689
|
-
|
|
690
|
-
|
|
816
|
+
logger_instance_js_1.default.error(`Failed during asyncEvaluate execution: ${error instanceof Error ? error.message : String(error)}`);
|
|
817
|
+
throw error; // Re-throw after logging
|
|
691
818
|
}
|
|
692
819
|
});
|
|
693
820
|
}
|
|
694
|
-
/**
|
|
695
|
-
* Private helper to add an evaluation entry to the trace.
|
|
696
|
-
* This mirrors the structure of Python's add_eval_run.
|
|
697
|
-
*
|
|
698
|
-
* @param evalRunPayload The constructed payload for the evaluation.
|
|
699
|
-
* @param startTime The start time (in seconds) of the evaluation process.
|
|
700
|
-
*/
|
|
701
|
-
_addEvalRun(evalRunPayload, startTime) {
|
|
702
|
-
var _a, _b;
|
|
703
|
-
const traceClientContext = getTraceClientContext();
|
|
704
|
-
const currentEntry = traceClientContext.entryStack.at(-1);
|
|
705
|
-
if (!currentEntry) {
|
|
706
|
-
logger_instance_js_1.default.warn(`No current entry to record evaluation to\nStack trace: ${new Error().stack}`);
|
|
707
|
-
return;
|
|
708
|
-
}
|
|
709
|
-
const function_ = (_a = currentEntry.function) !== null && _a !== void 0 ? _a : "unknown_function";
|
|
710
|
-
const depth = (_b = currentEntry.depth) !== null && _b !== void 0 ? _b : 0;
|
|
711
|
-
const duration = Date.now() / 1000 - startTime;
|
|
712
|
-
// Add evaluation entry to the trace
|
|
713
|
-
this.addEntry({
|
|
714
|
-
type: "evaluation",
|
|
715
|
-
function: function_,
|
|
716
|
-
span_id: currentEntry.span_id, // May be undefined
|
|
717
|
-
depth: depth,
|
|
718
|
-
timestamp: Date.now() / 1000,
|
|
719
|
-
evaluation_runs: [evalRunPayload], // Embed the payload
|
|
720
|
-
duration: duration,
|
|
721
|
-
span_type: "evaluation"
|
|
722
|
-
});
|
|
723
|
-
}
|
|
724
821
|
// OPTIONAL: Add a method to get the original name if needed elsewhere
|
|
725
822
|
getOriginalName() {
|
|
726
823
|
return this.originalName;
|
|
@@ -802,11 +899,6 @@ class Tracer {
|
|
|
802
899
|
apiKey: this.apiKey,
|
|
803
900
|
organizationId: this.organizationId,
|
|
804
901
|
});
|
|
805
|
-
if (traceClient.enableMonitoring) {
|
|
806
|
-
traceClient.save(true).catch(err => {
|
|
807
|
-
logger_instance_js_1.default.error(`Failed to save empty trace (${traceClient.traceId}):`, err);
|
|
808
|
-
});
|
|
809
|
-
}
|
|
810
902
|
return traceClient;
|
|
811
903
|
}
|
|
812
904
|
*trace(name, options = {}) {
|