judgeval 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +95 -68
- package/dist/cjs/common/logger-instance.js +17 -19
- package/dist/cjs/common/logger-instance.js.map +1 -1
- package/dist/cjs/common/tracer.js +210 -126
- package/dist/cjs/common/tracer.js.map +1 -1
- package/dist/cjs/constants.js +3 -2
- package/dist/cjs/constants.js.map +1 -1
- package/dist/cjs/index.js +1 -3
- package/dist/cjs/index.js.map +1 -1
- package/dist/cjs/judgment-client.js +20 -114
- package/dist/cjs/judgment-client.js.map +1 -1
- package/dist/cjs/scorers/api-scorer.js +56 -48
- package/dist/cjs/scorers/api-scorer.js.map +1 -1
- package/dist/cjs/scorers/base-scorer.js +66 -11
- package/dist/cjs/scorers/base-scorer.js.map +1 -1
- package/dist/esm/common/logger-instance.js +17 -19
- package/dist/esm/common/logger-instance.js.map +1 -1
- package/dist/esm/common/tracer.js +211 -127
- package/dist/esm/common/tracer.js.map +1 -1
- package/dist/esm/constants.js +2 -1
- package/dist/esm/constants.js.map +1 -1
- package/dist/esm/index.js +0 -1
- package/dist/esm/index.js.map +1 -1
- package/dist/esm/judgment-client.js +20 -114
- package/dist/esm/judgment-client.js.map +1 -1
- package/dist/esm/scorers/api-scorer.js +56 -48
- package/dist/esm/scorers/api-scorer.js.map +1 -1
- package/dist/esm/scorers/base-scorer.js +66 -11
- package/dist/esm/scorers/base-scorer.js.map +1 -1
- package/dist/types/common/tracer.d.ts +27 -13
- package/dist/types/constants.d.ts +2 -1
- package/dist/types/index.d.ts +0 -1
- package/dist/types/judgment-client.d.ts +0 -22
- package/dist/types/scorers/api-scorer.d.ts +15 -15
- package/dist/types/scorers/base-scorer.d.ts +53 -10
- package/package.json +10 -3
- package/dist/cjs/scorers/exact-match-scorer.js +0 -84
- package/dist/cjs/scorers/exact-match-scorer.js.map +0 -1
- package/dist/esm/scorers/exact-match-scorer.js +0 -80
- package/dist/esm/scorers/exact-match-scorer.js.map +0 -1
- package/dist/types/scorers/exact-match-scorer.d.ts +0 -10
|
@@ -14,7 +14,7 @@ import { AsyncLocalStorage } from 'async_hooks';
|
|
|
14
14
|
import OpenAI from 'openai';
|
|
15
15
|
import Anthropic from '@anthropic-ai/sdk';
|
|
16
16
|
// Local Imports
|
|
17
|
-
import { JUDGMENT_TRACES_SAVE_API_URL, JUDGMENT_TRACES_FETCH_API_URL, JUDGMENT_TRACES_DELETE_API_URL, JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL,
|
|
17
|
+
import { JUDGMENT_TRACES_SAVE_API_URL, JUDGMENT_TRACES_FETCH_API_URL, JUDGMENT_TRACES_DELETE_API_URL, JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL, JUDGMENT_CALCULATE_TOKEN_COSTS_API_URL,
|
|
18
18
|
// Add other necessary constants if needed
|
|
19
19
|
} from '../constants.js';
|
|
20
20
|
import { APIJudgmentScorer } from '../scorers/base-scorer.js';
|
|
@@ -70,16 +70,13 @@ class TraceManagerClient {
|
|
|
70
70
|
try {
|
|
71
71
|
// Use isomorphic fetch (available globally in modern Node.js and browsers)
|
|
72
72
|
const response = yield fetch(url, Object.assign(Object.assign({}, options), { headers: headers }));
|
|
73
|
-
if
|
|
74
|
-
|
|
75
|
-
console.error(`API Error (${response.status}) for ${options.method || 'GET'} ${url}: ${errorBody}`);
|
|
76
|
-
throw new Error(`Judgment API request failed: ${response.status} ${response.statusText} - ${errorBody}`);
|
|
77
|
-
}
|
|
73
|
+
// We will return the response object even if !response.ok
|
|
74
|
+
// The caller (e.g., saveTrace) is responsible for checking response.ok or response.status
|
|
78
75
|
// Handle cases where the response might be empty (e.g., 204 No Content on DELETE)
|
|
79
76
|
if (response.status === 204) {
|
|
80
77
|
return null; // Indicate success with no content
|
|
81
78
|
}
|
|
82
|
-
return
|
|
79
|
+
return response;
|
|
83
80
|
}
|
|
84
81
|
catch (error) {
|
|
85
82
|
console.error(`Network or fetch error during ${options.method || 'GET'} ${url}:`, error);
|
|
@@ -96,21 +93,52 @@ class TraceManagerClient {
|
|
|
96
93
|
});
|
|
97
94
|
});
|
|
98
95
|
}
|
|
99
|
-
saveTrace(traceData
|
|
96
|
+
saveTrace(traceData) {
|
|
100
97
|
return __awaiter(this, void 0, void 0, function* () {
|
|
98
|
+
// _fetch now returns the raw response object or throws on network error
|
|
101
99
|
const response = yield this._fetch(JUDGMENT_TRACES_SAVE_API_URL, {
|
|
102
100
|
method: 'POST',
|
|
103
|
-
body: JSON.stringify(traceData),
|
|
101
|
+
body: JSON.stringify(traceData), // Stringify directly here again
|
|
104
102
|
});
|
|
105
|
-
//
|
|
106
|
-
if (!
|
|
107
|
-
//
|
|
108
|
-
|
|
103
|
+
// Check if _fetch threw a network error (caught below) or returned an invalid object
|
|
104
|
+
if (!response) {
|
|
105
|
+
// This case should ideally be caught by _fetch's catch block, but double-check
|
|
106
|
+
throw new Error('Failed to save trace data: No response received from API.');
|
|
107
|
+
}
|
|
108
|
+
// Now, check the status code on the received response object
|
|
109
|
+
if (response.status === 400) {
|
|
110
|
+
// Attempt to get error body for more info
|
|
111
|
+
const errorBody = yield response.text();
|
|
112
|
+
throw new Error(`Failed to save trace data: Check your Trace name for conflicts, set overwrite=True to overwrite existing traces: ${response.status} ${response.statusText || ''} - ${errorBody}`);
|
|
113
|
+
}
|
|
114
|
+
else if (!response.ok) { // Handles other errors (5xx, 4xx except 400)
|
|
115
|
+
const errorBody = yield response.text();
|
|
116
|
+
throw new Error(`Failed to save trace data: Status ${response.status} ${response.statusText || '(No status text)'} - ${errorBody}`);
|
|
117
|
+
}
|
|
118
|
+
// --- Success Path ---
|
|
119
|
+
// Optionally log the UI URL (needs JSON parsing)
|
|
120
|
+
let responseData = null;
|
|
121
|
+
try {
|
|
122
|
+
// Handle 204 No Content specifically
|
|
123
|
+
if (response.status === 204) {
|
|
124
|
+
responseData = null; // Or maybe { success: true }?
|
|
125
|
+
}
|
|
126
|
+
else {
|
|
127
|
+
responseData = yield response.json(); // Parse JSON only on success
|
|
128
|
+
}
|
|
129
|
+
}
|
|
130
|
+
catch (parseError) {
|
|
131
|
+
logger.warn("Failed to parse successful API response JSON.", { error: parseError });
|
|
132
|
+
// Depending on requirements, maybe throw, maybe return a default success object
|
|
133
|
+
throw new Error(`API request succeeded (${response.status}), but failed to parse JSON response.`);
|
|
134
|
+
}
|
|
135
|
+
if (responseData === null || responseData === void 0 ? void 0 : responseData.ui_results_url) {
|
|
109
136
|
console.info(`
|
|
110
|
-
🔍 View trace: ${
|
|
137
|
+
🔍 View trace: ${responseData.ui_results_url}
|
|
111
138
|
`);
|
|
112
139
|
}
|
|
113
|
-
|
|
140
|
+
// Return the parsed data (or null for 204)
|
|
141
|
+
return responseData;
|
|
114
142
|
});
|
|
115
143
|
}
|
|
116
144
|
deleteTrace(traceId) {
|
|
@@ -139,6 +167,35 @@ class TraceManagerClient {
|
|
|
139
167
|
});
|
|
140
168
|
});
|
|
141
169
|
}
|
|
170
|
+
/**
|
|
171
|
+
* Calculate token costs directly using the API endpoint.
|
|
172
|
+
* This is more accurate than client-side calculation as it uses the most up-to-date pricing.
|
|
173
|
+
*
|
|
174
|
+
* @param model The model name (e.g. 'gpt-4', 'claude-3-opus-20240229')
|
|
175
|
+
* @param promptTokens Number of tokens in the prompt/input
|
|
176
|
+
* @param completionTokens Number of tokens in the completion/output
|
|
177
|
+
* @returns Object containing token counts and calculated costs in USD
|
|
178
|
+
*/
|
|
179
|
+
calculateTokenCosts(model, promptTokens, completionTokens) {
|
|
180
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
181
|
+
try {
|
|
182
|
+
// Use the new calculation endpoint
|
|
183
|
+
const result = yield this._fetch(JUDGMENT_CALCULATE_TOKEN_COSTS_API_URL, {
|
|
184
|
+
method: 'POST',
|
|
185
|
+
body: JSON.stringify({
|
|
186
|
+
model,
|
|
187
|
+
prompt_tokens: promptTokens,
|
|
188
|
+
completion_tokens: completionTokens
|
|
189
|
+
})
|
|
190
|
+
});
|
|
191
|
+
return result;
|
|
192
|
+
}
|
|
193
|
+
catch (error) {
|
|
194
|
+
logger.warn(`Failed to calculate token costs for model ${model}.`, { error: error instanceof Error ? error.message : String(error) });
|
|
195
|
+
return null;
|
|
196
|
+
}
|
|
197
|
+
});
|
|
198
|
+
}
|
|
142
199
|
}
|
|
143
200
|
// --- Helper Functions ---
|
|
144
201
|
// Helper function to sanitize names (e.g., replace spaces with underscores)
|
|
@@ -155,6 +212,7 @@ class TraceClient {
|
|
|
155
212
|
constructor(config) {
|
|
156
213
|
var _a, _b, _c, _d, _e;
|
|
157
214
|
this.traceManager = null; // Can be null if monitoring disabled
|
|
215
|
+
this._spanDepths = {}; // Track depth of active spans
|
|
158
216
|
this.traceId = config.traceId || uuidv4();
|
|
159
217
|
this.originalName = config.name || 'default_trace'; // Store original
|
|
160
218
|
this.name = sanitizeName(this.originalName); // Use sanitized name internally
|
|
@@ -193,7 +251,7 @@ class TraceClient {
|
|
|
193
251
|
recordInput(inputs) {
|
|
194
252
|
const traceClientContext = getTraceClientContext();
|
|
195
253
|
const currentEntry = traceClientContext.entryStack.at(-1);
|
|
196
|
-
if (!currentEntry) {
|
|
254
|
+
if (!currentEntry || !currentEntry.span_id) {
|
|
197
255
|
console.warn(`No current entry to record input to\nStack trace: ${new Error().stack}`);
|
|
198
256
|
return;
|
|
199
257
|
}
|
|
@@ -202,14 +260,16 @@ class TraceClient {
|
|
|
202
260
|
span_id: currentEntry.span_id,
|
|
203
261
|
inputs,
|
|
204
262
|
function: currentEntry.function,
|
|
205
|
-
depth: currentEntry.
|
|
206
|
-
|
|
263
|
+
depth: this._spanDepths[currentEntry.span_id],
|
|
264
|
+
created_at: Date.now() / 1000,
|
|
265
|
+
span_type: currentEntry.span_type,
|
|
266
|
+
message: `Inputs to ${currentEntry.function}`
|
|
207
267
|
});
|
|
208
268
|
}
|
|
209
269
|
recordOutput(output) {
|
|
210
270
|
const traceClientContext = getTraceClientContext();
|
|
211
271
|
const currentEntry = traceClientContext.entryStack.at(-1);
|
|
212
|
-
if (!currentEntry) {
|
|
272
|
+
if (!currentEntry || !currentEntry.span_id) {
|
|
213
273
|
console.warn(`No current entry to record output to\nStack trace: ${new Error().stack}`);
|
|
214
274
|
return;
|
|
215
275
|
}
|
|
@@ -218,33 +278,28 @@ class TraceClient {
|
|
|
218
278
|
span_id: currentEntry.span_id,
|
|
219
279
|
output,
|
|
220
280
|
function: currentEntry.function,
|
|
221
|
-
depth: currentEntry.
|
|
222
|
-
|
|
281
|
+
depth: this._spanDepths[currentEntry.span_id],
|
|
282
|
+
created_at: Date.now() / 1000,
|
|
283
|
+
span_type: currentEntry.span_type,
|
|
284
|
+
message: `Output from ${currentEntry.function}`
|
|
223
285
|
});
|
|
224
286
|
}
|
|
225
287
|
recordError(error) {
|
|
226
|
-
var _a;
|
|
227
288
|
const traceClientContext = getTraceClientContext();
|
|
228
289
|
const currentEntry = traceClientContext.entryStack.at(-1);
|
|
229
|
-
if (!currentEntry) {
|
|
290
|
+
if (!currentEntry || !currentEntry.span_id) {
|
|
230
291
|
console.warn(`No current entry to record error to\nStack trace: ${new Error().stack}`);
|
|
231
292
|
return;
|
|
232
293
|
}
|
|
233
|
-
let output = error;
|
|
234
|
-
if (error instanceof Error) {
|
|
235
|
-
output = {
|
|
236
|
-
name: error.name,
|
|
237
|
-
message: error.message,
|
|
238
|
-
stack: (_a = error.stack) === null || _a === void 0 ? void 0 : _a.substring(0, 1000)
|
|
239
|
-
};
|
|
240
|
-
}
|
|
241
294
|
this.addEntry({
|
|
242
295
|
type: 'error',
|
|
243
296
|
span_id: currentEntry.span_id,
|
|
244
|
-
output,
|
|
297
|
+
output: error,
|
|
245
298
|
function: currentEntry.function,
|
|
246
|
-
depth: currentEntry.
|
|
247
|
-
|
|
299
|
+
depth: this._spanDepths[currentEntry.span_id],
|
|
300
|
+
created_at: Date.now() / 1000,
|
|
301
|
+
span_type: currentEntry.span_type,
|
|
302
|
+
message: `Error from ${currentEntry.function}`
|
|
248
303
|
});
|
|
249
304
|
}
|
|
250
305
|
startSpan(name, options = {}) {
|
|
@@ -255,18 +310,20 @@ class TraceClient {
|
|
|
255
310
|
const spanType = (_a = options.spanType) !== null && _a !== void 0 ? _a : 'span';
|
|
256
311
|
const startTime = Date.now() / 1000;
|
|
257
312
|
let depth = 0, parentSpanId = undefined;
|
|
258
|
-
if (parentEntry) {
|
|
259
|
-
depth = parentEntry.
|
|
313
|
+
if (parentEntry && parentEntry.span_id) {
|
|
314
|
+
depth = this._spanDepths[parentEntry.span_id] + 1;
|
|
260
315
|
parentSpanId = parentEntry.span_id;
|
|
261
316
|
}
|
|
317
|
+
this._spanDepths[spanId] = depth;
|
|
262
318
|
const entry = {
|
|
263
319
|
type: 'enter',
|
|
264
320
|
function: name,
|
|
265
321
|
span_id: spanId,
|
|
266
322
|
depth: depth,
|
|
267
|
-
|
|
323
|
+
created_at: startTime,
|
|
268
324
|
span_type: spanType,
|
|
269
|
-
parent_span_id: parentSpanId
|
|
325
|
+
parent_span_id: parentSpanId,
|
|
326
|
+
message: name
|
|
270
327
|
};
|
|
271
328
|
this.addEntry(entry);
|
|
272
329
|
traceClientContext.entryStack.push(entry);
|
|
@@ -274,21 +331,24 @@ class TraceClient {
|
|
|
274
331
|
endSpan() {
|
|
275
332
|
const traceClientContext = getTraceClientContext();
|
|
276
333
|
const enterEntry = traceClientContext.entryStack.pop();
|
|
277
|
-
if (!enterEntry) {
|
|
334
|
+
if (!enterEntry || !enterEntry.span_id) {
|
|
278
335
|
console.warn("No enter entry to end");
|
|
279
336
|
return;
|
|
280
337
|
}
|
|
281
338
|
const endTime = Date.now() / 1000;
|
|
282
|
-
const duration = endTime - enterEntry.
|
|
339
|
+
const duration = endTime - enterEntry.created_at;
|
|
283
340
|
this.addEntry({
|
|
284
341
|
type: 'exit',
|
|
285
342
|
function: enterEntry.function,
|
|
286
343
|
span_id: enterEntry.span_id,
|
|
287
|
-
depth: enterEntry.
|
|
288
|
-
|
|
344
|
+
depth: this._spanDepths[enterEntry.span_id],
|
|
345
|
+
created_at: endTime,
|
|
289
346
|
duration: duration,
|
|
290
|
-
span_type: enterEntry.span_type
|
|
347
|
+
span_type: enterEntry.span_type,
|
|
348
|
+
message: `← ${enterEntry.function}`
|
|
291
349
|
});
|
|
350
|
+
// Clean up depth tracking
|
|
351
|
+
delete this._spanDepths[enterEntry.span_id];
|
|
292
352
|
}
|
|
293
353
|
*span(name, options = {}) {
|
|
294
354
|
if (!this.enableMonitoring) {
|
|
@@ -306,6 +366,7 @@ class TraceClient {
|
|
|
306
366
|
condenseTrace(rawEntries) {
|
|
307
367
|
var _a, _b, _c, _d, _e;
|
|
308
368
|
const spansById = {};
|
|
369
|
+
const allEvaluationRuns = []; // To collect all eval runs
|
|
309
370
|
for (const entry of rawEntries) {
|
|
310
371
|
const spanId = entry.span_id;
|
|
311
372
|
if (!spanId)
|
|
@@ -315,7 +376,8 @@ class TraceClient {
|
|
|
315
376
|
span_id: spanId,
|
|
316
377
|
function: entry.function || 'unknown',
|
|
317
378
|
depth: (_a = entry.depth) !== null && _a !== void 0 ? _a : 0,
|
|
318
|
-
|
|
379
|
+
created_at: new Date(((_b = entry.created_at) !== null && _b !== void 0 ? _b : 0) * 1000).toISOString(), // Convert number to ISO string
|
|
380
|
+
trace_id: this.traceId, // Add trace_id
|
|
319
381
|
parent_span_id: entry.parent_span_id,
|
|
320
382
|
span_type: entry.span_type || 'span',
|
|
321
383
|
inputs: null,
|
|
@@ -330,14 +392,14 @@ class TraceClient {
|
|
|
330
392
|
case 'enter':
|
|
331
393
|
currentSpanData.function = entry.function || currentSpanData.function;
|
|
332
394
|
currentSpanData.depth = (_c = entry.depth) !== null && _c !== void 0 ? _c : currentSpanData.depth;
|
|
333
|
-
currentSpanData.
|
|
395
|
+
currentSpanData.created_at = new Date(((_d = entry.created_at) !== null && _d !== void 0 ? _d : 0) * 1000).toISOString(); // Ensure created_at is string on update
|
|
334
396
|
currentSpanData.parent_span_id = entry.parent_span_id;
|
|
335
397
|
currentSpanData.span_type = entry.span_type || currentSpanData.span_type;
|
|
336
|
-
currentSpanData.start_time = entry.
|
|
398
|
+
currentSpanData.start_time = entry.created_at; // Keep original number for duration calc
|
|
337
399
|
break;
|
|
338
400
|
case 'exit':
|
|
339
401
|
currentSpanData.duration = (_e = entry.duration) !== null && _e !== void 0 ? _e : currentSpanData.duration;
|
|
340
|
-
currentSpanData.end_time = entry.
|
|
402
|
+
currentSpanData.end_time = entry.created_at; // Keep original number for duration calc
|
|
341
403
|
if (currentSpanData.duration === null && currentSpanData.start_time && currentSpanData.end_time) {
|
|
342
404
|
currentSpanData.duration = currentSpanData.end_time - currentSpanData.start_time;
|
|
343
405
|
}
|
|
@@ -355,8 +417,11 @@ class TraceClient {
|
|
|
355
417
|
currentSpanData.output = entry.output;
|
|
356
418
|
break;
|
|
357
419
|
case 'evaluation':
|
|
358
|
-
if
|
|
359
|
-
|
|
420
|
+
// Check if evaluation_runs is an array and has at least one element
|
|
421
|
+
if (Array.isArray(entry.evaluation_runs) && entry.evaluation_runs.length > 0) {
|
|
422
|
+
const evalPayload = entry.evaluation_runs[0]; // Extract the payload object
|
|
423
|
+
currentSpanData.evaluation_runs.push(evalPayload); // Add the object to the span's list
|
|
424
|
+
allEvaluationRuns.push(evalPayload); // Add the object to the central list
|
|
360
425
|
}
|
|
361
426
|
break;
|
|
362
427
|
}
|
|
@@ -387,9 +452,11 @@ class TraceClient {
|
|
|
387
452
|
childrenMap[parentId].push(span);
|
|
388
453
|
}
|
|
389
454
|
}
|
|
390
|
-
|
|
455
|
+
// Sort using parsed dates
|
|
456
|
+
roots.sort((a, b) => Date.parse(a.created_at) - Date.parse(b.created_at));
|
|
391
457
|
for (const parentId in childrenMap) {
|
|
392
|
-
|
|
458
|
+
// Sort using parsed dates
|
|
459
|
+
childrenMap[parentId].sort((a, b) => Date.parse(a.created_at) - Date.parse(b.created_at));
|
|
393
460
|
}
|
|
394
461
|
function buildFlatListDfs(span) {
|
|
395
462
|
if (visited.has(span.span_id))
|
|
@@ -410,26 +477,36 @@ class TraceClient {
|
|
|
410
477
|
buildFlatListDfs(span);
|
|
411
478
|
}
|
|
412
479
|
}
|
|
413
|
-
return sortedCondensedList;
|
|
480
|
+
return [sortedCondensedList, allEvaluationRuns]; // Return both
|
|
414
481
|
}
|
|
415
482
|
save() {
|
|
416
483
|
return __awaiter(this, arguments, void 0, function* (emptySave = false) {
|
|
484
|
+
var _a, _b, _c, _d, _e, _f, _g, _h;
|
|
417
485
|
if (!this.enableMonitoring || !this.traceManager) {
|
|
418
486
|
return null;
|
|
419
487
|
}
|
|
420
488
|
const traceClientContext = getTraceClientContext();
|
|
421
489
|
const totalDuration = this.getDuration();
|
|
422
|
-
|
|
490
|
+
// Use the tuple returned by condenseTrace
|
|
491
|
+
const [condensedEntries, evaluationRuns] = this.condenseTrace(traceClientContext.entries);
|
|
423
492
|
const tokenCounts = {
|
|
424
|
-
prompt_tokens: 0,
|
|
425
|
-
|
|
493
|
+
prompt_tokens: 0,
|
|
494
|
+
completion_tokens: 0,
|
|
495
|
+
total_tokens: 0,
|
|
496
|
+
prompt_tokens_cost_usd: 0.0,
|
|
497
|
+
completion_tokens_cost_usd: 0.0,
|
|
498
|
+
total_cost_usd: 0.0
|
|
426
499
|
};
|
|
427
|
-
|
|
428
|
-
|
|
500
|
+
// First pass: collect all LLM calls with their token counts
|
|
501
|
+
const llmCalls = [];
|
|
502
|
+
let index = 0;
|
|
503
|
+
for (const entry of condensedEntries) {
|
|
429
504
|
if (entry.span_type === 'llm' && ((_a = entry.output) === null || _a === void 0 ? void 0 : _a.usage)) {
|
|
430
505
|
const usage = entry.output.usage;
|
|
506
|
+
const modelName = ((_b = entry.inputs) === null || _b === void 0 ? void 0 : _b.model) || "";
|
|
431
507
|
let promptTokens = 0;
|
|
432
508
|
let completionTokens = 0;
|
|
509
|
+
// Handle different token naming conventions
|
|
433
510
|
if (usage.prompt_tokens !== undefined || usage.completion_tokens !== undefined) {
|
|
434
511
|
promptTokens = usage.prompt_tokens || 0;
|
|
435
512
|
completionTokens = usage.completion_tokens || 0;
|
|
@@ -437,6 +514,7 @@ class TraceClient {
|
|
|
437
514
|
else if (usage.input_tokens !== undefined || usage.output_tokens !== undefined) {
|
|
438
515
|
promptTokens = usage.input_tokens || 0;
|
|
439
516
|
completionTokens = usage.output_tokens || 0;
|
|
517
|
+
// Standardize naming
|
|
440
518
|
usage.prompt_tokens = promptTokens;
|
|
441
519
|
usage.completion_tokens = completionTokens;
|
|
442
520
|
delete usage.input_tokens;
|
|
@@ -445,33 +523,63 @@ class TraceClient {
|
|
|
445
523
|
tokenCounts.prompt_tokens += promptTokens;
|
|
446
524
|
tokenCounts.completion_tokens += completionTokens;
|
|
447
525
|
tokenCounts.total_tokens += usage.total_tokens || (promptTokens + completionTokens);
|
|
448
|
-
|
|
526
|
+
// Add to list of calls for cost calculation
|
|
449
527
|
if (modelName) {
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
528
|
+
llmCalls.push({
|
|
529
|
+
modelName,
|
|
530
|
+
promptTokens,
|
|
531
|
+
completionTokens,
|
|
532
|
+
entryIndex: index
|
|
533
|
+
});
|
|
534
|
+
}
|
|
535
|
+
}
|
|
536
|
+
index++;
|
|
537
|
+
}
|
|
538
|
+
// Second pass: calculate costs for each LLM call using the API
|
|
539
|
+
if (this.traceManager && llmCalls.length > 0) {
|
|
540
|
+
// Process each LLM call
|
|
541
|
+
for (const call of llmCalls) {
|
|
542
|
+
try {
|
|
543
|
+
// Get costs from the API
|
|
544
|
+
const costs = yield this.traceManager.calculateTokenCosts(call.modelName, call.promptTokens, call.completionTokens);
|
|
545
|
+
if (costs) {
|
|
546
|
+
// Update the entry with the costs
|
|
547
|
+
const entry = condensedEntries[call.entryIndex];
|
|
548
|
+
if ((_c = entry.output) === null || _c === void 0 ? void 0 : _c.usage) {
|
|
549
|
+
entry.output.usage.prompt_tokens_cost_usd = costs.prompt_tokens_cost_usd;
|
|
550
|
+
entry.output.usage.completion_tokens_cost_usd = costs.completion_tokens_cost_usd;
|
|
551
|
+
entry.output.usage.total_cost_usd = costs.total_cost_usd;
|
|
552
|
+
}
|
|
553
|
+
// Add to the total costs, ensuring values are numbers (default to 0)
|
|
554
|
+
tokenCounts.prompt_tokens_cost_usd += (_d = costs.prompt_tokens_cost_usd) !== null && _d !== void 0 ? _d : 0.0;
|
|
555
|
+
tokenCounts.completion_tokens_cost_usd += (_e = costs.completion_tokens_cost_usd) !== null && _e !== void 0 ? _e : 0.0;
|
|
556
|
+
tokenCounts.total_cost_usd += (_f = costs.total_cost_usd) !== null && _f !== void 0 ? _f : 0.0;
|
|
460
557
|
}
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
558
|
+
else {
|
|
559
|
+
// If calculation failed, set costs to null in the entry (matching Python behavior)
|
|
560
|
+
const entry = condensedEntries[call.entryIndex];
|
|
561
|
+
if ((_g = entry.output) === null || _g === void 0 ? void 0 : _g.usage) {
|
|
562
|
+
entry.output.usage.prompt_tokens_cost_usd = null;
|
|
563
|
+
entry.output.usage.completion_tokens_cost_usd = null;
|
|
564
|
+
entry.output.usage.total_cost_usd = null;
|
|
565
|
+
}
|
|
566
|
+
// Log warning, but totals remain 0 for this call
|
|
567
|
+
logger.warn(`Token cost calculation failed for model '${call.modelName}'. Cost information will not be available.`);
|
|
466
568
|
}
|
|
467
569
|
}
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
570
|
+
catch (e) {
|
|
571
|
+
logger.warn(`Error calculating cost for model '${call.modelName}':`, e);
|
|
572
|
+
// Set costs to null in the entry
|
|
573
|
+
const entry = condensedEntries[call.entryIndex];
|
|
574
|
+
if ((_h = entry.output) === null || _h === void 0 ? void 0 : _h.usage) {
|
|
575
|
+
entry.output.usage.prompt_tokens_cost_usd = null;
|
|
576
|
+
entry.output.usage.completion_tokens_cost_usd = null;
|
|
577
|
+
entry.output.usage.total_cost_usd = null;
|
|
578
|
+
}
|
|
579
|
+
// Totals remain unchanged (effectively adding 0)
|
|
472
580
|
}
|
|
473
581
|
}
|
|
474
|
-
}
|
|
582
|
+
}
|
|
475
583
|
// Convert rules array to a dictionary (Record<string, Rule>)
|
|
476
584
|
const rulesDict = {};
|
|
477
585
|
this.rules.forEach(rule => {
|
|
@@ -488,16 +596,15 @@ class TraceClient {
|
|
|
488
596
|
duration: totalDuration,
|
|
489
597
|
token_counts: tokenCounts,
|
|
490
598
|
entries: condensedEntries,
|
|
491
|
-
|
|
492
|
-
empty_save: emptySave,
|
|
599
|
+
evaluation_runs: evaluationRuns,
|
|
493
600
|
overwrite: this.overwrite,
|
|
494
601
|
parent_trace_id: this.parentTraceId,
|
|
495
602
|
parent_name: this.parentName
|
|
496
603
|
};
|
|
497
604
|
try {
|
|
498
|
-
yield this.traceManager.saveTrace(traceData
|
|
605
|
+
yield this.traceManager.saveTrace(traceData);
|
|
499
606
|
logger.info(`Trace ${this.traceId} saved successfully.`);
|
|
500
|
-
if (
|
|
607
|
+
if (this.enableEvaluations) {
|
|
501
608
|
try {
|
|
502
609
|
yield this.traceManager.addTraceToEvalQueue(traceData);
|
|
503
610
|
logger.info(`Trace ${this.traceId} added to evaluation queue.`);
|
|
@@ -531,7 +638,7 @@ class TraceClient {
|
|
|
531
638
|
traceClientContext.entries.forEach(entry => {
|
|
532
639
|
var _a;
|
|
533
640
|
const indent = " ".repeat((_a = entry.depth) !== null && _a !== void 0 ? _a : 0);
|
|
534
|
-
const timeStr = entry.
|
|
641
|
+
const timeStr = entry.created_at ? `@ ${new Date(entry.created_at * 1000).toISOString()}` : '';
|
|
535
642
|
const shortSpanId = entry.span_id ? `(id: ${entry.span_id.substring(0, 8)}...)` : '';
|
|
536
643
|
const shortParentId = entry.parent_span_id ? `(parent: ${entry.parent_span_id.substring(0, 8)}...)` : '';
|
|
537
644
|
try {
|
|
@@ -612,9 +719,8 @@ class TraceClient {
|
|
|
612
719
|
* @returns Promise that resolves when the evaluation entry has been added to the trace
|
|
613
720
|
*/
|
|
614
721
|
asyncEvaluate(scorers_1) {
|
|
615
|
-
return __awaiter(this, arguments, void 0, function* (
|
|
616
|
-
|
|
617
|
-
scorers, options = {}) {
|
|
722
|
+
return __awaiter(this, arguments, void 0, function* (scorers, options = {}) {
|
|
723
|
+
var _a;
|
|
618
724
|
if (!this.enableEvaluations) {
|
|
619
725
|
logger.warn("Evaluations are disabled. Skipping async evaluation.");
|
|
620
726
|
return;
|
|
@@ -629,6 +735,12 @@ class TraceClient {
|
|
|
629
735
|
logger.warn("No APIJudgmentScorers found in the provided scorers list. Skipping async evaluation as backend requires API scorers.");
|
|
630
736
|
return;
|
|
631
737
|
}
|
|
738
|
+
// Process rules (currently just using this.rules directly)
|
|
739
|
+
const loadedRules = this.rules; // TODO: Add ScorerWrapper-like processing if needed in TS
|
|
740
|
+
// Prevent using JudgevalScorer with rules - only APIJudgmentScorer allowed with rules
|
|
741
|
+
if (loadedRules && loadedRules.length > 0 && scorers.some(s => !(s instanceof APIJudgmentScorer))) {
|
|
742
|
+
throw new Error("Cannot use Judgeval scorers, you can only use API scorers when using rules. Please either remove rules or use only APIJudgmentScorer types.");
|
|
743
|
+
}
|
|
632
744
|
const startTime = Date.now() / 1000; // Record start time in seconds
|
|
633
745
|
// Create example structure matching Python/backend expectations
|
|
634
746
|
const example = {
|
|
@@ -661,8 +773,6 @@ class TraceClient {
|
|
|
661
773
|
const idPart = currentEntry ? currentEntry.span_id.substring(0, 8) : this.traceId.substring(0, 8);
|
|
662
774
|
const evalName = `${this.name.charAt(0).toUpperCase() + this.name.slice(1)}-${idPart}-[${scorerNames}]`;
|
|
663
775
|
// --- End eval name creation ---
|
|
664
|
-
// Process rules (currently just using this.rules directly)
|
|
665
|
-
const loadedRules = this.rules; // TODO: Add ScorerWrapper-like processing if needed in TS
|
|
666
776
|
// Construct the evaluation payload
|
|
667
777
|
const evalRunPayload = {
|
|
668
778
|
organization_id: this.organizationId,
|
|
@@ -677,45 +787,24 @@ class TraceClient {
|
|
|
677
787
|
override: this.overwrite, // Use trace's overwrite setting
|
|
678
788
|
rules: loadedRules // Pass the processed rules
|
|
679
789
|
};
|
|
680
|
-
// Add evaluation entry
|
|
681
|
-
this.
|
|
790
|
+
// Add evaluation entry to the trace
|
|
791
|
+
this.addEntry({
|
|
792
|
+
type: "evaluation",
|
|
793
|
+
function: currentEntry.function,
|
|
794
|
+
span_id: currentEntry.span_id, // May be undefined
|
|
795
|
+
depth: (_a = currentEntry.depth) !== null && _a !== void 0 ? _a : 0,
|
|
796
|
+
created_at: Date.now() / 1000,
|
|
797
|
+
evaluation_runs: [evalRunPayload], // Store the object back in an array to match interface
|
|
798
|
+
duration: Date.now() / 1000 - startTime,
|
|
799
|
+
span_type: currentEntry.span_type
|
|
800
|
+
});
|
|
682
801
|
}
|
|
683
802
|
catch (error) {
|
|
684
|
-
|
|
685
|
-
|
|
803
|
+
logger.error(`Failed during asyncEvaluate execution: ${error instanceof Error ? error.message : String(error)}`);
|
|
804
|
+
throw error; // Re-throw after logging
|
|
686
805
|
}
|
|
687
806
|
});
|
|
688
807
|
}
|
|
689
|
-
/**
|
|
690
|
-
* Private helper to add an evaluation entry to the trace.
|
|
691
|
-
* This mirrors the structure of Python's add_eval_run.
|
|
692
|
-
*
|
|
693
|
-
* @param evalRunPayload The constructed payload for the evaluation.
|
|
694
|
-
* @param startTime The start time (in seconds) of the evaluation process.
|
|
695
|
-
*/
|
|
696
|
-
_addEvalRun(evalRunPayload, startTime) {
|
|
697
|
-
var _a, _b;
|
|
698
|
-
const traceClientContext = getTraceClientContext();
|
|
699
|
-
const currentEntry = traceClientContext.entryStack.at(-1);
|
|
700
|
-
if (!currentEntry) {
|
|
701
|
-
logger.warn(`No current entry to record evaluation to\nStack trace: ${new Error().stack}`);
|
|
702
|
-
return;
|
|
703
|
-
}
|
|
704
|
-
const function_ = (_a = currentEntry.function) !== null && _a !== void 0 ? _a : "unknown_function";
|
|
705
|
-
const depth = (_b = currentEntry.depth) !== null && _b !== void 0 ? _b : 0;
|
|
706
|
-
const duration = Date.now() / 1000 - startTime;
|
|
707
|
-
// Add evaluation entry to the trace
|
|
708
|
-
this.addEntry({
|
|
709
|
-
type: "evaluation",
|
|
710
|
-
function: function_,
|
|
711
|
-
span_id: currentEntry.span_id, // May be undefined
|
|
712
|
-
depth: depth,
|
|
713
|
-
timestamp: Date.now() / 1000,
|
|
714
|
-
evaluation_runs: [evalRunPayload], // Embed the payload
|
|
715
|
-
duration: duration,
|
|
716
|
-
span_type: "evaluation"
|
|
717
|
-
});
|
|
718
|
-
}
|
|
719
808
|
// OPTIONAL: Add a method to get the original name if needed elsewhere
|
|
720
809
|
getOriginalName() {
|
|
721
810
|
return this.originalName;
|
|
@@ -796,11 +885,6 @@ class Tracer {
|
|
|
796
885
|
apiKey: this.apiKey,
|
|
797
886
|
organizationId: this.organizationId,
|
|
798
887
|
});
|
|
799
|
-
if (traceClient.enableMonitoring) {
|
|
800
|
-
traceClient.save(true).catch(err => {
|
|
801
|
-
logger.error(`Failed to save empty trace (${traceClient.traceId}):`, err);
|
|
802
|
-
});
|
|
803
|
-
}
|
|
804
888
|
return traceClient;
|
|
805
889
|
}
|
|
806
890
|
*trace(name, options = {}) {
|