judgeval 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +95 -68
- package/dist/cjs/common/tracer.js +235 -143
- package/dist/cjs/common/tracer.js.map +1 -1
- package/dist/cjs/constants.js +8 -5
- package/dist/cjs/constants.js.map +1 -1
- package/dist/cjs/data/datasets/eval-dataset-client.js +349 -0
- package/dist/cjs/data/datasets/eval-dataset-client.js.map +1 -0
- package/dist/cjs/data/datasets/eval-dataset.js +405 -0
- package/dist/cjs/data/datasets/eval-dataset.js.map +1 -0
- package/dist/cjs/data/example.js +22 -1
- package/dist/cjs/data/example.js.map +1 -1
- package/dist/cjs/e2etests/eval-operations.test.js +282 -0
- package/dist/cjs/e2etests/eval-operations.test.js.map +1 -0
- package/dist/cjs/e2etests/judgee-traces.test.js +278 -0
- package/dist/cjs/e2etests/judgee-traces.test.js.map +1 -0
- package/dist/cjs/index.js +1 -3
- package/dist/cjs/index.js.map +1 -1
- package/dist/cjs/judgment-client.js +326 -645
- package/dist/cjs/judgment-client.js.map +1 -1
- package/dist/cjs/scorers/api-scorer.js +56 -48
- package/dist/cjs/scorers/api-scorer.js.map +1 -1
- package/dist/cjs/scorers/base-scorer.js +66 -11
- package/dist/cjs/scorers/base-scorer.js.map +1 -1
- package/dist/esm/common/tracer.js +236 -144
- package/dist/esm/common/tracer.js.map +1 -1
- package/dist/esm/constants.js +7 -4
- package/dist/esm/constants.js.map +1 -1
- package/dist/esm/data/datasets/eval-dataset-client.js +342 -0
- package/dist/esm/data/datasets/eval-dataset-client.js.map +1 -0
- package/dist/esm/data/datasets/eval-dataset.js +375 -0
- package/dist/esm/data/datasets/eval-dataset.js.map +1 -0
- package/dist/esm/data/example.js +22 -1
- package/dist/esm/data/example.js.map +1 -1
- package/dist/esm/e2etests/eval-operations.test.js +254 -0
- package/dist/esm/e2etests/eval-operations.test.js.map +1 -0
- package/dist/esm/e2etests/judgee-traces.test.js +253 -0
- package/dist/esm/e2etests/judgee-traces.test.js.map +1 -0
- package/dist/esm/index.js +0 -1
- package/dist/esm/index.js.map +1 -1
- package/dist/esm/judgment-client.js +328 -647
- package/dist/esm/judgment-client.js.map +1 -1
- package/dist/esm/scorers/api-scorer.js +56 -48
- package/dist/esm/scorers/api-scorer.js.map +1 -1
- package/dist/esm/scorers/base-scorer.js +66 -11
- package/dist/esm/scorers/base-scorer.js.map +1 -1
- package/dist/types/common/tracer.d.ts +27 -14
- package/dist/types/constants.d.ts +4 -4
- package/dist/types/data/datasets/eval-dataset-client.d.ts +39 -0
- package/dist/types/data/datasets/eval-dataset.d.ts +45 -0
- package/dist/types/data/example.d.ts +24 -12
- package/dist/types/e2etests/eval-operations.test.d.ts +5 -0
- package/dist/types/e2etests/judgee-traces.test.d.ts +5 -0
- package/dist/types/index.d.ts +0 -1
- package/dist/types/judgment-client.d.ts +3 -47
- package/dist/types/scorers/api-scorer.d.ts +15 -15
- package/dist/types/scorers/base-scorer.d.ts +53 -10
- package/package.json +2 -1
- package/dist/cjs/scorers/exact-match-scorer.js +0 -84
- package/dist/cjs/scorers/exact-match-scorer.js.map +0 -1
- package/dist/esm/scorers/exact-match-scorer.js +0 -80
- package/dist/esm/scorers/exact-match-scorer.js.map +0 -1
- package/dist/types/scorers/exact-match-scorer.d.ts +0 -10
|
@@ -14,7 +14,7 @@ import { AsyncLocalStorage } from 'async_hooks';
|
|
|
14
14
|
import OpenAI from 'openai';
|
|
15
15
|
import Anthropic from '@anthropic-ai/sdk';
|
|
16
16
|
// Local Imports
|
|
17
|
-
import { JUDGMENT_TRACES_SAVE_API_URL, JUDGMENT_TRACES_FETCH_API_URL, JUDGMENT_TRACES_DELETE_API_URL, JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL,
|
|
17
|
+
import { JUDGMENT_TRACES_SAVE_API_URL, JUDGMENT_TRACES_FETCH_API_URL, JUDGMENT_TRACES_DELETE_API_URL, JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL, JUDGMENT_CALCULATE_TOKEN_COSTS_API_URL,
|
|
18
18
|
// Add other necessary constants if needed
|
|
19
19
|
} from '../constants.js';
|
|
20
20
|
import { APIJudgmentScorer } from '../scorers/base-scorer.js';
|
|
@@ -70,16 +70,13 @@ class TraceManagerClient {
|
|
|
70
70
|
try {
|
|
71
71
|
// Use isomorphic fetch (available globally in modern Node.js and browsers)
|
|
72
72
|
const response = yield fetch(url, Object.assign(Object.assign({}, options), { headers: headers }));
|
|
73
|
-
if
|
|
74
|
-
|
|
75
|
-
console.error(`API Error (${response.status}) for ${options.method || 'GET'} ${url}: ${errorBody}`);
|
|
76
|
-
throw new Error(`Judgment API request failed: ${response.status} ${response.statusText} - ${errorBody}`);
|
|
77
|
-
}
|
|
73
|
+
// We will return the response object even if !response.ok
|
|
74
|
+
// The caller (e.g., saveTrace) is responsible for checking response.ok or response.status
|
|
78
75
|
// Handle cases where the response might be empty (e.g., 204 No Content on DELETE)
|
|
79
76
|
if (response.status === 204) {
|
|
80
77
|
return null; // Indicate success with no content
|
|
81
78
|
}
|
|
82
|
-
return
|
|
79
|
+
return response;
|
|
83
80
|
}
|
|
84
81
|
catch (error) {
|
|
85
82
|
console.error(`Network or fetch error during ${options.method || 'GET'} ${url}:`, error);
|
|
@@ -96,21 +93,52 @@ class TraceManagerClient {
|
|
|
96
93
|
});
|
|
97
94
|
});
|
|
98
95
|
}
|
|
99
|
-
saveTrace(traceData
|
|
96
|
+
saveTrace(traceData) {
|
|
100
97
|
return __awaiter(this, void 0, void 0, function* () {
|
|
98
|
+
// _fetch now returns the raw response object or throws on network error
|
|
101
99
|
const response = yield this._fetch(JUDGMENT_TRACES_SAVE_API_URL, {
|
|
102
100
|
method: 'POST',
|
|
103
|
-
body: JSON.stringify(traceData),
|
|
101
|
+
body: JSON.stringify(traceData), // Stringify directly here again
|
|
104
102
|
});
|
|
105
|
-
//
|
|
106
|
-
if (!
|
|
107
|
-
//
|
|
108
|
-
|
|
103
|
+
// Check if _fetch threw a network error (caught below) or returned an invalid object
|
|
104
|
+
if (!response) {
|
|
105
|
+
// This case should ideally be caught by _fetch's catch block, but double-check
|
|
106
|
+
throw new Error('Failed to save trace data: No response received from API.');
|
|
107
|
+
}
|
|
108
|
+
// Now, check the status code on the received response object
|
|
109
|
+
if (response.status === 400) {
|
|
110
|
+
// Attempt to get error body for more info
|
|
111
|
+
const errorBody = yield response.text();
|
|
112
|
+
throw new Error(`Failed to save trace data: Check your Trace name for conflicts, set overwrite=True to overwrite existing traces: ${response.status} ${response.statusText || ''} - ${errorBody}`);
|
|
113
|
+
}
|
|
114
|
+
else if (!response.ok) { // Handles other errors (5xx, 4xx except 400)
|
|
115
|
+
const errorBody = yield response.text();
|
|
116
|
+
throw new Error(`Failed to save trace data: Status ${response.status} ${response.statusText || '(No status text)'} - ${errorBody}`);
|
|
117
|
+
}
|
|
118
|
+
// --- Success Path ---
|
|
119
|
+
// Optionally log the UI URL (needs JSON parsing)
|
|
120
|
+
let responseData = null;
|
|
121
|
+
try {
|
|
122
|
+
// Handle 204 No Content specifically
|
|
123
|
+
if (response.status === 204) {
|
|
124
|
+
responseData = null; // Or maybe { success: true }?
|
|
125
|
+
}
|
|
126
|
+
else {
|
|
127
|
+
responseData = yield response.json(); // Parse JSON only on success
|
|
128
|
+
}
|
|
129
|
+
}
|
|
130
|
+
catch (parseError) {
|
|
131
|
+
logger.warn("Failed to parse successful API response JSON.", { error: parseError });
|
|
132
|
+
// Depending on requirements, maybe throw, maybe return a default success object
|
|
133
|
+
throw new Error(`API request succeeded (${response.status}), but failed to parse JSON response.`);
|
|
134
|
+
}
|
|
135
|
+
if (responseData === null || responseData === void 0 ? void 0 : responseData.ui_results_url) {
|
|
109
136
|
console.info(`
|
|
110
|
-
🔍 View trace: ${
|
|
137
|
+
🔍 View trace: ${responseData.ui_results_url}
|
|
111
138
|
`);
|
|
112
139
|
}
|
|
113
|
-
|
|
140
|
+
// Return the parsed data (or null for 204)
|
|
141
|
+
return responseData;
|
|
114
142
|
});
|
|
115
143
|
}
|
|
116
144
|
deleteTrace(traceId) {
|
|
@@ -139,6 +167,50 @@ class TraceManagerClient {
|
|
|
139
167
|
});
|
|
140
168
|
});
|
|
141
169
|
}
|
|
170
|
+
/**
|
|
171
|
+
* Calculate token costs directly using the API endpoint.
|
|
172
|
+
* This is more accurate than client-side calculation as it uses the most up-to-date pricing.
|
|
173
|
+
*
|
|
174
|
+
* @param model The model name (e.g. 'gpt-4', 'claude-3-opus-20240229')
|
|
175
|
+
* @param promptTokens Number of tokens in the prompt/input
|
|
176
|
+
* @param completionTokens Number of tokens in the completion/output
|
|
177
|
+
* @returns Object containing token counts and calculated costs in USD
|
|
178
|
+
*/
|
|
179
|
+
calculateTokenCosts(model, promptTokens, completionTokens) {
|
|
180
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
181
|
+
try {
|
|
182
|
+
// Use the new calculation endpoint
|
|
183
|
+
const response = yield this._fetch(JUDGMENT_CALCULATE_TOKEN_COSTS_API_URL, {
|
|
184
|
+
method: 'POST',
|
|
185
|
+
body: JSON.stringify({
|
|
186
|
+
model,
|
|
187
|
+
prompt_tokens: promptTokens,
|
|
188
|
+
completion_tokens: completionTokens
|
|
189
|
+
})
|
|
190
|
+
});
|
|
191
|
+
// Check if the response is okay and parse JSON
|
|
192
|
+
if (response && response.ok) {
|
|
193
|
+
const data = yield response.json();
|
|
194
|
+
return data;
|
|
195
|
+
}
|
|
196
|
+
else if (response) {
|
|
197
|
+
// Log error if response was not ok
|
|
198
|
+
const errorBody = yield response.text();
|
|
199
|
+
logger.warn(`API error calculating token costs for model ${model}: ${response.status} ${response.statusText}`, { errorBody });
|
|
200
|
+
return null;
|
|
201
|
+
}
|
|
202
|
+
else {
|
|
203
|
+
// Handle cases where _fetch might return null or undefined (though it shouldn't with current implementation)
|
|
204
|
+
logger.warn(`No response received when calculating token costs for model ${model}.`);
|
|
205
|
+
return null;
|
|
206
|
+
}
|
|
207
|
+
}
|
|
208
|
+
catch (error) {
|
|
209
|
+
logger.warn(`Failed to calculate token costs for model ${model}.`, { error: error instanceof Error ? error.message : String(error) });
|
|
210
|
+
return null;
|
|
211
|
+
}
|
|
212
|
+
});
|
|
213
|
+
}
|
|
142
214
|
}
|
|
143
215
|
// --- Helper Functions ---
|
|
144
216
|
// Helper function to sanitize names (e.g., replace spaces with underscores)
|
|
@@ -155,6 +227,7 @@ class TraceClient {
|
|
|
155
227
|
constructor(config) {
|
|
156
228
|
var _a, _b, _c, _d, _e;
|
|
157
229
|
this.traceManager = null; // Can be null if monitoring disabled
|
|
230
|
+
this._spanDepths = {}; // Track depth of active spans
|
|
158
231
|
this.traceId = config.traceId || uuidv4();
|
|
159
232
|
this.originalName = config.name || 'default_trace'; // Store original
|
|
160
233
|
this.name = sanitizeName(this.originalName); // Use sanitized name internally
|
|
@@ -193,7 +266,7 @@ class TraceClient {
|
|
|
193
266
|
recordInput(inputs) {
|
|
194
267
|
const traceClientContext = getTraceClientContext();
|
|
195
268
|
const currentEntry = traceClientContext.entryStack.at(-1);
|
|
196
|
-
if (!currentEntry) {
|
|
269
|
+
if (!currentEntry || !currentEntry.span_id) {
|
|
197
270
|
console.warn(`No current entry to record input to\nStack trace: ${new Error().stack}`);
|
|
198
271
|
return;
|
|
199
272
|
}
|
|
@@ -202,14 +275,16 @@ class TraceClient {
|
|
|
202
275
|
span_id: currentEntry.span_id,
|
|
203
276
|
inputs,
|
|
204
277
|
function: currentEntry.function,
|
|
205
|
-
depth: currentEntry.
|
|
206
|
-
|
|
278
|
+
depth: this._spanDepths[currentEntry.span_id],
|
|
279
|
+
created_at: Date.now() / 1000,
|
|
280
|
+
span_type: currentEntry.span_type,
|
|
281
|
+
message: `Inputs to ${currentEntry.function}`
|
|
207
282
|
});
|
|
208
283
|
}
|
|
209
284
|
recordOutput(output) {
|
|
210
285
|
const traceClientContext = getTraceClientContext();
|
|
211
286
|
const currentEntry = traceClientContext.entryStack.at(-1);
|
|
212
|
-
if (!currentEntry) {
|
|
287
|
+
if (!currentEntry || !currentEntry.span_id) {
|
|
213
288
|
console.warn(`No current entry to record output to\nStack trace: ${new Error().stack}`);
|
|
214
289
|
return;
|
|
215
290
|
}
|
|
@@ -218,33 +293,28 @@ class TraceClient {
|
|
|
218
293
|
span_id: currentEntry.span_id,
|
|
219
294
|
output,
|
|
220
295
|
function: currentEntry.function,
|
|
221
|
-
depth: currentEntry.
|
|
222
|
-
|
|
296
|
+
depth: this._spanDepths[currentEntry.span_id],
|
|
297
|
+
created_at: Date.now() / 1000,
|
|
298
|
+
span_type: currentEntry.span_type,
|
|
299
|
+
message: `Output from ${currentEntry.function}`
|
|
223
300
|
});
|
|
224
301
|
}
|
|
225
302
|
recordError(error) {
|
|
226
|
-
var _a;
|
|
227
303
|
const traceClientContext = getTraceClientContext();
|
|
228
304
|
const currentEntry = traceClientContext.entryStack.at(-1);
|
|
229
|
-
if (!currentEntry) {
|
|
305
|
+
if (!currentEntry || !currentEntry.span_id) {
|
|
230
306
|
console.warn(`No current entry to record error to\nStack trace: ${new Error().stack}`);
|
|
231
307
|
return;
|
|
232
308
|
}
|
|
233
|
-
let output = error;
|
|
234
|
-
if (error instanceof Error) {
|
|
235
|
-
output = {
|
|
236
|
-
name: error.name,
|
|
237
|
-
message: error.message,
|
|
238
|
-
stack: (_a = error.stack) === null || _a === void 0 ? void 0 : _a.substring(0, 1000)
|
|
239
|
-
};
|
|
240
|
-
}
|
|
241
309
|
this.addEntry({
|
|
242
310
|
type: 'error',
|
|
243
311
|
span_id: currentEntry.span_id,
|
|
244
|
-
output,
|
|
312
|
+
output: error,
|
|
245
313
|
function: currentEntry.function,
|
|
246
|
-
depth: currentEntry.
|
|
247
|
-
|
|
314
|
+
depth: this._spanDepths[currentEntry.span_id],
|
|
315
|
+
created_at: Date.now() / 1000,
|
|
316
|
+
span_type: currentEntry.span_type,
|
|
317
|
+
message: `Error from ${currentEntry.function}`
|
|
248
318
|
});
|
|
249
319
|
}
|
|
250
320
|
startSpan(name, options = {}) {
|
|
@@ -255,18 +325,20 @@ class TraceClient {
|
|
|
255
325
|
const spanType = (_a = options.spanType) !== null && _a !== void 0 ? _a : 'span';
|
|
256
326
|
const startTime = Date.now() / 1000;
|
|
257
327
|
let depth = 0, parentSpanId = undefined;
|
|
258
|
-
if (parentEntry) {
|
|
259
|
-
depth = parentEntry.
|
|
328
|
+
if (parentEntry && parentEntry.span_id) {
|
|
329
|
+
depth = this._spanDepths[parentEntry.span_id] + 1;
|
|
260
330
|
parentSpanId = parentEntry.span_id;
|
|
261
331
|
}
|
|
332
|
+
this._spanDepths[spanId] = depth;
|
|
262
333
|
const entry = {
|
|
263
334
|
type: 'enter',
|
|
264
335
|
function: name,
|
|
265
336
|
span_id: spanId,
|
|
266
337
|
depth: depth,
|
|
267
|
-
|
|
338
|
+
created_at: startTime,
|
|
268
339
|
span_type: spanType,
|
|
269
|
-
parent_span_id: parentSpanId
|
|
340
|
+
parent_span_id: parentSpanId,
|
|
341
|
+
message: name
|
|
270
342
|
};
|
|
271
343
|
this.addEntry(entry);
|
|
272
344
|
traceClientContext.entryStack.push(entry);
|
|
@@ -274,21 +346,24 @@ class TraceClient {
|
|
|
274
346
|
endSpan() {
|
|
275
347
|
const traceClientContext = getTraceClientContext();
|
|
276
348
|
const enterEntry = traceClientContext.entryStack.pop();
|
|
277
|
-
if (!enterEntry) {
|
|
349
|
+
if (!enterEntry || !enterEntry.span_id) {
|
|
278
350
|
console.warn("No enter entry to end");
|
|
279
351
|
return;
|
|
280
352
|
}
|
|
281
353
|
const endTime = Date.now() / 1000;
|
|
282
|
-
const duration = endTime - enterEntry.
|
|
354
|
+
const duration = endTime - enterEntry.created_at;
|
|
283
355
|
this.addEntry({
|
|
284
356
|
type: 'exit',
|
|
285
357
|
function: enterEntry.function,
|
|
286
358
|
span_id: enterEntry.span_id,
|
|
287
|
-
depth: enterEntry.
|
|
288
|
-
|
|
359
|
+
depth: this._spanDepths[enterEntry.span_id],
|
|
360
|
+
created_at: endTime,
|
|
289
361
|
duration: duration,
|
|
290
|
-
span_type: enterEntry.span_type
|
|
362
|
+
span_type: enterEntry.span_type,
|
|
363
|
+
message: `← ${enterEntry.function}`
|
|
291
364
|
});
|
|
365
|
+
// Clean up depth tracking
|
|
366
|
+
delete this._spanDepths[enterEntry.span_id];
|
|
292
367
|
}
|
|
293
368
|
*span(name, options = {}) {
|
|
294
369
|
if (!this.enableMonitoring) {
|
|
@@ -306,6 +381,7 @@ class TraceClient {
|
|
|
306
381
|
condenseTrace(rawEntries) {
|
|
307
382
|
var _a, _b, _c, _d, _e;
|
|
308
383
|
const spansById = {};
|
|
384
|
+
const allEvaluationRuns = [];
|
|
309
385
|
for (const entry of rawEntries) {
|
|
310
386
|
const spanId = entry.span_id;
|
|
311
387
|
if (!spanId)
|
|
@@ -315,12 +391,12 @@ class TraceClient {
|
|
|
315
391
|
span_id: spanId,
|
|
316
392
|
function: entry.function || 'unknown',
|
|
317
393
|
depth: (_a = entry.depth) !== null && _a !== void 0 ? _a : 0,
|
|
318
|
-
|
|
394
|
+
created_at: new Date(((_b = entry.created_at) !== null && _b !== void 0 ? _b : 0) * 1000).toISOString(), // Convert number to ISO string
|
|
395
|
+
trace_id: this.traceId, // Add trace_id
|
|
319
396
|
parent_span_id: entry.parent_span_id,
|
|
320
397
|
span_type: entry.span_type || 'span',
|
|
321
398
|
inputs: null,
|
|
322
399
|
output: null,
|
|
323
|
-
evaluation_runs: [],
|
|
324
400
|
duration: null,
|
|
325
401
|
children: []
|
|
326
402
|
};
|
|
@@ -330,14 +406,14 @@ class TraceClient {
|
|
|
330
406
|
case 'enter':
|
|
331
407
|
currentSpanData.function = entry.function || currentSpanData.function;
|
|
332
408
|
currentSpanData.depth = (_c = entry.depth) !== null && _c !== void 0 ? _c : currentSpanData.depth;
|
|
333
|
-
currentSpanData.
|
|
409
|
+
currentSpanData.created_at = new Date(((_d = entry.created_at) !== null && _d !== void 0 ? _d : 0) * 1000).toISOString(); // Ensure created_at is string on update
|
|
334
410
|
currentSpanData.parent_span_id = entry.parent_span_id;
|
|
335
411
|
currentSpanData.span_type = entry.span_type || currentSpanData.span_type;
|
|
336
|
-
currentSpanData.start_time = entry.
|
|
412
|
+
currentSpanData.start_time = entry.created_at; // Keep original number for duration calc
|
|
337
413
|
break;
|
|
338
414
|
case 'exit':
|
|
339
415
|
currentSpanData.duration = (_e = entry.duration) !== null && _e !== void 0 ? _e : currentSpanData.duration;
|
|
340
|
-
currentSpanData.end_time = entry.
|
|
416
|
+
currentSpanData.end_time = entry.created_at; // Keep original number for duration calc
|
|
341
417
|
if (currentSpanData.duration === null && currentSpanData.start_time && currentSpanData.end_time) {
|
|
342
418
|
currentSpanData.duration = currentSpanData.end_time - currentSpanData.start_time;
|
|
343
419
|
}
|
|
@@ -353,10 +429,8 @@ class TraceClient {
|
|
|
353
429
|
case 'output':
|
|
354
430
|
case 'error':
|
|
355
431
|
currentSpanData.output = entry.output;
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
if (entry.evaluation_runs) {
|
|
359
|
-
currentSpanData.evaluation_runs.push(...entry.evaluation_runs);
|
|
432
|
+
if (entry.type === 'output' && entry.output && typeof entry.output === 'object' && 'eval_name' in entry.output && 'scorers' in entry.output && 'trace_span_id' in entry.output) {
|
|
433
|
+
allEvaluationRuns.push(entry.output);
|
|
360
434
|
}
|
|
361
435
|
break;
|
|
362
436
|
}
|
|
@@ -387,9 +461,11 @@ class TraceClient {
|
|
|
387
461
|
childrenMap[parentId].push(span);
|
|
388
462
|
}
|
|
389
463
|
}
|
|
390
|
-
|
|
464
|
+
// Sort using parsed dates
|
|
465
|
+
roots.sort((a, b) => Date.parse(a.created_at) - Date.parse(b.created_at));
|
|
391
466
|
for (const parentId in childrenMap) {
|
|
392
|
-
|
|
467
|
+
// Sort using parsed dates
|
|
468
|
+
childrenMap[parentId].sort((a, b) => Date.parse(a.created_at) - Date.parse(b.created_at));
|
|
393
469
|
}
|
|
394
470
|
function buildFlatListDfs(span) {
|
|
395
471
|
if (visited.has(span.span_id))
|
|
@@ -410,26 +486,36 @@ class TraceClient {
|
|
|
410
486
|
buildFlatListDfs(span);
|
|
411
487
|
}
|
|
412
488
|
}
|
|
413
|
-
return sortedCondensedList;
|
|
489
|
+
return [sortedCondensedList, allEvaluationRuns];
|
|
414
490
|
}
|
|
415
491
|
save() {
|
|
416
492
|
return __awaiter(this, arguments, void 0, function* (emptySave = false) {
|
|
493
|
+
var _a, _b, _c, _d, _e;
|
|
417
494
|
if (!this.enableMonitoring || !this.traceManager) {
|
|
418
495
|
return null;
|
|
419
496
|
}
|
|
420
497
|
const traceClientContext = getTraceClientContext();
|
|
421
498
|
const totalDuration = this.getDuration();
|
|
422
|
-
|
|
499
|
+
// Use the tuple returned by condenseTrace
|
|
500
|
+
const [condensedEntries, evaluationRuns] = this.condenseTrace(traceClientContext.entries);
|
|
423
501
|
const tokenCounts = {
|
|
424
|
-
prompt_tokens: 0,
|
|
425
|
-
|
|
502
|
+
prompt_tokens: 0,
|
|
503
|
+
completion_tokens: 0,
|
|
504
|
+
total_tokens: 0,
|
|
505
|
+
prompt_tokens_cost_usd: 0.0,
|
|
506
|
+
completion_tokens_cost_usd: 0.0,
|
|
507
|
+
total_cost_usd: 0.0
|
|
426
508
|
};
|
|
427
|
-
|
|
428
|
-
|
|
509
|
+
// First pass: collect all LLM calls with their token counts
|
|
510
|
+
const llmCalls = [];
|
|
511
|
+
let index = 0;
|
|
512
|
+
for (const entry of condensedEntries) {
|
|
429
513
|
if (entry.span_type === 'llm' && ((_a = entry.output) === null || _a === void 0 ? void 0 : _a.usage)) {
|
|
430
514
|
const usage = entry.output.usage;
|
|
515
|
+
const modelName = ((_b = entry.inputs) === null || _b === void 0 ? void 0 : _b.model) || "";
|
|
431
516
|
let promptTokens = 0;
|
|
432
517
|
let completionTokens = 0;
|
|
518
|
+
// Handle different token naming conventions
|
|
433
519
|
if (usage.prompt_tokens !== undefined || usage.completion_tokens !== undefined) {
|
|
434
520
|
promptTokens = usage.prompt_tokens || 0;
|
|
435
521
|
completionTokens = usage.completion_tokens || 0;
|
|
@@ -437,6 +523,7 @@ class TraceClient {
|
|
|
437
523
|
else if (usage.input_tokens !== undefined || usage.output_tokens !== undefined) {
|
|
438
524
|
promptTokens = usage.input_tokens || 0;
|
|
439
525
|
completionTokens = usage.output_tokens || 0;
|
|
526
|
+
// Standardize naming
|
|
440
527
|
usage.prompt_tokens = promptTokens;
|
|
441
528
|
usage.completion_tokens = completionTokens;
|
|
442
529
|
delete usage.input_tokens;
|
|
@@ -445,33 +532,82 @@ class TraceClient {
|
|
|
445
532
|
tokenCounts.prompt_tokens += promptTokens;
|
|
446
533
|
tokenCounts.completion_tokens += completionTokens;
|
|
447
534
|
tokenCounts.total_tokens += usage.total_tokens || (promptTokens + completionTokens);
|
|
448
|
-
|
|
535
|
+
// Add to list of calls for cost calculation
|
|
449
536
|
if (modelName) {
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
537
|
+
llmCalls.push({
|
|
538
|
+
modelName,
|
|
539
|
+
promptTokens,
|
|
540
|
+
completionTokens,
|
|
541
|
+
entryIndex: index
|
|
542
|
+
});
|
|
543
|
+
}
|
|
544
|
+
}
|
|
545
|
+
index++;
|
|
546
|
+
}
|
|
547
|
+
// Second pass: calculate costs for each LLM call using the API
|
|
548
|
+
if (this.traceManager && llmCalls.length > 0) {
|
|
549
|
+
// Process each LLM call
|
|
550
|
+
for (const call of llmCalls) {
|
|
551
|
+
try {
|
|
552
|
+
// Get costs from the API
|
|
553
|
+
const costs = yield this.traceManager.calculateTokenCosts(call.modelName, call.promptTokens, call.completionTokens);
|
|
554
|
+
if (costs) {
|
|
555
|
+
// Update the entry with the costs
|
|
556
|
+
const entry = condensedEntries[call.entryIndex];
|
|
557
|
+
// Ensure output and usage objects exist before assigning costs
|
|
558
|
+
if (entry.output && entry.output.usage) {
|
|
559
|
+
// --- This part assigns costs to the individual span ---
|
|
560
|
+
entry.output.usage.prompt_tokens_cost_usd = costs.prompt_tokens_cost_usd;
|
|
561
|
+
entry.output.usage.completion_tokens_cost_usd = costs.completion_tokens_cost_usd;
|
|
562
|
+
entry.output.usage.total_cost_usd = costs.total_cost_usd;
|
|
563
|
+
logger.debug(`Assigned costs to span ${entry.span_id} (model: ${call.modelName})`, { costs }); // Added debug log
|
|
564
|
+
// -----------------------------------------------------
|
|
565
|
+
}
|
|
566
|
+
else {
|
|
567
|
+
logger.warn(`Could not assign costs to span ${entry.span_id} (model: ${call.modelName}): Missing 'output' or 'output.usage' object.`, { output: entry.output }); // Log if structure is missing
|
|
568
|
+
}
|
|
569
|
+
// Add to the total costs for the trace
|
|
570
|
+
tokenCounts.prompt_tokens_cost_usd += (_c = costs.prompt_tokens_cost_usd) !== null && _c !== void 0 ? _c : 0.0;
|
|
571
|
+
tokenCounts.completion_tokens_cost_usd += (_d = costs.completion_tokens_cost_usd) !== null && _d !== void 0 ? _d : 0.0;
|
|
572
|
+
tokenCounts.total_cost_usd += (_e = costs.total_cost_usd) !== null && _e !== void 0 ? _e : 0.0;
|
|
460
573
|
}
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
usage
|
|
465
|
-
|
|
574
|
+
else {
|
|
575
|
+
// If calculation failed, set costs to null in the entry (matching Python behavior)
|
|
576
|
+
const entry = condensedEntries[call.entryIndex];
|
|
577
|
+
// Ensure output and usage objects exist before assigning null costs
|
|
578
|
+
if (entry.output && entry.output.usage) {
|
|
579
|
+
// --- Sets null costs on the individual span ---
|
|
580
|
+
entry.output.usage.prompt_tokens_cost_usd = null;
|
|
581
|
+
entry.output.usage.completion_tokens_cost_usd = null;
|
|
582
|
+
entry.output.usage.total_cost_usd = null;
|
|
583
|
+
// ------------------------------------------
|
|
584
|
+
}
|
|
585
|
+
else {
|
|
586
|
+
// Log if we can't even assign null because the structure is missing
|
|
587
|
+
logger.warn(`Could not assign null costs to span ${entry.span_id} (model: ${call.modelName}): Missing 'output' or 'output.usage' object.`, { output: entry.output });
|
|
588
|
+
}
|
|
589
|
+
logger.warn(`Token cost calculation failed for model '${call.modelName}'. Cost information will be null for this span.`); // More specific warning
|
|
466
590
|
}
|
|
467
591
|
}
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
592
|
+
catch (e) {
|
|
593
|
+
logger.warn(`Error during cost calculation loop for model '${call.modelName}':`, e); // Adjusted logging
|
|
594
|
+
// Set costs to null in the entry if an error occurs during the loop iteration
|
|
595
|
+
const entry = condensedEntries[call.entryIndex];
|
|
596
|
+
// Ensure output and usage objects exist before assigning null costs on error
|
|
597
|
+
if (entry.output && entry.output.usage) {
|
|
598
|
+
// --- Sets null costs on the individual span on error ---
|
|
599
|
+
entry.output.usage.prompt_tokens_cost_usd = null;
|
|
600
|
+
entry.output.usage.completion_tokens_cost_usd = null;
|
|
601
|
+
entry.output.usage.total_cost_usd = null;
|
|
602
|
+
// ----------------------------------------------------
|
|
603
|
+
}
|
|
604
|
+
else {
|
|
605
|
+
// Log if we can't assign null on error because the structure is missing
|
|
606
|
+
logger.warn(`Could not assign null costs to span ${entry.span_id} (model: ${call.modelName}) on error: Missing 'output' or 'output.usage' object.`, { output: entry.output });
|
|
607
|
+
}
|
|
472
608
|
}
|
|
473
609
|
}
|
|
474
|
-
}
|
|
610
|
+
}
|
|
475
611
|
// Convert rules array to a dictionary (Record<string, Rule>)
|
|
476
612
|
const rulesDict = {};
|
|
477
613
|
this.rules.forEach(rule => {
|
|
@@ -488,16 +624,15 @@ class TraceClient {
|
|
|
488
624
|
duration: totalDuration,
|
|
489
625
|
token_counts: tokenCounts,
|
|
490
626
|
entries: condensedEntries,
|
|
491
|
-
|
|
492
|
-
empty_save: emptySave,
|
|
627
|
+
evaluation_runs: evaluationRuns,
|
|
493
628
|
overwrite: this.overwrite,
|
|
494
629
|
parent_trace_id: this.parentTraceId,
|
|
495
630
|
parent_name: this.parentName
|
|
496
631
|
};
|
|
497
632
|
try {
|
|
498
|
-
yield this.traceManager.saveTrace(traceData
|
|
633
|
+
yield this.traceManager.saveTrace(traceData);
|
|
499
634
|
logger.info(`Trace ${this.traceId} saved successfully.`);
|
|
500
|
-
if (
|
|
635
|
+
if (this.enableEvaluations) {
|
|
501
636
|
try {
|
|
502
637
|
yield this.traceManager.addTraceToEvalQueue(traceData);
|
|
503
638
|
logger.info(`Trace ${this.traceId} added to evaluation queue.`);
|
|
@@ -531,7 +666,7 @@ class TraceClient {
|
|
|
531
666
|
traceClientContext.entries.forEach(entry => {
|
|
532
667
|
var _a;
|
|
533
668
|
const indent = " ".repeat((_a = entry.depth) !== null && _a !== void 0 ? _a : 0);
|
|
534
|
-
const timeStr = entry.
|
|
669
|
+
const timeStr = entry.created_at ? `@ ${new Date(entry.created_at * 1000).toISOString()}` : '';
|
|
535
670
|
const shortSpanId = entry.span_id ? `(id: ${entry.span_id.substring(0, 8)}...)` : '';
|
|
536
671
|
const shortParentId = entry.parent_span_id ? `(parent: ${entry.parent_span_id.substring(0, 8)}...)` : '';
|
|
537
672
|
try {
|
|
@@ -562,17 +697,6 @@ class TraceClient {
|
|
|
562
697
|
// Keep console.log
|
|
563
698
|
console.log(`${indent} ${prefix} (for ${shortSpanId}): ${outputStr || 'null'}`);
|
|
564
699
|
break;
|
|
565
|
-
case 'evaluation':
|
|
566
|
-
let evalStr = JSON.stringify(entry.evaluation_runs);
|
|
567
|
-
if (evalStr && evalStr.length > 200) {
|
|
568
|
-
evalStr = evalStr.substring(0, 197) + '...';
|
|
569
|
-
}
|
|
570
|
-
// Keep console.log
|
|
571
|
-
console.log(`${indent} Evaluation (for ${shortSpanId}): ${evalStr || '[]'}`);
|
|
572
|
-
break;
|
|
573
|
-
default:
|
|
574
|
-
// Keep console.log
|
|
575
|
-
console.log(`${indent}? Unknown entry type: ${JSON.stringify(entry)}`);
|
|
576
700
|
}
|
|
577
701
|
}
|
|
578
702
|
catch (stringifyError) {
|
|
@@ -612,9 +736,7 @@ class TraceClient {
|
|
|
612
736
|
* @returns Promise that resolves when the evaluation entry has been added to the trace
|
|
613
737
|
*/
|
|
614
738
|
asyncEvaluate(scorers_1) {
|
|
615
|
-
return __awaiter(this, arguments, void 0, function* (
|
|
616
|
-
// Accept general Scorer type, but filter/check for API scorers internally
|
|
617
|
-
scorers, options = {}) {
|
|
739
|
+
return __awaiter(this, arguments, void 0, function* (scorers, options = {}) {
|
|
618
740
|
if (!this.enableEvaluations) {
|
|
619
741
|
logger.warn("Evaluations are disabled. Skipping async evaluation.");
|
|
620
742
|
return;
|
|
@@ -629,7 +751,12 @@ class TraceClient {
|
|
|
629
751
|
logger.warn("No APIJudgmentScorers found in the provided scorers list. Skipping async evaluation as backend requires API scorers.");
|
|
630
752
|
return;
|
|
631
753
|
}
|
|
632
|
-
|
|
754
|
+
// Process rules (currently just using this.rules directly)
|
|
755
|
+
const loadedRules = this.rules; // TODO: Add ScorerWrapper-like processing if needed in TS
|
|
756
|
+
// Prevent using JudgevalScorer with rules - only APIJudgmentScorer allowed with rules
|
|
757
|
+
if (loadedRules && loadedRules.length > 0 && scorers.some(s => !(s instanceof APIJudgmentScorer))) {
|
|
758
|
+
throw new Error("Cannot use Judgeval scorers, you can only use API scorers when using rules. Please either remove rules or use only APIJudgmentScorer types.");
|
|
759
|
+
}
|
|
633
760
|
// Create example structure matching Python/backend expectations
|
|
634
761
|
const example = {
|
|
635
762
|
input: options.input || "",
|
|
@@ -649,6 +776,7 @@ class TraceClient {
|
|
|
649
776
|
logger.warn(`No current entry to record evaluation to\nStack trace: ${new Error().stack}`);
|
|
650
777
|
return;
|
|
651
778
|
}
|
|
779
|
+
const currentSpanId = currentEntry.span_id; // Get the span ID
|
|
652
780
|
// --- Create evaluation run name (similar to Python) ---
|
|
653
781
|
// Capitalize scorer names
|
|
654
782
|
const scorerNames = apiScorers.map(scorer => {
|
|
@@ -658,11 +786,9 @@ class TraceClient {
|
|
|
658
786
|
return name.charAt(0).toUpperCase() + name.slice(1);
|
|
659
787
|
}).join(',');
|
|
660
788
|
// Use trace name and shortened span ID (or trace ID if no span)
|
|
661
|
-
const idPart =
|
|
789
|
+
const idPart = currentSpanId ? currentSpanId.substring(0, 8) : this.traceId.substring(0, 8);
|
|
662
790
|
const evalName = `${this.name.charAt(0).toUpperCase() + this.name.slice(1)}-${idPart}-[${scorerNames}]`;
|
|
663
791
|
// --- End eval name creation ---
|
|
664
|
-
// Process rules (currently just using this.rules directly)
|
|
665
|
-
const loadedRules = this.rules; // TODO: Add ScorerWrapper-like processing if needed in TS
|
|
666
792
|
// Construct the evaluation payload
|
|
667
793
|
const evalRunPayload = {
|
|
668
794
|
organization_id: this.organizationId,
|
|
@@ -675,47 +801,18 @@ class TraceClient {
|
|
|
675
801
|
metadata: {}, // Matches Python tracer
|
|
676
802
|
judgment_api_key: this.apiKey,
|
|
677
803
|
override: this.overwrite, // Use trace's overwrite setting
|
|
678
|
-
rules: loadedRules // Pass the processed rules
|
|
804
|
+
rules: loadedRules, // Pass the processed rules
|
|
805
|
+
trace_span_id: currentSpanId // <<< RENAMED: Assign the current span ID (matching backend)
|
|
679
806
|
};
|
|
680
|
-
// Add evaluation entry
|
|
681
|
-
this.
|
|
807
|
+
// Add evaluation entry to the trace
|
|
808
|
+
this.recordOutput(evalRunPayload);
|
|
682
809
|
}
|
|
683
810
|
catch (error) {
|
|
684
|
-
|
|
685
|
-
|
|
811
|
+
logger.error(`Failed during asyncEvaluate execution: ${error instanceof Error ? error.message : String(error)}`);
|
|
812
|
+
throw error; // Re-throw after logging
|
|
686
813
|
}
|
|
687
814
|
});
|
|
688
815
|
}
|
|
689
|
-
/**
|
|
690
|
-
* Private helper to add an evaluation entry to the trace.
|
|
691
|
-
* This mirrors the structure of Python's add_eval_run.
|
|
692
|
-
*
|
|
693
|
-
* @param evalRunPayload The constructed payload for the evaluation.
|
|
694
|
-
* @param startTime The start time (in seconds) of the evaluation process.
|
|
695
|
-
*/
|
|
696
|
-
_addEvalRun(evalRunPayload, startTime) {
|
|
697
|
-
var _a, _b;
|
|
698
|
-
const traceClientContext = getTraceClientContext();
|
|
699
|
-
const currentEntry = traceClientContext.entryStack.at(-1);
|
|
700
|
-
if (!currentEntry) {
|
|
701
|
-
logger.warn(`No current entry to record evaluation to\nStack trace: ${new Error().stack}`);
|
|
702
|
-
return;
|
|
703
|
-
}
|
|
704
|
-
const function_ = (_a = currentEntry.function) !== null && _a !== void 0 ? _a : "unknown_function";
|
|
705
|
-
const depth = (_b = currentEntry.depth) !== null && _b !== void 0 ? _b : 0;
|
|
706
|
-
const duration = Date.now() / 1000 - startTime;
|
|
707
|
-
// Add evaluation entry to the trace
|
|
708
|
-
this.addEntry({
|
|
709
|
-
type: "evaluation",
|
|
710
|
-
function: function_,
|
|
711
|
-
span_id: currentEntry.span_id, // May be undefined
|
|
712
|
-
depth: depth,
|
|
713
|
-
timestamp: Date.now() / 1000,
|
|
714
|
-
evaluation_runs: [evalRunPayload], // Embed the payload
|
|
715
|
-
duration: duration,
|
|
716
|
-
span_type: "evaluation"
|
|
717
|
-
});
|
|
718
|
-
}
|
|
719
816
|
// OPTIONAL: Add a method to get the original name if needed elsewhere
|
|
720
817
|
getOriginalName() {
|
|
721
818
|
return this.originalName;
|
|
@@ -796,11 +893,6 @@ class Tracer {
|
|
|
796
893
|
apiKey: this.apiKey,
|
|
797
894
|
organizationId: this.organizationId,
|
|
798
895
|
});
|
|
799
|
-
if (traceClient.enableMonitoring) {
|
|
800
|
-
traceClient.save(true).catch(err => {
|
|
801
|
-
logger.error(`Failed to save empty trace (${traceClient.traceId}):`, err);
|
|
802
|
-
});
|
|
803
|
-
}
|
|
804
896
|
return traceClient;
|
|
805
897
|
}
|
|
806
898
|
*trace(name, options = {}) {
|