judgeval 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. package/README.md +95 -68
  2. package/dist/cjs/common/tracer.js +235 -143
  3. package/dist/cjs/common/tracer.js.map +1 -1
  4. package/dist/cjs/constants.js +8 -5
  5. package/dist/cjs/constants.js.map +1 -1
  6. package/dist/cjs/data/datasets/eval-dataset-client.js +349 -0
  7. package/dist/cjs/data/datasets/eval-dataset-client.js.map +1 -0
  8. package/dist/cjs/data/datasets/eval-dataset.js +405 -0
  9. package/dist/cjs/data/datasets/eval-dataset.js.map +1 -0
  10. package/dist/cjs/data/example.js +22 -1
  11. package/dist/cjs/data/example.js.map +1 -1
  12. package/dist/cjs/e2etests/eval-operations.test.js +282 -0
  13. package/dist/cjs/e2etests/eval-operations.test.js.map +1 -0
  14. package/dist/cjs/e2etests/judgee-traces.test.js +278 -0
  15. package/dist/cjs/e2etests/judgee-traces.test.js.map +1 -0
  16. package/dist/cjs/index.js +1 -3
  17. package/dist/cjs/index.js.map +1 -1
  18. package/dist/cjs/judgment-client.js +326 -645
  19. package/dist/cjs/judgment-client.js.map +1 -1
  20. package/dist/cjs/scorers/api-scorer.js +56 -48
  21. package/dist/cjs/scorers/api-scorer.js.map +1 -1
  22. package/dist/cjs/scorers/base-scorer.js +66 -11
  23. package/dist/cjs/scorers/base-scorer.js.map +1 -1
  24. package/dist/esm/common/tracer.js +236 -144
  25. package/dist/esm/common/tracer.js.map +1 -1
  26. package/dist/esm/constants.js +7 -4
  27. package/dist/esm/constants.js.map +1 -1
  28. package/dist/esm/data/datasets/eval-dataset-client.js +342 -0
  29. package/dist/esm/data/datasets/eval-dataset-client.js.map +1 -0
  30. package/dist/esm/data/datasets/eval-dataset.js +375 -0
  31. package/dist/esm/data/datasets/eval-dataset.js.map +1 -0
  32. package/dist/esm/data/example.js +22 -1
  33. package/dist/esm/data/example.js.map +1 -1
  34. package/dist/esm/e2etests/eval-operations.test.js +254 -0
  35. package/dist/esm/e2etests/eval-operations.test.js.map +1 -0
  36. package/dist/esm/e2etests/judgee-traces.test.js +253 -0
  37. package/dist/esm/e2etests/judgee-traces.test.js.map +1 -0
  38. package/dist/esm/index.js +0 -1
  39. package/dist/esm/index.js.map +1 -1
  40. package/dist/esm/judgment-client.js +328 -647
  41. package/dist/esm/judgment-client.js.map +1 -1
  42. package/dist/esm/scorers/api-scorer.js +56 -48
  43. package/dist/esm/scorers/api-scorer.js.map +1 -1
  44. package/dist/esm/scorers/base-scorer.js +66 -11
  45. package/dist/esm/scorers/base-scorer.js.map +1 -1
  46. package/dist/types/common/tracer.d.ts +27 -14
  47. package/dist/types/constants.d.ts +4 -4
  48. package/dist/types/data/datasets/eval-dataset-client.d.ts +39 -0
  49. package/dist/types/data/datasets/eval-dataset.d.ts +45 -0
  50. package/dist/types/data/example.d.ts +24 -12
  51. package/dist/types/e2etests/eval-operations.test.d.ts +5 -0
  52. package/dist/types/e2etests/judgee-traces.test.d.ts +5 -0
  53. package/dist/types/index.d.ts +0 -1
  54. package/dist/types/judgment-client.d.ts +3 -47
  55. package/dist/types/scorers/api-scorer.d.ts +15 -15
  56. package/dist/types/scorers/base-scorer.d.ts +53 -10
  57. package/package.json +2 -1
  58. package/dist/cjs/scorers/exact-match-scorer.js +0 -84
  59. package/dist/cjs/scorers/exact-match-scorer.js.map +0 -1
  60. package/dist/esm/scorers/exact-match-scorer.js +0 -80
  61. package/dist/esm/scorers/exact-match-scorer.js.map +0 -1
  62. package/dist/types/scorers/exact-match-scorer.d.ts +0 -10
@@ -74,16 +74,13 @@ class TraceManagerClient {
74
74
  try {
75
75
  // Use isomorphic fetch (available globally in modern Node.js and browsers)
76
76
  const response = yield fetch(url, Object.assign(Object.assign({}, options), { headers: headers }));
77
- if (!response.ok) {
78
- const errorBody = yield response.text();
79
- console.error(`API Error (${response.status}) for ${options.method || 'GET'} ${url}: ${errorBody}`);
80
- throw new Error(`Judgment API request failed: ${response.status} ${response.statusText} - ${errorBody}`);
81
- }
77
+ // We will return the response object even if !response.ok
78
+ // The caller (e.g., saveTrace) is responsible for checking response.ok or response.status
82
79
  // Handle cases where the response might be empty (e.g., 204 No Content on DELETE)
83
80
  if (response.status === 204) {
84
81
  return null; // Indicate success with no content
85
82
  }
86
- return yield response.json();
83
+ return response;
87
84
  }
88
85
  catch (error) {
89
86
  console.error(`Network or fetch error during ${options.method || 'GET'} ${url}:`, error);
@@ -100,21 +97,52 @@ class TraceManagerClient {
100
97
  });
101
98
  });
102
99
  }
103
- saveTrace(traceData, emptySave) {
100
+ saveTrace(traceData) {
104
101
  return __awaiter(this, void 0, void 0, function* () {
102
+ // _fetch now returns the raw response object or throws on network error
105
103
  const response = yield this._fetch(constants_js_1.JUDGMENT_TRACES_SAVE_API_URL, {
106
104
  method: 'POST',
107
- body: JSON.stringify(traceData),
105
+ body: JSON.stringify(traceData), // Stringify directly here again
108
106
  });
109
- // Optionally log the UI URL like the Python version
110
- if (!emptySave && (response === null || response === void 0 ? void 0 : response.ui_results_url)) {
111
- // Use console.info or a dedicated logger for user-facing messages
112
- // Note: We can't replicate Rich library's colored link easily in standard console
107
+ // Check if _fetch threw a network error (caught below) or returned an invalid object
108
+ if (!response) {
109
+ // This case should ideally be caught by _fetch's catch block, but double-check
110
+ throw new Error('Failed to save trace data: No response received from API.');
111
+ }
112
+ // Now, check the status code on the received response object
113
+ if (response.status === 400) {
114
+ // Attempt to get error body for more info
115
+ const errorBody = yield response.text();
116
+ throw new Error(`Failed to save trace data: Check your Trace name for conflicts, set overwrite=True to overwrite existing traces: ${response.status} ${response.statusText || ''} - ${errorBody}`);
117
+ }
118
+ else if (!response.ok) { // Handles other errors (5xx, 4xx except 400)
119
+ const errorBody = yield response.text();
120
+ throw new Error(`Failed to save trace data: Status ${response.status} ${response.statusText || '(No status text)'} - ${errorBody}`);
121
+ }
122
+ // --- Success Path ---
123
+ // Optionally log the UI URL (needs JSON parsing)
124
+ let responseData = null;
125
+ try {
126
+ // Handle 204 No Content specifically
127
+ if (response.status === 204) {
128
+ responseData = null; // Or maybe { success: true }?
129
+ }
130
+ else {
131
+ responseData = yield response.json(); // Parse JSON only on success
132
+ }
133
+ }
134
+ catch (parseError) {
135
+ logger_instance_js_1.default.warn("Failed to parse successful API response JSON.", { error: parseError });
136
+ // Depending on requirements, maybe throw, maybe return a default success object
137
+ throw new Error(`API request succeeded (${response.status}), but failed to parse JSON response.`);
138
+ }
139
+ if (responseData === null || responseData === void 0 ? void 0 : responseData.ui_results_url) {
113
140
  console.info(`
114
- 🔍 View trace: ${response.ui_results_url}
141
+ 🔍 View trace: ${responseData.ui_results_url}
115
142
  `);
116
143
  }
117
- return response;
144
+ // Return the parsed data (or null for 204)
145
+ return responseData;
118
146
  });
119
147
  }
120
148
  deleteTrace(traceId) {
@@ -143,6 +171,50 @@ class TraceManagerClient {
143
171
  });
144
172
  });
145
173
  }
174
+ /**
175
+ * Calculate token costs directly using the API endpoint.
176
+ * This is more accurate than client-side calculation as it uses the most up-to-date pricing.
177
+ *
178
+ * @param model The model name (e.g. 'gpt-4', 'claude-3-opus-20240229')
179
+ * @param promptTokens Number of tokens in the prompt/input
180
+ * @param completionTokens Number of tokens in the completion/output
181
+ * @returns Object containing token counts and calculated costs in USD
182
+ */
183
+ calculateTokenCosts(model, promptTokens, completionTokens) {
184
+ return __awaiter(this, void 0, void 0, function* () {
185
+ try {
186
+ // Use the new calculation endpoint
187
+ const response = yield this._fetch(constants_js_1.JUDGMENT_CALCULATE_TOKEN_COSTS_API_URL, {
188
+ method: 'POST',
189
+ body: JSON.stringify({
190
+ model,
191
+ prompt_tokens: promptTokens,
192
+ completion_tokens: completionTokens
193
+ })
194
+ });
195
+ // Check if the response is okay and parse JSON
196
+ if (response && response.ok) {
197
+ const data = yield response.json();
198
+ return data;
199
+ }
200
+ else if (response) {
201
+ // Log error if response was not ok
202
+ const errorBody = yield response.text();
203
+ logger_instance_js_1.default.warn(`API error calculating token costs for model ${model}: ${response.status} ${response.statusText}`, { errorBody });
204
+ return null;
205
+ }
206
+ else {
207
+ // Handle cases where _fetch might return null or undefined (though it shouldn't with current implementation)
208
+ logger_instance_js_1.default.warn(`No response received when calculating token costs for model ${model}.`);
209
+ return null;
210
+ }
211
+ }
212
+ catch (error) {
213
+ logger_instance_js_1.default.warn(`Failed to calculate token costs for model ${model}.`, { error: error instanceof Error ? error.message : String(error) });
214
+ return null;
215
+ }
216
+ });
217
+ }
146
218
  }
147
219
  exports.TraceManagerClient = TraceManagerClient;
148
220
  // --- Helper Functions ---
@@ -160,6 +232,7 @@ class TraceClient {
160
232
  constructor(config) {
161
233
  var _a, _b, _c, _d, _e;
162
234
  this.traceManager = null; // Can be null if monitoring disabled
235
+ this._spanDepths = {}; // Track depth of active spans
163
236
  this.traceId = config.traceId || (0, uuid_1.v4)();
164
237
  this.originalName = config.name || 'default_trace'; // Store original
165
238
  this.name = sanitizeName(this.originalName); // Use sanitized name internally
@@ -198,7 +271,7 @@ class TraceClient {
198
271
  recordInput(inputs) {
199
272
  const traceClientContext = getTraceClientContext();
200
273
  const currentEntry = traceClientContext.entryStack.at(-1);
201
- if (!currentEntry) {
274
+ if (!currentEntry || !currentEntry.span_id) {
202
275
  console.warn(`No current entry to record input to\nStack trace: ${new Error().stack}`);
203
276
  return;
204
277
  }
@@ -207,14 +280,16 @@ class TraceClient {
207
280
  span_id: currentEntry.span_id,
208
281
  inputs,
209
282
  function: currentEntry.function,
210
- depth: currentEntry.depth,
211
- span_type: currentEntry.span_type
283
+ depth: this._spanDepths[currentEntry.span_id],
284
+ created_at: Date.now() / 1000,
285
+ span_type: currentEntry.span_type,
286
+ message: `Inputs to ${currentEntry.function}`
212
287
  });
213
288
  }
214
289
  recordOutput(output) {
215
290
  const traceClientContext = getTraceClientContext();
216
291
  const currentEntry = traceClientContext.entryStack.at(-1);
217
- if (!currentEntry) {
292
+ if (!currentEntry || !currentEntry.span_id) {
218
293
  console.warn(`No current entry to record output to\nStack trace: ${new Error().stack}`);
219
294
  return;
220
295
  }
@@ -223,33 +298,28 @@ class TraceClient {
223
298
  span_id: currentEntry.span_id,
224
299
  output,
225
300
  function: currentEntry.function,
226
- depth: currentEntry.depth,
227
- span_type: currentEntry.span_type
301
+ depth: this._spanDepths[currentEntry.span_id],
302
+ created_at: Date.now() / 1000,
303
+ span_type: currentEntry.span_type,
304
+ message: `Output from ${currentEntry.function}`
228
305
  });
229
306
  }
230
307
  recordError(error) {
231
- var _a;
232
308
  const traceClientContext = getTraceClientContext();
233
309
  const currentEntry = traceClientContext.entryStack.at(-1);
234
- if (!currentEntry) {
310
+ if (!currentEntry || !currentEntry.span_id) {
235
311
  console.warn(`No current entry to record error to\nStack trace: ${new Error().stack}`);
236
312
  return;
237
313
  }
238
- let output = error;
239
- if (error instanceof Error) {
240
- output = {
241
- name: error.name,
242
- message: error.message,
243
- stack: (_a = error.stack) === null || _a === void 0 ? void 0 : _a.substring(0, 1000)
244
- };
245
- }
246
314
  this.addEntry({
247
315
  type: 'error',
248
316
  span_id: currentEntry.span_id,
249
- output,
317
+ output: error,
250
318
  function: currentEntry.function,
251
- depth: currentEntry.depth,
252
- span_type: currentEntry.span_type
319
+ depth: this._spanDepths[currentEntry.span_id],
320
+ created_at: Date.now() / 1000,
321
+ span_type: currentEntry.span_type,
322
+ message: `Error from ${currentEntry.function}`
253
323
  });
254
324
  }
255
325
  startSpan(name, options = {}) {
@@ -260,18 +330,20 @@ class TraceClient {
260
330
  const spanType = (_a = options.spanType) !== null && _a !== void 0 ? _a : 'span';
261
331
  const startTime = Date.now() / 1000;
262
332
  let depth = 0, parentSpanId = undefined;
263
- if (parentEntry) {
264
- depth = parentEntry.depth + 1;
333
+ if (parentEntry && parentEntry.span_id) {
334
+ depth = this._spanDepths[parentEntry.span_id] + 1;
265
335
  parentSpanId = parentEntry.span_id;
266
336
  }
337
+ this._spanDepths[spanId] = depth;
267
338
  const entry = {
268
339
  type: 'enter',
269
340
  function: name,
270
341
  span_id: spanId,
271
342
  depth: depth,
272
- timestamp: startTime,
343
+ created_at: startTime,
273
344
  span_type: spanType,
274
- parent_span_id: parentSpanId
345
+ parent_span_id: parentSpanId,
346
+ message: name
275
347
  };
276
348
  this.addEntry(entry);
277
349
  traceClientContext.entryStack.push(entry);
@@ -279,21 +351,24 @@ class TraceClient {
279
351
  endSpan() {
280
352
  const traceClientContext = getTraceClientContext();
281
353
  const enterEntry = traceClientContext.entryStack.pop();
282
- if (!enterEntry) {
354
+ if (!enterEntry || !enterEntry.span_id) {
283
355
  console.warn("No enter entry to end");
284
356
  return;
285
357
  }
286
358
  const endTime = Date.now() / 1000;
287
- const duration = endTime - enterEntry.timestamp;
359
+ const duration = endTime - enterEntry.created_at;
288
360
  this.addEntry({
289
361
  type: 'exit',
290
362
  function: enterEntry.function,
291
363
  span_id: enterEntry.span_id,
292
- depth: enterEntry.depth,
293
- timestamp: endTime,
364
+ depth: this._spanDepths[enterEntry.span_id],
365
+ created_at: endTime,
294
366
  duration: duration,
295
- span_type: enterEntry.span_type
367
+ span_type: enterEntry.span_type,
368
+ message: `← ${enterEntry.function}`
296
369
  });
370
+ // Clean up depth tracking
371
+ delete this._spanDepths[enterEntry.span_id];
297
372
  }
298
373
  *span(name, options = {}) {
299
374
  if (!this.enableMonitoring) {
@@ -311,6 +386,7 @@ class TraceClient {
311
386
  condenseTrace(rawEntries) {
312
387
  var _a, _b, _c, _d, _e;
313
388
  const spansById = {};
389
+ const allEvaluationRuns = [];
314
390
  for (const entry of rawEntries) {
315
391
  const spanId = entry.span_id;
316
392
  if (!spanId)
@@ -320,12 +396,12 @@ class TraceClient {
320
396
  span_id: spanId,
321
397
  function: entry.function || 'unknown',
322
398
  depth: (_a = entry.depth) !== null && _a !== void 0 ? _a : 0,
323
- timestamp: (_b = entry.timestamp) !== null && _b !== void 0 ? _b : 0,
399
+ created_at: new Date(((_b = entry.created_at) !== null && _b !== void 0 ? _b : 0) * 1000).toISOString(), // Convert number to ISO string
400
+ trace_id: this.traceId, // Add trace_id
324
401
  parent_span_id: entry.parent_span_id,
325
402
  span_type: entry.span_type || 'span',
326
403
  inputs: null,
327
404
  output: null,
328
- evaluation_runs: [],
329
405
  duration: null,
330
406
  children: []
331
407
  };
@@ -335,14 +411,14 @@ class TraceClient {
335
411
  case 'enter':
336
412
  currentSpanData.function = entry.function || currentSpanData.function;
337
413
  currentSpanData.depth = (_c = entry.depth) !== null && _c !== void 0 ? _c : currentSpanData.depth;
338
- currentSpanData.timestamp = (_d = entry.timestamp) !== null && _d !== void 0 ? _d : currentSpanData.timestamp;
414
+ currentSpanData.created_at = new Date(((_d = entry.created_at) !== null && _d !== void 0 ? _d : 0) * 1000).toISOString(); // Ensure created_at is string on update
339
415
  currentSpanData.parent_span_id = entry.parent_span_id;
340
416
  currentSpanData.span_type = entry.span_type || currentSpanData.span_type;
341
- currentSpanData.start_time = entry.timestamp;
417
+ currentSpanData.start_time = entry.created_at; // Keep original number for duration calc
342
418
  break;
343
419
  case 'exit':
344
420
  currentSpanData.duration = (_e = entry.duration) !== null && _e !== void 0 ? _e : currentSpanData.duration;
345
- currentSpanData.end_time = entry.timestamp;
421
+ currentSpanData.end_time = entry.created_at; // Keep original number for duration calc
346
422
  if (currentSpanData.duration === null && currentSpanData.start_time && currentSpanData.end_time) {
347
423
  currentSpanData.duration = currentSpanData.end_time - currentSpanData.start_time;
348
424
  }
@@ -358,10 +434,8 @@ class TraceClient {
358
434
  case 'output':
359
435
  case 'error':
360
436
  currentSpanData.output = entry.output;
361
- break;
362
- case 'evaluation':
363
- if (entry.evaluation_runs) {
364
- currentSpanData.evaluation_runs.push(...entry.evaluation_runs);
437
+ if (entry.type === 'output' && entry.output && typeof entry.output === 'object' && 'eval_name' in entry.output && 'scorers' in entry.output && 'trace_span_id' in entry.output) {
438
+ allEvaluationRuns.push(entry.output);
365
439
  }
366
440
  break;
367
441
  }
@@ -392,9 +466,11 @@ class TraceClient {
392
466
  childrenMap[parentId].push(span);
393
467
  }
394
468
  }
395
- roots.sort((a, b) => a.timestamp - b.timestamp);
469
+ // Sort using parsed dates
470
+ roots.sort((a, b) => Date.parse(a.created_at) - Date.parse(b.created_at));
396
471
  for (const parentId in childrenMap) {
397
- childrenMap[parentId].sort((a, b) => a.timestamp - b.timestamp);
472
+ // Sort using parsed dates
473
+ childrenMap[parentId].sort((a, b) => Date.parse(a.created_at) - Date.parse(b.created_at));
398
474
  }
399
475
  function buildFlatListDfs(span) {
400
476
  if (visited.has(span.span_id))
@@ -415,26 +491,36 @@ class TraceClient {
415
491
  buildFlatListDfs(span);
416
492
  }
417
493
  }
418
- return sortedCondensedList;
494
+ return [sortedCondensedList, allEvaluationRuns];
419
495
  }
420
496
  save() {
421
497
  return __awaiter(this, arguments, void 0, function* (emptySave = false) {
498
+ var _a, _b, _c, _d, _e;
422
499
  if (!this.enableMonitoring || !this.traceManager) {
423
500
  return null;
424
501
  }
425
502
  const traceClientContext = getTraceClientContext();
426
503
  const totalDuration = this.getDuration();
427
- const condensedEntries = this.condenseTrace(traceClientContext.entries);
504
+ // Use the tuple returned by condenseTrace
505
+ const [condensedEntries, evaluationRuns] = this.condenseTrace(traceClientContext.entries);
428
506
  const tokenCounts = {
429
- prompt_tokens: 0, completion_tokens: 0, total_tokens: 0,
430
- prompt_tokens_cost_usd: 0.0, completion_tokens_cost_usd: 0.0, total_cost_usd: 0.0
507
+ prompt_tokens: 0,
508
+ completion_tokens: 0,
509
+ total_tokens: 0,
510
+ prompt_tokens_cost_usd: 0.0,
511
+ completion_tokens_cost_usd: 0.0,
512
+ total_cost_usd: 0.0
431
513
  };
432
- condensedEntries.forEach(entry => {
433
- var _a, _b;
514
+ // First pass: collect all LLM calls with their token counts
515
+ const llmCalls = [];
516
+ let index = 0;
517
+ for (const entry of condensedEntries) {
434
518
  if (entry.span_type === 'llm' && ((_a = entry.output) === null || _a === void 0 ? void 0 : _a.usage)) {
435
519
  const usage = entry.output.usage;
520
+ const modelName = ((_b = entry.inputs) === null || _b === void 0 ? void 0 : _b.model) || "";
436
521
  let promptTokens = 0;
437
522
  let completionTokens = 0;
523
+ // Handle different token naming conventions
438
524
  if (usage.prompt_tokens !== undefined || usage.completion_tokens !== undefined) {
439
525
  promptTokens = usage.prompt_tokens || 0;
440
526
  completionTokens = usage.completion_tokens || 0;
@@ -442,6 +528,7 @@ class TraceClient {
442
528
  else if (usage.input_tokens !== undefined || usage.output_tokens !== undefined) {
443
529
  promptTokens = usage.input_tokens || 0;
444
530
  completionTokens = usage.output_tokens || 0;
531
+ // Standardize naming
445
532
  usage.prompt_tokens = promptTokens;
446
533
  usage.completion_tokens = completionTokens;
447
534
  delete usage.input_tokens;
@@ -450,33 +537,82 @@ class TraceClient {
450
537
  tokenCounts.prompt_tokens += promptTokens;
451
538
  tokenCounts.completion_tokens += completionTokens;
452
539
  tokenCounts.total_tokens += usage.total_tokens || (promptTokens + completionTokens);
453
- const modelName = ((_b = entry.inputs) === null || _b === void 0 ? void 0 : _b.model) || "";
540
+ // Add to list of calls for cost calculation
454
541
  if (modelName) {
455
- try {
456
- const promptCost = 0.0;
457
- const completionCost = 0.0;
458
- const callTotalCost = promptCost + completionCost;
459
- usage.prompt_tokens_cost_usd = promptCost;
460
- usage.completion_tokens_cost_usd = completionCost;
461
- usage.total_cost_usd = callTotalCost;
462
- tokenCounts.prompt_tokens_cost_usd += promptCost;
463
- tokenCounts.completion_tokens_cost_usd += completionCost;
464
- tokenCounts.total_cost_usd += callTotalCost;
542
+ llmCalls.push({
543
+ modelName,
544
+ promptTokens,
545
+ completionTokens,
546
+ entryIndex: index
547
+ });
548
+ }
549
+ }
550
+ index++;
551
+ }
552
+ // Second pass: calculate costs for each LLM call using the API
553
+ if (this.traceManager && llmCalls.length > 0) {
554
+ // Process each LLM call
555
+ for (const call of llmCalls) {
556
+ try {
557
+ // Get costs from the API
558
+ const costs = yield this.traceManager.calculateTokenCosts(call.modelName, call.promptTokens, call.completionTokens);
559
+ if (costs) {
560
+ // Update the entry with the costs
561
+ const entry = condensedEntries[call.entryIndex];
562
+ // Ensure output and usage objects exist before assigning costs
563
+ if (entry.output && entry.output.usage) {
564
+ // --- This part assigns costs to the individual span ---
565
+ entry.output.usage.prompt_tokens_cost_usd = costs.prompt_tokens_cost_usd;
566
+ entry.output.usage.completion_tokens_cost_usd = costs.completion_tokens_cost_usd;
567
+ entry.output.usage.total_cost_usd = costs.total_cost_usd;
568
+ logger_instance_js_1.default.debug(`Assigned costs to span ${entry.span_id} (model: ${call.modelName})`, { costs }); // Added debug log
569
+ // -----------------------------------------------------
570
+ }
571
+ else {
572
+ logger_instance_js_1.default.warn(`Could not assign costs to span ${entry.span_id} (model: ${call.modelName}): Missing 'output' or 'output.usage' object.`, { output: entry.output }); // Log if structure is missing
573
+ }
574
+ // Add to the total costs for the trace
575
+ tokenCounts.prompt_tokens_cost_usd += (_c = costs.prompt_tokens_cost_usd) !== null && _c !== void 0 ? _c : 0.0;
576
+ tokenCounts.completion_tokens_cost_usd += (_d = costs.completion_tokens_cost_usd) !== null && _d !== void 0 ? _d : 0.0;
577
+ tokenCounts.total_cost_usd += (_e = costs.total_cost_usd) !== null && _e !== void 0 ? _e : 0.0;
465
578
  }
466
- catch (e) {
467
- console.warn(`Error calculating cost for model '${modelName}':`, e);
468
- usage.prompt_tokens_cost_usd = null;
469
- usage.completion_tokens_cost_usd = null;
470
- usage.total_cost_usd = null;
579
+ else {
580
+ // If calculation failed, set costs to null in the entry (matching Python behavior)
581
+ const entry = condensedEntries[call.entryIndex];
582
+ // Ensure output and usage objects exist before assigning null costs
583
+ if (entry.output && entry.output.usage) {
584
+ // --- Sets null costs on the individual span ---
585
+ entry.output.usage.prompt_tokens_cost_usd = null;
586
+ entry.output.usage.completion_tokens_cost_usd = null;
587
+ entry.output.usage.total_cost_usd = null;
588
+ // ------------------------------------------
589
+ }
590
+ else {
591
+ // Log if we can't even assign null because the structure is missing
592
+ logger_instance_js_1.default.warn(`Could not assign null costs to span ${entry.span_id} (model: ${call.modelName}): Missing 'output' or 'output.usage' object.`, { output: entry.output });
593
+ }
594
+ logger_instance_js_1.default.warn(`Token cost calculation failed for model '${call.modelName}'. Cost information will be null for this span.`); // More specific warning
471
595
  }
472
596
  }
473
- else {
474
- usage.prompt_tokens_cost_usd = null;
475
- usage.completion_tokens_cost_usd = null;
476
- usage.total_cost_usd = null;
597
+ catch (e) {
598
+ logger_instance_js_1.default.warn(`Error during cost calculation loop for model '${call.modelName}':`, e); // Adjusted logging
599
+ // Set costs to null in the entry if an error occurs during the loop iteration
600
+ const entry = condensedEntries[call.entryIndex];
601
+ // Ensure output and usage objects exist before assigning null costs on error
602
+ if (entry.output && entry.output.usage) {
603
+ // --- Sets null costs on the individual span on error ---
604
+ entry.output.usage.prompt_tokens_cost_usd = null;
605
+ entry.output.usage.completion_tokens_cost_usd = null;
606
+ entry.output.usage.total_cost_usd = null;
607
+ // ----------------------------------------------------
608
+ }
609
+ else {
610
+ // Log if we can't assign null on error because the structure is missing
611
+ logger_instance_js_1.default.warn(`Could not assign null costs to span ${entry.span_id} (model: ${call.modelName}) on error: Missing 'output' or 'output.usage' object.`, { output: entry.output });
612
+ }
477
613
  }
478
614
  }
479
- });
615
+ }
480
616
  // Convert rules array to a dictionary (Record<string, Rule>)
481
617
  const rulesDict = {};
482
618
  this.rules.forEach(rule => {
@@ -493,16 +629,15 @@ class TraceClient {
493
629
  duration: totalDuration,
494
630
  token_counts: tokenCounts,
495
631
  entries: condensedEntries,
496
- rules: rulesDict,
497
- empty_save: emptySave,
632
+ evaluation_runs: evaluationRuns,
498
633
  overwrite: this.overwrite,
499
634
  parent_trace_id: this.parentTraceId,
500
635
  parent_name: this.parentName
501
636
  };
502
637
  try {
503
- yield this.traceManager.saveTrace(traceData, emptySave);
638
+ yield this.traceManager.saveTrace(traceData);
504
639
  logger_instance_js_1.default.info(`Trace ${this.traceId} saved successfully.`);
505
- if (!emptySave && this.enableEvaluations) {
640
+ if (this.enableEvaluations) {
506
641
  try {
507
642
  yield this.traceManager.addTraceToEvalQueue(traceData);
508
643
  logger_instance_js_1.default.info(`Trace ${this.traceId} added to evaluation queue.`);
@@ -536,7 +671,7 @@ class TraceClient {
536
671
  traceClientContext.entries.forEach(entry => {
537
672
  var _a;
538
673
  const indent = " ".repeat((_a = entry.depth) !== null && _a !== void 0 ? _a : 0);
539
- const timeStr = entry.timestamp ? `@ ${new Date(entry.timestamp * 1000).toISOString()}` : '';
674
+ const timeStr = entry.created_at ? `@ ${new Date(entry.created_at * 1000).toISOString()}` : '';
540
675
  const shortSpanId = entry.span_id ? `(id: ${entry.span_id.substring(0, 8)}...)` : '';
541
676
  const shortParentId = entry.parent_span_id ? `(parent: ${entry.parent_span_id.substring(0, 8)}...)` : '';
542
677
  try {
@@ -567,17 +702,6 @@ class TraceClient {
567
702
  // Keep console.log
568
703
  console.log(`${indent} ${prefix} (for ${shortSpanId}): ${outputStr || 'null'}`);
569
704
  break;
570
- case 'evaluation':
571
- let evalStr = JSON.stringify(entry.evaluation_runs);
572
- if (evalStr && evalStr.length > 200) {
573
- evalStr = evalStr.substring(0, 197) + '...';
574
- }
575
- // Keep console.log
576
- console.log(`${indent} Evaluation (for ${shortSpanId}): ${evalStr || '[]'}`);
577
- break;
578
- default:
579
- // Keep console.log
580
- console.log(`${indent}? Unknown entry type: ${JSON.stringify(entry)}`);
581
705
  }
582
706
  }
583
707
  catch (stringifyError) {
@@ -617,9 +741,7 @@ class TraceClient {
617
741
  * @returns Promise that resolves when the evaluation entry has been added to the trace
618
742
  */
619
743
  asyncEvaluate(scorers_1) {
620
- return __awaiter(this, arguments, void 0, function* (
621
- // Accept general Scorer type, but filter/check for API scorers internally
622
- scorers, options = {}) {
744
+ return __awaiter(this, arguments, void 0, function* (scorers, options = {}) {
623
745
  if (!this.enableEvaluations) {
624
746
  logger_instance_js_1.default.warn("Evaluations are disabled. Skipping async evaluation.");
625
747
  return;
@@ -634,7 +756,12 @@ class TraceClient {
634
756
  logger_instance_js_1.default.warn("No APIJudgmentScorers found in the provided scorers list. Skipping async evaluation as backend requires API scorers.");
635
757
  return;
636
758
  }
637
- const startTime = Date.now() / 1000; // Record start time in seconds
759
+ // Process rules (currently just using this.rules directly)
760
+ const loadedRules = this.rules; // TODO: Add ScorerWrapper-like processing if needed in TS
761
+ // Prevent using JudgevalScorer with rules - only APIJudgmentScorer allowed with rules
762
+ if (loadedRules && loadedRules.length > 0 && scorers.some(s => !(s instanceof base_scorer_js_1.APIJudgmentScorer))) {
763
+ throw new Error("Cannot use Judgeval scorers, you can only use API scorers when using rules. Please either remove rules or use only APIJudgmentScorer types.");
764
+ }
638
765
  // Create example structure matching Python/backend expectations
639
766
  const example = {
640
767
  input: options.input || "",
@@ -654,6 +781,7 @@ class TraceClient {
654
781
  logger_instance_js_1.default.warn(`No current entry to record evaluation to\nStack trace: ${new Error().stack}`);
655
782
  return;
656
783
  }
784
+ const currentSpanId = currentEntry.span_id; // Get the span ID
657
785
  // --- Create evaluation run name (similar to Python) ---
658
786
  // Capitalize scorer names
659
787
  const scorerNames = apiScorers.map(scorer => {
@@ -663,11 +791,9 @@ class TraceClient {
663
791
  return name.charAt(0).toUpperCase() + name.slice(1);
664
792
  }).join(',');
665
793
  // Use trace name and shortened span ID (or trace ID if no span)
666
- const idPart = currentEntry ? currentEntry.span_id.substring(0, 8) : this.traceId.substring(0, 8);
794
+ const idPart = currentSpanId ? currentSpanId.substring(0, 8) : this.traceId.substring(0, 8);
667
795
  const evalName = `${this.name.charAt(0).toUpperCase() + this.name.slice(1)}-${idPart}-[${scorerNames}]`;
668
796
  // --- End eval name creation ---
669
- // Process rules (currently just using this.rules directly)
670
- const loadedRules = this.rules; // TODO: Add ScorerWrapper-like processing if needed in TS
671
797
  // Construct the evaluation payload
672
798
  const evalRunPayload = {
673
799
  organization_id: this.organizationId,
@@ -680,47 +806,18 @@ class TraceClient {
680
806
  metadata: {}, // Matches Python tracer
681
807
  judgment_api_key: this.apiKey,
682
808
  override: this.overwrite, // Use trace's overwrite setting
683
- rules: loadedRules // Pass the processed rules
809
+ rules: loadedRules, // Pass the processed rules
810
+ trace_span_id: currentSpanId // <<< RENAMED: Assign the current span ID (matching backend)
684
811
  };
685
- // Add evaluation entry using the helper method
686
- this._addEvalRun(evalRunPayload, startTime);
812
+ // Add evaluation entry to the trace
813
+ this.recordOutput(evalRunPayload);
687
814
  }
688
815
  catch (error) {
689
- console.error(`Failed during asyncEvaluate execution: ${error instanceof Error ? error.message : String(error)}`);
690
- // Decide if we should re-throw or just log
816
+ logger_instance_js_1.default.error(`Failed during asyncEvaluate execution: ${error instanceof Error ? error.message : String(error)}`);
817
+ throw error; // Re-throw after logging
691
818
  }
692
819
  });
693
820
  }
694
- /**
695
- * Private helper to add an evaluation entry to the trace.
696
- * This mirrors the structure of Python's add_eval_run.
697
- *
698
- * @param evalRunPayload The constructed payload for the evaluation.
699
- * @param startTime The start time (in seconds) of the evaluation process.
700
- */
701
- _addEvalRun(evalRunPayload, startTime) {
702
- var _a, _b;
703
- const traceClientContext = getTraceClientContext();
704
- const currentEntry = traceClientContext.entryStack.at(-1);
705
- if (!currentEntry) {
706
- logger_instance_js_1.default.warn(`No current entry to record evaluation to\nStack trace: ${new Error().stack}`);
707
- return;
708
- }
709
- const function_ = (_a = currentEntry.function) !== null && _a !== void 0 ? _a : "unknown_function";
710
- const depth = (_b = currentEntry.depth) !== null && _b !== void 0 ? _b : 0;
711
- const duration = Date.now() / 1000 - startTime;
712
- // Add evaluation entry to the trace
713
- this.addEntry({
714
- type: "evaluation",
715
- function: function_,
716
- span_id: currentEntry.span_id, // May be undefined
717
- depth: depth,
718
- timestamp: Date.now() / 1000,
719
- evaluation_runs: [evalRunPayload], // Embed the payload
720
- duration: duration,
721
- span_type: "evaluation"
722
- });
723
- }
724
821
  // OPTIONAL: Add a method to get the original name if needed elsewhere
725
822
  getOriginalName() {
726
823
  return this.originalName;
@@ -802,11 +899,6 @@ class Tracer {
802
899
  apiKey: this.apiKey,
803
900
  organizationId: this.organizationId,
804
901
  });
805
- if (traceClient.enableMonitoring) {
806
- traceClient.save(true).catch(err => {
807
- logger_instance_js_1.default.error(`Failed to save empty trace (${traceClient.traceId}):`, err);
808
- });
809
- }
810
902
  return traceClient;
811
903
  }
812
904
  *trace(name, options = {}) {