judgeval 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. package/README.md +95 -68
  2. package/dist/cjs/common/tracer.js +235 -143
  3. package/dist/cjs/common/tracer.js.map +1 -1
  4. package/dist/cjs/constants.js +8 -5
  5. package/dist/cjs/constants.js.map +1 -1
  6. package/dist/cjs/data/datasets/eval-dataset-client.js +349 -0
  7. package/dist/cjs/data/datasets/eval-dataset-client.js.map +1 -0
  8. package/dist/cjs/data/datasets/eval-dataset.js +405 -0
  9. package/dist/cjs/data/datasets/eval-dataset.js.map +1 -0
  10. package/dist/cjs/data/example.js +22 -1
  11. package/dist/cjs/data/example.js.map +1 -1
  12. package/dist/cjs/e2etests/eval-operations.test.js +282 -0
  13. package/dist/cjs/e2etests/eval-operations.test.js.map +1 -0
  14. package/dist/cjs/e2etests/judgee-traces.test.js +278 -0
  15. package/dist/cjs/e2etests/judgee-traces.test.js.map +1 -0
  16. package/dist/cjs/index.js +1 -3
  17. package/dist/cjs/index.js.map +1 -1
  18. package/dist/cjs/judgment-client.js +326 -645
  19. package/dist/cjs/judgment-client.js.map +1 -1
  20. package/dist/cjs/scorers/api-scorer.js +56 -48
  21. package/dist/cjs/scorers/api-scorer.js.map +1 -1
  22. package/dist/cjs/scorers/base-scorer.js +66 -11
  23. package/dist/cjs/scorers/base-scorer.js.map +1 -1
  24. package/dist/esm/common/tracer.js +236 -144
  25. package/dist/esm/common/tracer.js.map +1 -1
  26. package/dist/esm/constants.js +7 -4
  27. package/dist/esm/constants.js.map +1 -1
  28. package/dist/esm/data/datasets/eval-dataset-client.js +342 -0
  29. package/dist/esm/data/datasets/eval-dataset-client.js.map +1 -0
  30. package/dist/esm/data/datasets/eval-dataset.js +375 -0
  31. package/dist/esm/data/datasets/eval-dataset.js.map +1 -0
  32. package/dist/esm/data/example.js +22 -1
  33. package/dist/esm/data/example.js.map +1 -1
  34. package/dist/esm/e2etests/eval-operations.test.js +254 -0
  35. package/dist/esm/e2etests/eval-operations.test.js.map +1 -0
  36. package/dist/esm/e2etests/judgee-traces.test.js +253 -0
  37. package/dist/esm/e2etests/judgee-traces.test.js.map +1 -0
  38. package/dist/esm/index.js +0 -1
  39. package/dist/esm/index.js.map +1 -1
  40. package/dist/esm/judgment-client.js +328 -647
  41. package/dist/esm/judgment-client.js.map +1 -1
  42. package/dist/esm/scorers/api-scorer.js +56 -48
  43. package/dist/esm/scorers/api-scorer.js.map +1 -1
  44. package/dist/esm/scorers/base-scorer.js +66 -11
  45. package/dist/esm/scorers/base-scorer.js.map +1 -1
  46. package/dist/types/common/tracer.d.ts +27 -14
  47. package/dist/types/constants.d.ts +4 -4
  48. package/dist/types/data/datasets/eval-dataset-client.d.ts +39 -0
  49. package/dist/types/data/datasets/eval-dataset.d.ts +45 -0
  50. package/dist/types/data/example.d.ts +24 -12
  51. package/dist/types/e2etests/eval-operations.test.d.ts +5 -0
  52. package/dist/types/e2etests/judgee-traces.test.d.ts +5 -0
  53. package/dist/types/index.d.ts +0 -1
  54. package/dist/types/judgment-client.d.ts +3 -47
  55. package/dist/types/scorers/api-scorer.d.ts +15 -15
  56. package/dist/types/scorers/base-scorer.d.ts +53 -10
  57. package/package.json +2 -1
  58. package/dist/cjs/scorers/exact-match-scorer.js +0 -84
  59. package/dist/cjs/scorers/exact-match-scorer.js.map +0 -1
  60. package/dist/esm/scorers/exact-match-scorer.js +0 -80
  61. package/dist/esm/scorers/exact-match-scorer.js.map +0 -1
  62. package/dist/types/scorers/exact-match-scorer.d.ts +0 -10
@@ -14,7 +14,7 @@ import { AsyncLocalStorage } from 'async_hooks';
14
14
  import OpenAI from 'openai';
15
15
  import Anthropic from '@anthropic-ai/sdk';
16
16
  // Local Imports
17
- import { JUDGMENT_TRACES_SAVE_API_URL, JUDGMENT_TRACES_FETCH_API_URL, JUDGMENT_TRACES_DELETE_API_URL, JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL,
17
+ import { JUDGMENT_TRACES_SAVE_API_URL, JUDGMENT_TRACES_FETCH_API_URL, JUDGMENT_TRACES_DELETE_API_URL, JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL, JUDGMENT_CALCULATE_TOKEN_COSTS_API_URL,
18
18
  // Add other necessary constants if needed
19
19
  } from '../constants.js';
20
20
  import { APIJudgmentScorer } from '../scorers/base-scorer.js';
@@ -70,16 +70,13 @@ class TraceManagerClient {
70
70
  try {
71
71
  // Use isomorphic fetch (available globally in modern Node.js and browsers)
72
72
  const response = yield fetch(url, Object.assign(Object.assign({}, options), { headers: headers }));
73
- if (!response.ok) {
74
- const errorBody = yield response.text();
75
- console.error(`API Error (${response.status}) for ${options.method || 'GET'} ${url}: ${errorBody}`);
76
- throw new Error(`Judgment API request failed: ${response.status} ${response.statusText} - ${errorBody}`);
77
- }
73
+ // We will return the response object even if !response.ok
74
+ // The caller (e.g., saveTrace) is responsible for checking response.ok or response.status
78
75
  // Handle cases where the response might be empty (e.g., 204 No Content on DELETE)
79
76
  if (response.status === 204) {
80
77
  return null; // Indicate success with no content
81
78
  }
82
- return yield response.json();
79
+ return response;
83
80
  }
84
81
  catch (error) {
85
82
  console.error(`Network or fetch error during ${options.method || 'GET'} ${url}:`, error);
@@ -96,21 +93,52 @@ class TraceManagerClient {
96
93
  });
97
94
  });
98
95
  }
99
- saveTrace(traceData, emptySave) {
96
+ saveTrace(traceData) {
100
97
  return __awaiter(this, void 0, void 0, function* () {
98
+ // _fetch now returns the raw response object or throws on network error
101
99
  const response = yield this._fetch(JUDGMENT_TRACES_SAVE_API_URL, {
102
100
  method: 'POST',
103
- body: JSON.stringify(traceData),
101
+ body: JSON.stringify(traceData), // Stringify directly here again
104
102
  });
105
- // Optionally log the UI URL like the Python version
106
- if (!emptySave && (response === null || response === void 0 ? void 0 : response.ui_results_url)) {
107
- // Use console.info or a dedicated logger for user-facing messages
108
- // Note: We can't replicate Rich library's colored link easily in standard console
103
+ // Check if _fetch threw a network error (caught below) or returned an invalid object
104
+ if (!response) {
105
+ // This case should ideally be caught by _fetch's catch block, but double-check
106
+ throw new Error('Failed to save trace data: No response received from API.');
107
+ }
108
+ // Now, check the status code on the received response object
109
+ if (response.status === 400) {
110
+ // Attempt to get error body for more info
111
+ const errorBody = yield response.text();
112
+ throw new Error(`Failed to save trace data: Check your Trace name for conflicts, set overwrite=True to overwrite existing traces: ${response.status} ${response.statusText || ''} - ${errorBody}`);
113
+ }
114
+ else if (!response.ok) { // Handles other errors (5xx, 4xx except 400)
115
+ const errorBody = yield response.text();
116
+ throw new Error(`Failed to save trace data: Status ${response.status} ${response.statusText || '(No status text)'} - ${errorBody}`);
117
+ }
118
+ // --- Success Path ---
119
+ // Optionally log the UI URL (needs JSON parsing)
120
+ let responseData = null;
121
+ try {
122
+ // Handle 204 No Content specifically
123
+ if (response.status === 204) {
124
+ responseData = null; // Or maybe { success: true }?
125
+ }
126
+ else {
127
+ responseData = yield response.json(); // Parse JSON only on success
128
+ }
129
+ }
130
+ catch (parseError) {
131
+ logger.warn("Failed to parse successful API response JSON.", { error: parseError });
132
+ // Depending on requirements, maybe throw, maybe return a default success object
133
+ throw new Error(`API request succeeded (${response.status}), but failed to parse JSON response.`);
134
+ }
135
+ if (responseData === null || responseData === void 0 ? void 0 : responseData.ui_results_url) {
109
136
  console.info(`
110
- 🔍 View trace: ${response.ui_results_url}
137
+ 🔍 View trace: ${responseData.ui_results_url}
111
138
  `);
112
139
  }
113
- return response;
140
+ // Return the parsed data (or null for 204)
141
+ return responseData;
114
142
  });
115
143
  }
116
144
  deleteTrace(traceId) {
@@ -139,6 +167,50 @@ class TraceManagerClient {
139
167
  });
140
168
  });
141
169
  }
170
+ /**
171
+ * Calculate token costs directly using the API endpoint.
172
+ * This is more accurate than client-side calculation as it uses the most up-to-date pricing.
173
+ *
174
+ * @param model The model name (e.g. 'gpt-4', 'claude-3-opus-20240229')
175
+ * @param promptTokens Number of tokens in the prompt/input
176
+ * @param completionTokens Number of tokens in the completion/output
177
+ * @returns Object containing token counts and calculated costs in USD
178
+ */
179
+ calculateTokenCosts(model, promptTokens, completionTokens) {
180
+ return __awaiter(this, void 0, void 0, function* () {
181
+ try {
182
+ // Use the new calculation endpoint
183
+ const response = yield this._fetch(JUDGMENT_CALCULATE_TOKEN_COSTS_API_URL, {
184
+ method: 'POST',
185
+ body: JSON.stringify({
186
+ model,
187
+ prompt_tokens: promptTokens,
188
+ completion_tokens: completionTokens
189
+ })
190
+ });
191
+ // Check if the response is okay and parse JSON
192
+ if (response && response.ok) {
193
+ const data = yield response.json();
194
+ return data;
195
+ }
196
+ else if (response) {
197
+ // Log error if response was not ok
198
+ const errorBody = yield response.text();
199
+ logger.warn(`API error calculating token costs for model ${model}: ${response.status} ${response.statusText}`, { errorBody });
200
+ return null;
201
+ }
202
+ else {
203
+ // Handle cases where _fetch might return null or undefined (though it shouldn't with current implementation)
204
+ logger.warn(`No response received when calculating token costs for model ${model}.`);
205
+ return null;
206
+ }
207
+ }
208
+ catch (error) {
209
+ logger.warn(`Failed to calculate token costs for model ${model}.`, { error: error instanceof Error ? error.message : String(error) });
210
+ return null;
211
+ }
212
+ });
213
+ }
142
214
  }
143
215
  // --- Helper Functions ---
144
216
  // Helper function to sanitize names (e.g., replace spaces with underscores)
@@ -155,6 +227,7 @@ class TraceClient {
155
227
  constructor(config) {
156
228
  var _a, _b, _c, _d, _e;
157
229
  this.traceManager = null; // Can be null if monitoring disabled
230
+ this._spanDepths = {}; // Track depth of active spans
158
231
  this.traceId = config.traceId || uuidv4();
159
232
  this.originalName = config.name || 'default_trace'; // Store original
160
233
  this.name = sanitizeName(this.originalName); // Use sanitized name internally
@@ -193,7 +266,7 @@ class TraceClient {
193
266
  recordInput(inputs) {
194
267
  const traceClientContext = getTraceClientContext();
195
268
  const currentEntry = traceClientContext.entryStack.at(-1);
196
- if (!currentEntry) {
269
+ if (!currentEntry || !currentEntry.span_id) {
197
270
  console.warn(`No current entry to record input to\nStack trace: ${new Error().stack}`);
198
271
  return;
199
272
  }
@@ -202,14 +275,16 @@ class TraceClient {
202
275
  span_id: currentEntry.span_id,
203
276
  inputs,
204
277
  function: currentEntry.function,
205
- depth: currentEntry.depth,
206
- span_type: currentEntry.span_type
278
+ depth: this._spanDepths[currentEntry.span_id],
279
+ created_at: Date.now() / 1000,
280
+ span_type: currentEntry.span_type,
281
+ message: `Inputs to ${currentEntry.function}`
207
282
  });
208
283
  }
209
284
  recordOutput(output) {
210
285
  const traceClientContext = getTraceClientContext();
211
286
  const currentEntry = traceClientContext.entryStack.at(-1);
212
- if (!currentEntry) {
287
+ if (!currentEntry || !currentEntry.span_id) {
213
288
  console.warn(`No current entry to record output to\nStack trace: ${new Error().stack}`);
214
289
  return;
215
290
  }
@@ -218,33 +293,28 @@ class TraceClient {
218
293
  span_id: currentEntry.span_id,
219
294
  output,
220
295
  function: currentEntry.function,
221
- depth: currentEntry.depth,
222
- span_type: currentEntry.span_type
296
+ depth: this._spanDepths[currentEntry.span_id],
297
+ created_at: Date.now() / 1000,
298
+ span_type: currentEntry.span_type,
299
+ message: `Output from ${currentEntry.function}`
223
300
  });
224
301
  }
225
302
  recordError(error) {
226
- var _a;
227
303
  const traceClientContext = getTraceClientContext();
228
304
  const currentEntry = traceClientContext.entryStack.at(-1);
229
- if (!currentEntry) {
305
+ if (!currentEntry || !currentEntry.span_id) {
230
306
  console.warn(`No current entry to record error to\nStack trace: ${new Error().stack}`);
231
307
  return;
232
308
  }
233
- let output = error;
234
- if (error instanceof Error) {
235
- output = {
236
- name: error.name,
237
- message: error.message,
238
- stack: (_a = error.stack) === null || _a === void 0 ? void 0 : _a.substring(0, 1000)
239
- };
240
- }
241
309
  this.addEntry({
242
310
  type: 'error',
243
311
  span_id: currentEntry.span_id,
244
- output,
312
+ output: error,
245
313
  function: currentEntry.function,
246
- depth: currentEntry.depth,
247
- span_type: currentEntry.span_type
314
+ depth: this._spanDepths[currentEntry.span_id],
315
+ created_at: Date.now() / 1000,
316
+ span_type: currentEntry.span_type,
317
+ message: `Error from ${currentEntry.function}`
248
318
  });
249
319
  }
250
320
  startSpan(name, options = {}) {
@@ -255,18 +325,20 @@ class TraceClient {
255
325
  const spanType = (_a = options.spanType) !== null && _a !== void 0 ? _a : 'span';
256
326
  const startTime = Date.now() / 1000;
257
327
  let depth = 0, parentSpanId = undefined;
258
- if (parentEntry) {
259
- depth = parentEntry.depth + 1;
328
+ if (parentEntry && parentEntry.span_id) {
329
+ depth = this._spanDepths[parentEntry.span_id] + 1;
260
330
  parentSpanId = parentEntry.span_id;
261
331
  }
332
+ this._spanDepths[spanId] = depth;
262
333
  const entry = {
263
334
  type: 'enter',
264
335
  function: name,
265
336
  span_id: spanId,
266
337
  depth: depth,
267
- timestamp: startTime,
338
+ created_at: startTime,
268
339
  span_type: spanType,
269
- parent_span_id: parentSpanId
340
+ parent_span_id: parentSpanId,
341
+ message: name
270
342
  };
271
343
  this.addEntry(entry);
272
344
  traceClientContext.entryStack.push(entry);
@@ -274,21 +346,24 @@ class TraceClient {
274
346
  endSpan() {
275
347
  const traceClientContext = getTraceClientContext();
276
348
  const enterEntry = traceClientContext.entryStack.pop();
277
- if (!enterEntry) {
349
+ if (!enterEntry || !enterEntry.span_id) {
278
350
  console.warn("No enter entry to end");
279
351
  return;
280
352
  }
281
353
  const endTime = Date.now() / 1000;
282
- const duration = endTime - enterEntry.timestamp;
354
+ const duration = endTime - enterEntry.created_at;
283
355
  this.addEntry({
284
356
  type: 'exit',
285
357
  function: enterEntry.function,
286
358
  span_id: enterEntry.span_id,
287
- depth: enterEntry.depth,
288
- timestamp: endTime,
359
+ depth: this._spanDepths[enterEntry.span_id],
360
+ created_at: endTime,
289
361
  duration: duration,
290
- span_type: enterEntry.span_type
362
+ span_type: enterEntry.span_type,
363
+ message: `← ${enterEntry.function}`
291
364
  });
365
+ // Clean up depth tracking
366
+ delete this._spanDepths[enterEntry.span_id];
292
367
  }
293
368
  *span(name, options = {}) {
294
369
  if (!this.enableMonitoring) {
@@ -306,6 +381,7 @@ class TraceClient {
306
381
  condenseTrace(rawEntries) {
307
382
  var _a, _b, _c, _d, _e;
308
383
  const spansById = {};
384
+ const allEvaluationRuns = [];
309
385
  for (const entry of rawEntries) {
310
386
  const spanId = entry.span_id;
311
387
  if (!spanId)
@@ -315,12 +391,12 @@ class TraceClient {
315
391
  span_id: spanId,
316
392
  function: entry.function || 'unknown',
317
393
  depth: (_a = entry.depth) !== null && _a !== void 0 ? _a : 0,
318
- timestamp: (_b = entry.timestamp) !== null && _b !== void 0 ? _b : 0,
394
+ created_at: new Date(((_b = entry.created_at) !== null && _b !== void 0 ? _b : 0) * 1000).toISOString(), // Convert number to ISO string
395
+ trace_id: this.traceId, // Add trace_id
319
396
  parent_span_id: entry.parent_span_id,
320
397
  span_type: entry.span_type || 'span',
321
398
  inputs: null,
322
399
  output: null,
323
- evaluation_runs: [],
324
400
  duration: null,
325
401
  children: []
326
402
  };
@@ -330,14 +406,14 @@ class TraceClient {
330
406
  case 'enter':
331
407
  currentSpanData.function = entry.function || currentSpanData.function;
332
408
  currentSpanData.depth = (_c = entry.depth) !== null && _c !== void 0 ? _c : currentSpanData.depth;
333
- currentSpanData.timestamp = (_d = entry.timestamp) !== null && _d !== void 0 ? _d : currentSpanData.timestamp;
409
+ currentSpanData.created_at = new Date(((_d = entry.created_at) !== null && _d !== void 0 ? _d : 0) * 1000).toISOString(); // Ensure created_at is string on update
334
410
  currentSpanData.parent_span_id = entry.parent_span_id;
335
411
  currentSpanData.span_type = entry.span_type || currentSpanData.span_type;
336
- currentSpanData.start_time = entry.timestamp;
412
+ currentSpanData.start_time = entry.created_at; // Keep original number for duration calc
337
413
  break;
338
414
  case 'exit':
339
415
  currentSpanData.duration = (_e = entry.duration) !== null && _e !== void 0 ? _e : currentSpanData.duration;
340
- currentSpanData.end_time = entry.timestamp;
416
+ currentSpanData.end_time = entry.created_at; // Keep original number for duration calc
341
417
  if (currentSpanData.duration === null && currentSpanData.start_time && currentSpanData.end_time) {
342
418
  currentSpanData.duration = currentSpanData.end_time - currentSpanData.start_time;
343
419
  }
@@ -353,10 +429,8 @@ class TraceClient {
353
429
  case 'output':
354
430
  case 'error':
355
431
  currentSpanData.output = entry.output;
356
- break;
357
- case 'evaluation':
358
- if (entry.evaluation_runs) {
359
- currentSpanData.evaluation_runs.push(...entry.evaluation_runs);
432
+ if (entry.type === 'output' && entry.output && typeof entry.output === 'object' && 'eval_name' in entry.output && 'scorers' in entry.output && 'trace_span_id' in entry.output) {
433
+ allEvaluationRuns.push(entry.output);
360
434
  }
361
435
  break;
362
436
  }
@@ -387,9 +461,11 @@ class TraceClient {
387
461
  childrenMap[parentId].push(span);
388
462
  }
389
463
  }
390
- roots.sort((a, b) => a.timestamp - b.timestamp);
464
+ // Sort using parsed dates
465
+ roots.sort((a, b) => Date.parse(a.created_at) - Date.parse(b.created_at));
391
466
  for (const parentId in childrenMap) {
392
- childrenMap[parentId].sort((a, b) => a.timestamp - b.timestamp);
467
+ // Sort using parsed dates
468
+ childrenMap[parentId].sort((a, b) => Date.parse(a.created_at) - Date.parse(b.created_at));
393
469
  }
394
470
  function buildFlatListDfs(span) {
395
471
  if (visited.has(span.span_id))
@@ -410,26 +486,36 @@ class TraceClient {
410
486
  buildFlatListDfs(span);
411
487
  }
412
488
  }
413
- return sortedCondensedList;
489
+ return [sortedCondensedList, allEvaluationRuns];
414
490
  }
415
491
  save() {
416
492
  return __awaiter(this, arguments, void 0, function* (emptySave = false) {
493
+ var _a, _b, _c, _d, _e;
417
494
  if (!this.enableMonitoring || !this.traceManager) {
418
495
  return null;
419
496
  }
420
497
  const traceClientContext = getTraceClientContext();
421
498
  const totalDuration = this.getDuration();
422
- const condensedEntries = this.condenseTrace(traceClientContext.entries);
499
+ // Use the tuple returned by condenseTrace
500
+ const [condensedEntries, evaluationRuns] = this.condenseTrace(traceClientContext.entries);
423
501
  const tokenCounts = {
424
- prompt_tokens: 0, completion_tokens: 0, total_tokens: 0,
425
- prompt_tokens_cost_usd: 0.0, completion_tokens_cost_usd: 0.0, total_cost_usd: 0.0
502
+ prompt_tokens: 0,
503
+ completion_tokens: 0,
504
+ total_tokens: 0,
505
+ prompt_tokens_cost_usd: 0.0,
506
+ completion_tokens_cost_usd: 0.0,
507
+ total_cost_usd: 0.0
426
508
  };
427
- condensedEntries.forEach(entry => {
428
- var _a, _b;
509
+ // First pass: collect all LLM calls with their token counts
510
+ const llmCalls = [];
511
+ let index = 0;
512
+ for (const entry of condensedEntries) {
429
513
  if (entry.span_type === 'llm' && ((_a = entry.output) === null || _a === void 0 ? void 0 : _a.usage)) {
430
514
  const usage = entry.output.usage;
515
+ const modelName = ((_b = entry.inputs) === null || _b === void 0 ? void 0 : _b.model) || "";
431
516
  let promptTokens = 0;
432
517
  let completionTokens = 0;
518
+ // Handle different token naming conventions
433
519
  if (usage.prompt_tokens !== undefined || usage.completion_tokens !== undefined) {
434
520
  promptTokens = usage.prompt_tokens || 0;
435
521
  completionTokens = usage.completion_tokens || 0;
@@ -437,6 +523,7 @@ class TraceClient {
437
523
  else if (usage.input_tokens !== undefined || usage.output_tokens !== undefined) {
438
524
  promptTokens = usage.input_tokens || 0;
439
525
  completionTokens = usage.output_tokens || 0;
526
+ // Standardize naming
440
527
  usage.prompt_tokens = promptTokens;
441
528
  usage.completion_tokens = completionTokens;
442
529
  delete usage.input_tokens;
@@ -445,33 +532,82 @@ class TraceClient {
445
532
  tokenCounts.prompt_tokens += promptTokens;
446
533
  tokenCounts.completion_tokens += completionTokens;
447
534
  tokenCounts.total_tokens += usage.total_tokens || (promptTokens + completionTokens);
448
- const modelName = ((_b = entry.inputs) === null || _b === void 0 ? void 0 : _b.model) || "";
535
+ // Add to list of calls for cost calculation
449
536
  if (modelName) {
450
- try {
451
- const promptCost = 0.0;
452
- const completionCost = 0.0;
453
- const callTotalCost = promptCost + completionCost;
454
- usage.prompt_tokens_cost_usd = promptCost;
455
- usage.completion_tokens_cost_usd = completionCost;
456
- usage.total_cost_usd = callTotalCost;
457
- tokenCounts.prompt_tokens_cost_usd += promptCost;
458
- tokenCounts.completion_tokens_cost_usd += completionCost;
459
- tokenCounts.total_cost_usd += callTotalCost;
537
+ llmCalls.push({
538
+ modelName,
539
+ promptTokens,
540
+ completionTokens,
541
+ entryIndex: index
542
+ });
543
+ }
544
+ }
545
+ index++;
546
+ }
547
+ // Second pass: calculate costs for each LLM call using the API
548
+ if (this.traceManager && llmCalls.length > 0) {
549
+ // Process each LLM call
550
+ for (const call of llmCalls) {
551
+ try {
552
+ // Get costs from the API
553
+ const costs = yield this.traceManager.calculateTokenCosts(call.modelName, call.promptTokens, call.completionTokens);
554
+ if (costs) {
555
+ // Update the entry with the costs
556
+ const entry = condensedEntries[call.entryIndex];
557
+ // Ensure output and usage objects exist before assigning costs
558
+ if (entry.output && entry.output.usage) {
559
+ // --- This part assigns costs to the individual span ---
560
+ entry.output.usage.prompt_tokens_cost_usd = costs.prompt_tokens_cost_usd;
561
+ entry.output.usage.completion_tokens_cost_usd = costs.completion_tokens_cost_usd;
562
+ entry.output.usage.total_cost_usd = costs.total_cost_usd;
563
+ logger.debug(`Assigned costs to span ${entry.span_id} (model: ${call.modelName})`, { costs }); // Added debug log
564
+ // -----------------------------------------------------
565
+ }
566
+ else {
567
+ logger.warn(`Could not assign costs to span ${entry.span_id} (model: ${call.modelName}): Missing 'output' or 'output.usage' object.`, { output: entry.output }); // Log if structure is missing
568
+ }
569
+ // Add to the total costs for the trace
570
+ tokenCounts.prompt_tokens_cost_usd += (_c = costs.prompt_tokens_cost_usd) !== null && _c !== void 0 ? _c : 0.0;
571
+ tokenCounts.completion_tokens_cost_usd += (_d = costs.completion_tokens_cost_usd) !== null && _d !== void 0 ? _d : 0.0;
572
+ tokenCounts.total_cost_usd += (_e = costs.total_cost_usd) !== null && _e !== void 0 ? _e : 0.0;
460
573
  }
461
- catch (e) {
462
- console.warn(`Error calculating cost for model '${modelName}':`, e);
463
- usage.prompt_tokens_cost_usd = null;
464
- usage.completion_tokens_cost_usd = null;
465
- usage.total_cost_usd = null;
574
+ else {
575
+ // If calculation failed, set costs to null in the entry (matching Python behavior)
576
+ const entry = condensedEntries[call.entryIndex];
577
+ // Ensure output and usage objects exist before assigning null costs
578
+ if (entry.output && entry.output.usage) {
579
+ // --- Sets null costs on the individual span ---
580
+ entry.output.usage.prompt_tokens_cost_usd = null;
581
+ entry.output.usage.completion_tokens_cost_usd = null;
582
+ entry.output.usage.total_cost_usd = null;
583
+ // ------------------------------------------
584
+ }
585
+ else {
586
+ // Log if we can't even assign null because the structure is missing
587
+ logger.warn(`Could not assign null costs to span ${entry.span_id} (model: ${call.modelName}): Missing 'output' or 'output.usage' object.`, { output: entry.output });
588
+ }
589
+ logger.warn(`Token cost calculation failed for model '${call.modelName}'. Cost information will be null for this span.`); // More specific warning
466
590
  }
467
591
  }
468
- else {
469
- usage.prompt_tokens_cost_usd = null;
470
- usage.completion_tokens_cost_usd = null;
471
- usage.total_cost_usd = null;
592
+ catch (e) {
593
+ logger.warn(`Error during cost calculation loop for model '${call.modelName}':`, e); // Adjusted logging
594
+ // Set costs to null in the entry if an error occurs during the loop iteration
595
+ const entry = condensedEntries[call.entryIndex];
596
+ // Ensure output and usage objects exist before assigning null costs on error
597
+ if (entry.output && entry.output.usage) {
598
+ // --- Sets null costs on the individual span on error ---
599
+ entry.output.usage.prompt_tokens_cost_usd = null;
600
+ entry.output.usage.completion_tokens_cost_usd = null;
601
+ entry.output.usage.total_cost_usd = null;
602
+ // ----------------------------------------------------
603
+ }
604
+ else {
605
+ // Log if we can't assign null on error because the structure is missing
606
+ logger.warn(`Could not assign null costs to span ${entry.span_id} (model: ${call.modelName}) on error: Missing 'output' or 'output.usage' object.`, { output: entry.output });
607
+ }
472
608
  }
473
609
  }
474
- });
610
+ }
475
611
  // Convert rules array to a dictionary (Record<string, Rule>)
476
612
  const rulesDict = {};
477
613
  this.rules.forEach(rule => {
@@ -488,16 +624,15 @@ class TraceClient {
488
624
  duration: totalDuration,
489
625
  token_counts: tokenCounts,
490
626
  entries: condensedEntries,
491
- rules: rulesDict,
492
- empty_save: emptySave,
627
+ evaluation_runs: evaluationRuns,
493
628
  overwrite: this.overwrite,
494
629
  parent_trace_id: this.parentTraceId,
495
630
  parent_name: this.parentName
496
631
  };
497
632
  try {
498
- yield this.traceManager.saveTrace(traceData, emptySave);
633
+ yield this.traceManager.saveTrace(traceData);
499
634
  logger.info(`Trace ${this.traceId} saved successfully.`);
500
- if (!emptySave && this.enableEvaluations) {
635
+ if (this.enableEvaluations) {
501
636
  try {
502
637
  yield this.traceManager.addTraceToEvalQueue(traceData);
503
638
  logger.info(`Trace ${this.traceId} added to evaluation queue.`);
@@ -531,7 +666,7 @@ class TraceClient {
531
666
  traceClientContext.entries.forEach(entry => {
532
667
  var _a;
533
668
  const indent = " ".repeat((_a = entry.depth) !== null && _a !== void 0 ? _a : 0);
534
- const timeStr = entry.timestamp ? `@ ${new Date(entry.timestamp * 1000).toISOString()}` : '';
669
+ const timeStr = entry.created_at ? `@ ${new Date(entry.created_at * 1000).toISOString()}` : '';
535
670
  const shortSpanId = entry.span_id ? `(id: ${entry.span_id.substring(0, 8)}...)` : '';
536
671
  const shortParentId = entry.parent_span_id ? `(parent: ${entry.parent_span_id.substring(0, 8)}...)` : '';
537
672
  try {
@@ -562,17 +697,6 @@ class TraceClient {
562
697
  // Keep console.log
563
698
  console.log(`${indent} ${prefix} (for ${shortSpanId}): ${outputStr || 'null'}`);
564
699
  break;
565
- case 'evaluation':
566
- let evalStr = JSON.stringify(entry.evaluation_runs);
567
- if (evalStr && evalStr.length > 200) {
568
- evalStr = evalStr.substring(0, 197) + '...';
569
- }
570
- // Keep console.log
571
- console.log(`${indent} Evaluation (for ${shortSpanId}): ${evalStr || '[]'}`);
572
- break;
573
- default:
574
- // Keep console.log
575
- console.log(`${indent}? Unknown entry type: ${JSON.stringify(entry)}`);
576
700
  }
577
701
  }
578
702
  catch (stringifyError) {
@@ -612,9 +736,7 @@ class TraceClient {
612
736
  * @returns Promise that resolves when the evaluation entry has been added to the trace
613
737
  */
614
738
  asyncEvaluate(scorers_1) {
615
- return __awaiter(this, arguments, void 0, function* (
616
- // Accept general Scorer type, but filter/check for API scorers internally
617
- scorers, options = {}) {
739
+ return __awaiter(this, arguments, void 0, function* (scorers, options = {}) {
618
740
  if (!this.enableEvaluations) {
619
741
  logger.warn("Evaluations are disabled. Skipping async evaluation.");
620
742
  return;
@@ -629,7 +751,12 @@ class TraceClient {
629
751
  logger.warn("No APIJudgmentScorers found in the provided scorers list. Skipping async evaluation as backend requires API scorers.");
630
752
  return;
631
753
  }
632
- const startTime = Date.now() / 1000; // Record start time in seconds
754
+ // Process rules (currently just using this.rules directly)
755
+ const loadedRules = this.rules; // TODO: Add ScorerWrapper-like processing if needed in TS
756
+ // Prevent using JudgevalScorer with rules - only APIJudgmentScorer allowed with rules
757
+ if (loadedRules && loadedRules.length > 0 && scorers.some(s => !(s instanceof APIJudgmentScorer))) {
758
+ throw new Error("Cannot use Judgeval scorers, you can only use API scorers when using rules. Please either remove rules or use only APIJudgmentScorer types.");
759
+ }
633
760
  // Create example structure matching Python/backend expectations
634
761
  const example = {
635
762
  input: options.input || "",
@@ -649,6 +776,7 @@ class TraceClient {
649
776
  logger.warn(`No current entry to record evaluation to\nStack trace: ${new Error().stack}`);
650
777
  return;
651
778
  }
779
+ const currentSpanId = currentEntry.span_id; // Get the span ID
652
780
  // --- Create evaluation run name (similar to Python) ---
653
781
  // Capitalize scorer names
654
782
  const scorerNames = apiScorers.map(scorer => {
@@ -658,11 +786,9 @@ class TraceClient {
658
786
  return name.charAt(0).toUpperCase() + name.slice(1);
659
787
  }).join(',');
660
788
  // Use trace name and shortened span ID (or trace ID if no span)
661
- const idPart = currentEntry ? currentEntry.span_id.substring(0, 8) : this.traceId.substring(0, 8);
789
+ const idPart = currentSpanId ? currentSpanId.substring(0, 8) : this.traceId.substring(0, 8);
662
790
  const evalName = `${this.name.charAt(0).toUpperCase() + this.name.slice(1)}-${idPart}-[${scorerNames}]`;
663
791
  // --- End eval name creation ---
664
- // Process rules (currently just using this.rules directly)
665
- const loadedRules = this.rules; // TODO: Add ScorerWrapper-like processing if needed in TS
666
792
  // Construct the evaluation payload
667
793
  const evalRunPayload = {
668
794
  organization_id: this.organizationId,
@@ -675,47 +801,18 @@ class TraceClient {
675
801
  metadata: {}, // Matches Python tracer
676
802
  judgment_api_key: this.apiKey,
677
803
  override: this.overwrite, // Use trace's overwrite setting
678
- rules: loadedRules // Pass the processed rules
804
+ rules: loadedRules, // Pass the processed rules
805
+ trace_span_id: currentSpanId // <<< RENAMED: Assign the current span ID (matching backend)
679
806
  };
680
- // Add evaluation entry using the helper method
681
- this._addEvalRun(evalRunPayload, startTime);
807
+ // Add evaluation entry to the trace
808
+ this.recordOutput(evalRunPayload);
682
809
  }
683
810
  catch (error) {
684
- console.error(`Failed during asyncEvaluate execution: ${error instanceof Error ? error.message : String(error)}`);
685
- // Decide if we should re-throw or just log
811
+ logger.error(`Failed during asyncEvaluate execution: ${error instanceof Error ? error.message : String(error)}`);
812
+ throw error; // Re-throw after logging
686
813
  }
687
814
  });
688
815
  }
689
- /**
690
- * Private helper to add an evaluation entry to the trace.
691
- * This mirrors the structure of Python's add_eval_run.
692
- *
693
- * @param evalRunPayload The constructed payload for the evaluation.
694
- * @param startTime The start time (in seconds) of the evaluation process.
695
- */
696
- _addEvalRun(evalRunPayload, startTime) {
697
- var _a, _b;
698
- const traceClientContext = getTraceClientContext();
699
- const currentEntry = traceClientContext.entryStack.at(-1);
700
- if (!currentEntry) {
701
- logger.warn(`No current entry to record evaluation to\nStack trace: ${new Error().stack}`);
702
- return;
703
- }
704
- const function_ = (_a = currentEntry.function) !== null && _a !== void 0 ? _a : "unknown_function";
705
- const depth = (_b = currentEntry.depth) !== null && _b !== void 0 ? _b : 0;
706
- const duration = Date.now() / 1000 - startTime;
707
- // Add evaluation entry to the trace
708
- this.addEntry({
709
- type: "evaluation",
710
- function: function_,
711
- span_id: currentEntry.span_id, // May be undefined
712
- depth: depth,
713
- timestamp: Date.now() / 1000,
714
- evaluation_runs: [evalRunPayload], // Embed the payload
715
- duration: duration,
716
- span_type: "evaluation"
717
- });
718
- }
719
816
  // OPTIONAL: Add a method to get the original name if needed elsewhere
720
817
  getOriginalName() {
721
818
  return this.originalName;
@@ -796,11 +893,6 @@ class Tracer {
796
893
  apiKey: this.apiKey,
797
894
  organizationId: this.organizationId,
798
895
  });
799
- if (traceClient.enableMonitoring) {
800
- traceClient.save(true).catch(err => {
801
- logger.error(`Failed to save empty trace (${traceClient.traceId}):`, err);
802
- });
803
- }
804
896
  return traceClient;
805
897
  }
806
898
  *trace(name, options = {}) {