npm - judgeval - Versions diffs - 0.2.0 → 0.2.2 - Mend

judgeval 0.2.0 → 0.2.2

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (62)

package/README.md +95 -68
package/dist/cjs/common/tracer.js +235 -143
package/dist/cjs/common/tracer.js.map +1 -1
package/dist/cjs/constants.js +8 -5
package/dist/cjs/constants.js.map +1 -1
package/dist/cjs/data/datasets/eval-dataset-client.js +349 -0
package/dist/cjs/data/datasets/eval-dataset-client.js.map +1 -0
package/dist/cjs/data/datasets/eval-dataset.js +405 -0
package/dist/cjs/data/datasets/eval-dataset.js.map +1 -0
package/dist/cjs/data/example.js +22 -1
package/dist/cjs/data/example.js.map +1 -1
package/dist/cjs/e2etests/eval-operations.test.js +282 -0
package/dist/cjs/e2etests/eval-operations.test.js.map +1 -0
package/dist/cjs/e2etests/judgee-traces.test.js +278 -0
package/dist/cjs/e2etests/judgee-traces.test.js.map +1 -0
package/dist/cjs/index.js +1 -3
package/dist/cjs/index.js.map +1 -1
package/dist/cjs/judgment-client.js +326 -645
package/dist/cjs/judgment-client.js.map +1 -1
package/dist/cjs/scorers/api-scorer.js +56 -48
package/dist/cjs/scorers/api-scorer.js.map +1 -1
package/dist/cjs/scorers/base-scorer.js +66 -11
package/dist/cjs/scorers/base-scorer.js.map +1 -1
package/dist/esm/common/tracer.js +236 -144
package/dist/esm/common/tracer.js.map +1 -1
package/dist/esm/constants.js +7 -4
package/dist/esm/constants.js.map +1 -1
package/dist/esm/data/datasets/eval-dataset-client.js +342 -0
package/dist/esm/data/datasets/eval-dataset-client.js.map +1 -0
package/dist/esm/data/datasets/eval-dataset.js +375 -0
package/dist/esm/data/datasets/eval-dataset.js.map +1 -0
package/dist/esm/data/example.js +22 -1
package/dist/esm/data/example.js.map +1 -1
package/dist/esm/e2etests/eval-operations.test.js +254 -0
package/dist/esm/e2etests/eval-operations.test.js.map +1 -0
package/dist/esm/e2etests/judgee-traces.test.js +253 -0
package/dist/esm/e2etests/judgee-traces.test.js.map +1 -0
package/dist/esm/index.js +0 -1
package/dist/esm/index.js.map +1 -1
package/dist/esm/judgment-client.js +328 -647
package/dist/esm/judgment-client.js.map +1 -1
package/dist/esm/scorers/api-scorer.js +56 -48
package/dist/esm/scorers/api-scorer.js.map +1 -1
package/dist/esm/scorers/base-scorer.js +66 -11
package/dist/esm/scorers/base-scorer.js.map +1 -1
package/dist/types/common/tracer.d.ts +27 -14
package/dist/types/constants.d.ts +4 -4
package/dist/types/data/datasets/eval-dataset-client.d.ts +39 -0
package/dist/types/data/datasets/eval-dataset.d.ts +45 -0
package/dist/types/data/example.d.ts +24 -12
package/dist/types/e2etests/eval-operations.test.d.ts +5 -0
package/dist/types/e2etests/judgee-traces.test.d.ts +5 -0
package/dist/types/index.d.ts +0 -1
package/dist/types/judgment-client.d.ts +3 -47
package/dist/types/scorers/api-scorer.d.ts +15 -15
package/dist/types/scorers/base-scorer.d.ts +53 -10
package/package.json +2 -1
package/dist/cjs/scorers/exact-match-scorer.js +0 -84
package/dist/cjs/scorers/exact-match-scorer.js.map +0 -1
package/dist/esm/scorers/exact-match-scorer.js +0 -80
package/dist/esm/scorers/exact-match-scorer.js.map +0 -1
package/dist/types/scorers/exact-match-scorer.d.ts +0 -10

package/dist/esm/common/tracer.js CHANGED Viewed

@@ -14,7 +14,7 @@ import { AsyncLocalStorage } from 'async_hooks';
 import OpenAI from 'openai';
 import Anthropic from '@anthropic-ai/sdk';
 // Local Imports
-import { JUDGMENT_TRACES_SAVE_API_URL, JUDGMENT_TRACES_FETCH_API_URL, JUDGMENT_TRACES_DELETE_API_URL, JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL,
+import { JUDGMENT_TRACES_SAVE_API_URL, JUDGMENT_TRACES_FETCH_API_URL, JUDGMENT_TRACES_DELETE_API_URL, JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL, JUDGMENT_CALCULATE_TOKEN_COSTS_API_URL,
 // Add other necessary constants if needed
  } from '../constants.js';
 import { APIJudgmentScorer } from '../scorers/base-scorer.js';
@@ -70,16 +70,13 @@ class TraceManagerClient {
             try {
                 // Use isomorphic fetch (available globally in modern Node.js and browsers)
                 const response = yield fetch(url, Object.assign(Object.assign({}, options), { headers: headers }));
-                if (!response.ok) {
-                    const errorBody = yield response.text();
-                    console.error(`API Error (${response.status}) for ${options.method || 'GET'} ${url}: ${errorBody}`);
-                    throw new Error(`Judgment API request failed: ${response.status} ${response.statusText} - ${errorBody}`);
-                }
+                // We will return the response object even if !response.ok
+                // The caller (e.g., saveTrace) is responsible for checking response.ok or response.status
                 // Handle cases where the response might be empty (e.g., 204 No Content on DELETE)
                 if (response.status === 204) {
                     return null; // Indicate success with no content
                 }
-                return yield response.json();
+                return response;
             }
             catch (error) {
                 console.error(`Network or fetch error during ${options.method || 'GET'} ${url}:`, error);
@@ -96,21 +93,52 @@ class TraceManagerClient {
             });
         });
     }
-    saveTrace(traceData, emptySave) {
+    saveTrace(traceData) {
         return __awaiter(this, void 0, void 0, function* () {
+            // _fetch now returns the raw response object or throws on network error
             const response = yield this._fetch(JUDGMENT_TRACES_SAVE_API_URL, {
                 method: 'POST',
-                body: JSON.stringify(traceData),
+                body: JSON.stringify(traceData), // Stringify directly here again
             });
-            // Optionally log the UI URL like the Python version
-            if (!emptySave && (response === null || response === void 0 ? void 0 : response.ui_results_url)) {
-                // Use console.info or a dedicated logger for user-facing messages
-                // Note: We can't replicate Rich library's colored link easily in standard console
+            // Check if _fetch threw a network error (caught below) or returned an invalid object
+            if (!response) {
+                // This case should ideally be caught by _fetch's catch block, but double-check
+                throw new Error('Failed to save trace data: No response received from API.');
+            }
+            // Now, check the status code on the received response object
+            if (response.status === 400) {
+                // Attempt to get error body for more info
+                const errorBody = yield response.text();
+                throw new Error(`Failed to save trace data: Check your Trace name for conflicts, set overwrite=True to overwrite existing traces: ${response.status} ${response.statusText || ''} - ${errorBody}`);
+            }
+            else if (!response.ok) { // Handles other errors (5xx, 4xx except 400)
+                const errorBody = yield response.text();
+                throw new Error(`Failed to save trace data: Status ${response.status} ${response.statusText || '(No status text)'} - ${errorBody}`);
+            }
+            // --- Success Path ---
+            // Optionally log the UI URL (needs JSON parsing)
+            let responseData = null;
+            try {
+                // Handle 204 No Content specifically
+                if (response.status === 204) {
+                    responseData = null; // Or maybe { success: true }?
+                }
+                else {
+                    responseData = yield response.json(); // Parse JSON only on success
+                }
+            }
+            catch (parseError) {
+                logger.warn("Failed to parse successful API response JSON.", { error: parseError });
+                // Depending on requirements, maybe throw, maybe return a default success object
+                throw new Error(`API request succeeded (${response.status}), but failed to parse JSON response.`);
+            }
+            if (responseData === null || responseData === void 0 ? void 0 : responseData.ui_results_url) {
                 console.info(`
-🔍 View trace: ${response.ui_results_url}
+🔍 View trace: ${responseData.ui_results_url}
 `);
             }
-            return response;
+            // Return the parsed data (or null for 204)
+            return responseData;
         });
     }
     deleteTrace(traceId) {
@@ -139,6 +167,50 @@ class TraceManagerClient {
             });
         });
     }
+    /**
+     * Calculate token costs directly using the API endpoint.
+     * This is more accurate than client-side calculation as it uses the most up-to-date pricing.
+     *
+     * @param model The model name (e.g. 'gpt-4', 'claude-3-opus-20240229')
+     * @param promptTokens Number of tokens in the prompt/input
+     * @param completionTokens Number of tokens in the completion/output
+     * @returns Object containing token counts and calculated costs in USD
+     */
+    calculateTokenCosts(model, promptTokens, completionTokens) {
+        return __awaiter(this, void 0, void 0, function* () {
+            try {
+                // Use the new calculation endpoint
+                const response = yield this._fetch(JUDGMENT_CALCULATE_TOKEN_COSTS_API_URL, {
+                    method: 'POST',
+                    body: JSON.stringify({
+                        model,
+                        prompt_tokens: promptTokens,
+                        completion_tokens: completionTokens
+                    })
+                });
+                // Check if the response is okay and parse JSON
+                if (response && response.ok) {
+                    const data = yield response.json();
+                    return data;
+                }
+                else if (response) {
+                    // Log error if response was not ok
+                    const errorBody = yield response.text();
+                    logger.warn(`API error calculating token costs for model ${model}: ${response.status} ${response.statusText}`, { errorBody });
+                    return null;
+                }
+                else {
+                    // Handle cases where _fetch might return null or undefined (though it shouldn't with current implementation)
+                    logger.warn(`No response received when calculating token costs for model ${model}.`);
+                    return null;
+                }
+            }
+            catch (error) {
+                logger.warn(`Failed to calculate token costs for model ${model}.`, { error: error instanceof Error ? error.message : String(error) });
+                return null;
+            }
+        });
+    }
 }
 // --- Helper Functions ---
 // Helper function to sanitize names (e.g., replace spaces with underscores)
@@ -155,6 +227,7 @@ class TraceClient {
     constructor(config) {
         var _a, _b, _c, _d, _e;
         this.traceManager = null; // Can be null if monitoring disabled
+        this._spanDepths = {}; // Track depth of active spans
         this.traceId = config.traceId || uuidv4();
         this.originalName = config.name || 'default_trace'; // Store original
         this.name = sanitizeName(this.originalName); // Use sanitized name internally
@@ -193,7 +266,7 @@ class TraceClient {
     recordInput(inputs) {
         const traceClientContext = getTraceClientContext();
         const currentEntry = traceClientContext.entryStack.at(-1);
-        if (!currentEntry) {
+        if (!currentEntry || !currentEntry.span_id) {
             console.warn(`No current entry to record input to\nStack trace: ${new Error().stack}`);
             return;
         }
@@ -202,14 +275,16 @@ class TraceClient {
             span_id: currentEntry.span_id,
             inputs,
             function: currentEntry.function,
-            depth: currentEntry.depth,
-            span_type: currentEntry.span_type
+            depth: this._spanDepths[currentEntry.span_id],
+            created_at: Date.now() / 1000,
+            span_type: currentEntry.span_type,
+            message: `Inputs to ${currentEntry.function}`
         });
     }
     recordOutput(output) {
         const traceClientContext = getTraceClientContext();
         const currentEntry = traceClientContext.entryStack.at(-1);
-        if (!currentEntry) {
+        if (!currentEntry || !currentEntry.span_id) {
             console.warn(`No current entry to record output to\nStack trace: ${new Error().stack}`);
             return;
         }
@@ -218,33 +293,28 @@ class TraceClient {
             span_id: currentEntry.span_id,
             output,
             function: currentEntry.function,
-            depth: currentEntry.depth,
-            span_type: currentEntry.span_type
+            depth: this._spanDepths[currentEntry.span_id],
+            created_at: Date.now() / 1000,
+            span_type: currentEntry.span_type,
+            message: `Output from ${currentEntry.function}`
         });
     }
     recordError(error) {
-        var _a;
         const traceClientContext = getTraceClientContext();
         const currentEntry = traceClientContext.entryStack.at(-1);
-        if (!currentEntry) {
+        if (!currentEntry || !currentEntry.span_id) {
             console.warn(`No current entry to record error to\nStack trace: ${new Error().stack}`);
             return;
         }
-        let output = error;
-        if (error instanceof Error) {
-            output = {
-                name: error.name,
-                message: error.message,
-                stack: (_a = error.stack) === null || _a === void 0 ? void 0 : _a.substring(0, 1000)
-            };
-        }
         this.addEntry({
             type: 'error',
             span_id: currentEntry.span_id,
-            output,
+            output: error,
             function: currentEntry.function,
-            depth: currentEntry.depth,
-            span_type: currentEntry.span_type
+            depth: this._spanDepths[currentEntry.span_id],
+            created_at: Date.now() / 1000,
+            span_type: currentEntry.span_type,
+            message: `Error from ${currentEntry.function}`
         });
     }
     startSpan(name, options = {}) {
@@ -255,18 +325,20 @@ class TraceClient {
         const spanType = (_a = options.spanType) !== null && _a !== void 0 ? _a : 'span';
         const startTime = Date.now() / 1000;
         let depth = 0, parentSpanId = undefined;
-        if (parentEntry) {
-            depth = parentEntry.depth + 1;
+        if (parentEntry && parentEntry.span_id) {
+            depth = this._spanDepths[parentEntry.span_id] + 1;
             parentSpanId = parentEntry.span_id;
         }
+        this._spanDepths[spanId] = depth;
         const entry = {
             type: 'enter',
             function: name,
             span_id: spanId,
             depth: depth,
-            timestamp: startTime,
+            created_at: startTime,
             span_type: spanType,
-            parent_span_id: parentSpanId
+            parent_span_id: parentSpanId,
+            message: name
         };
         this.addEntry(entry);
         traceClientContext.entryStack.push(entry);
@@ -274,21 +346,24 @@ class TraceClient {
     endSpan() {
         const traceClientContext = getTraceClientContext();
         const enterEntry = traceClientContext.entryStack.pop();
-        if (!enterEntry) {
+        if (!enterEntry || !enterEntry.span_id) {
             console.warn("No enter entry to end");
             return;
         }
         const endTime = Date.now() / 1000;
-        const duration = endTime - enterEntry.timestamp;
+        const duration = endTime - enterEntry.created_at;
         this.addEntry({
             type: 'exit',
             function: enterEntry.function,
             span_id: enterEntry.span_id,
-            depth: enterEntry.depth,
-            timestamp: endTime,
+            depth: this._spanDepths[enterEntry.span_id],
+            created_at: endTime,
             duration: duration,
-            span_type: enterEntry.span_type
+            span_type: enterEntry.span_type,
+            message: `← ${enterEntry.function}`
         });
+        // Clean up depth tracking
+        delete this._spanDepths[enterEntry.span_id];
     }
     *span(name, options = {}) {
         if (!this.enableMonitoring) {
@@ -306,6 +381,7 @@ class TraceClient {
     condenseTrace(rawEntries) {
         var _a, _b, _c, _d, _e;
         const spansById = {};
+        const allEvaluationRuns = [];
         for (const entry of rawEntries) {
             const spanId = entry.span_id;
             if (!spanId)
@@ -315,12 +391,12 @@ class TraceClient {
                     span_id: spanId,
                     function: entry.function || 'unknown',
                     depth: (_a = entry.depth) !== null && _a !== void 0 ? _a : 0,
-                    timestamp: (_b = entry.timestamp) !== null && _b !== void 0 ? _b : 0,
+                    created_at: new Date(((_b = entry.created_at) !== null && _b !== void 0 ? _b : 0) * 1000).toISOString(), // Convert number to ISO string
+                    trace_id: this.traceId, // Add trace_id
                     parent_span_id: entry.parent_span_id,
                     span_type: entry.span_type || 'span',
                     inputs: null,
                     output: null,
-                    evaluation_runs: [],
                     duration: null,
                     children: []
                 };
@@ -330,14 +406,14 @@ class TraceClient {
                 case 'enter':
                     currentSpanData.function = entry.function || currentSpanData.function;
                     currentSpanData.depth = (_c = entry.depth) !== null && _c !== void 0 ? _c : currentSpanData.depth;
-                    currentSpanData.timestamp = (_d = entry.timestamp) !== null && _d !== void 0 ? _d : currentSpanData.timestamp;
+                    currentSpanData.created_at = new Date(((_d = entry.created_at) !== null && _d !== void 0 ? _d : 0) * 1000).toISOString(); // Ensure created_at is string on update
                     currentSpanData.parent_span_id = entry.parent_span_id;
                     currentSpanData.span_type = entry.span_type || currentSpanData.span_type;
-                    currentSpanData.start_time = entry.timestamp;
+                    currentSpanData.start_time = entry.created_at; // Keep original number for duration calc
                     break;
                 case 'exit':
                     currentSpanData.duration = (_e = entry.duration) !== null && _e !== void 0 ? _e : currentSpanData.duration;
-                    currentSpanData.end_time = entry.timestamp;
+                    currentSpanData.end_time = entry.created_at; // Keep original number for duration calc
                     if (currentSpanData.duration === null && currentSpanData.start_time && currentSpanData.end_time) {
                         currentSpanData.duration = currentSpanData.end_time - currentSpanData.start_time;
                     }
@@ -353,10 +429,8 @@ class TraceClient {
                 case 'output':
                 case 'error':
                     currentSpanData.output = entry.output;
-                    break;
-                case 'evaluation':
-                    if (entry.evaluation_runs) {
-                        currentSpanData.evaluation_runs.push(...entry.evaluation_runs);
+                    if (entry.type === 'output' && entry.output && typeof entry.output === 'object' && 'eval_name' in entry.output && 'scorers' in entry.output && 'trace_span_id' in entry.output) {
+                        allEvaluationRuns.push(entry.output);
                     }
                     break;
             }
@@ -387,9 +461,11 @@ class TraceClient {
                 childrenMap[parentId].push(span);
             }
         }
-        roots.sort((a, b) => a.timestamp - b.timestamp);
+        // Sort using parsed dates
+        roots.sort((a, b) => Date.parse(a.created_at) - Date.parse(b.created_at));
         for (const parentId in childrenMap) {
-            childrenMap[parentId].sort((a, b) => a.timestamp - b.timestamp);
+            // Sort using parsed dates
+            childrenMap[parentId].sort((a, b) => Date.parse(a.created_at) - Date.parse(b.created_at));
         }
         function buildFlatListDfs(span) {
             if (visited.has(span.span_id))
@@ -410,26 +486,36 @@ class TraceClient {
                 buildFlatListDfs(span);
             }
         }
-        return sortedCondensedList;
+        return [sortedCondensedList, allEvaluationRuns];
     }
     save() {
         return __awaiter(this, arguments, void 0, function* (emptySave = false) {
+            var _a, _b, _c, _d, _e;
             if (!this.enableMonitoring || !this.traceManager) {
                 return null;
             }
             const traceClientContext = getTraceClientContext();
             const totalDuration = this.getDuration();
-            const condensedEntries = this.condenseTrace(traceClientContext.entries);
+            // Use the tuple returned by condenseTrace
+            const [condensedEntries, evaluationRuns] = this.condenseTrace(traceClientContext.entries);
             const tokenCounts = {
-                prompt_tokens: 0, completion_tokens: 0, total_tokens: 0,
-                prompt_tokens_cost_usd: 0.0, completion_tokens_cost_usd: 0.0, total_cost_usd: 0.0
+                prompt_tokens: 0,
+                completion_tokens: 0,
+                total_tokens: 0,
+                prompt_tokens_cost_usd: 0.0,
+                completion_tokens_cost_usd: 0.0,
+                total_cost_usd: 0.0
             };
-            condensedEntries.forEach(entry => {
-                var _a, _b;
+            // First pass: collect all LLM calls with their token counts
+            const llmCalls = [];
+            let index = 0;
+            for (const entry of condensedEntries) {
                 if (entry.span_type === 'llm' && ((_a = entry.output) === null || _a === void 0 ? void 0 : _a.usage)) {
                     const usage = entry.output.usage;
+                    const modelName = ((_b = entry.inputs) === null || _b === void 0 ? void 0 : _b.model) || "";
                     let promptTokens = 0;
                     let completionTokens = 0;
+                    // Handle different token naming conventions
                     if (usage.prompt_tokens !== undefined || usage.completion_tokens !== undefined) {
                         promptTokens = usage.prompt_tokens || 0;
                         completionTokens = usage.completion_tokens || 0;
@@ -437,6 +523,7 @@ class TraceClient {
                     else if (usage.input_tokens !== undefined || usage.output_tokens !== undefined) {
                         promptTokens = usage.input_tokens || 0;
                         completionTokens = usage.output_tokens || 0;
+                        // Standardize naming
                         usage.prompt_tokens = promptTokens;
                         usage.completion_tokens = completionTokens;
                         delete usage.input_tokens;
@@ -445,33 +532,82 @@ class TraceClient {
                     tokenCounts.prompt_tokens += promptTokens;
                     tokenCounts.completion_tokens += completionTokens;
                     tokenCounts.total_tokens += usage.total_tokens || (promptTokens + completionTokens);
-                    const modelName = ((_b = entry.inputs) === null || _b === void 0 ? void 0 : _b.model) || "";
+                    // Add to list of calls for cost calculation
                     if (modelName) {
-                        try {
-                            const promptCost = 0.0;
-                            const completionCost = 0.0;
-                            const callTotalCost = promptCost + completionCost;
-                            usage.prompt_tokens_cost_usd = promptCost;
-                            usage.completion_tokens_cost_usd = completionCost;
-                            usage.total_cost_usd = callTotalCost;
-                            tokenCounts.prompt_tokens_cost_usd += promptCost;
-                            tokenCounts.completion_tokens_cost_usd += completionCost;
-                            tokenCounts.total_cost_usd += callTotalCost;
+                        llmCalls.push({
+                            modelName,
+                            promptTokens,
+                            completionTokens,
+                            entryIndex: index
+                        });
+                    }
+                }
+                index++;
+            }
+            // Second pass: calculate costs for each LLM call using the API
+            if (this.traceManager && llmCalls.length > 0) {
+                // Process each LLM call
+                for (const call of llmCalls) {
+                    try {
+                        // Get costs from the API
+                        const costs = yield this.traceManager.calculateTokenCosts(call.modelName, call.promptTokens, call.completionTokens);
+                        if (costs) {
+                            // Update the entry with the costs
+                            const entry = condensedEntries[call.entryIndex];
+                            // Ensure output and usage objects exist before assigning costs
+                            if (entry.output && entry.output.usage) {
+                                // --- This part assigns costs to the individual span ---
+                                entry.output.usage.prompt_tokens_cost_usd = costs.prompt_tokens_cost_usd;
+                                entry.output.usage.completion_tokens_cost_usd = costs.completion_tokens_cost_usd;
+                                entry.output.usage.total_cost_usd = costs.total_cost_usd;
+                                logger.debug(`Assigned costs to span ${entry.span_id} (model: ${call.modelName})`, { costs }); // Added debug log
+                                // -----------------------------------------------------
+                            }
+                            else {
+                                logger.warn(`Could not assign costs to span ${entry.span_id} (model: ${call.modelName}): Missing 'output' or 'output.usage' object.`, { output: entry.output }); // Log if structure is missing
+                            }
+                            // Add to the total costs for the trace
+                            tokenCounts.prompt_tokens_cost_usd += (_c = costs.prompt_tokens_cost_usd) !== null && _c !== void 0 ? _c : 0.0;
+                            tokenCounts.completion_tokens_cost_usd += (_d = costs.completion_tokens_cost_usd) !== null && _d !== void 0 ? _d : 0.0;
+                            tokenCounts.total_cost_usd += (_e = costs.total_cost_usd) !== null && _e !== void 0 ? _e : 0.0;
                         }
-                        catch (e) {
-                            console.warn(`Error calculating cost for model '${modelName}':`, e);
-                            usage.prompt_tokens_cost_usd = null;
-                            usage.completion_tokens_cost_usd = null;
-                            usage.total_cost_usd = null;
+                        else {
+                            // If calculation failed, set costs to null in the entry (matching Python behavior)
+                            const entry = condensedEntries[call.entryIndex];
+                            // Ensure output and usage objects exist before assigning null costs
+                            if (entry.output && entry.output.usage) {
+                                // --- Sets null costs on the individual span ---
+                                entry.output.usage.prompt_tokens_cost_usd = null;
+                                entry.output.usage.completion_tokens_cost_usd = null;
+                                entry.output.usage.total_cost_usd = null;
+                                // ------------------------------------------
+                            }
+                            else {
+                                // Log if we can't even assign null because the structure is missing
+                                logger.warn(`Could not assign null costs to span ${entry.span_id} (model: ${call.modelName}): Missing 'output' or 'output.usage' object.`, { output: entry.output });
+                            }
+                            logger.warn(`Token cost calculation failed for model '${call.modelName}'. Cost information will be null for this span.`); // More specific warning
                         }
                     }
-                    else {
-                        usage.prompt_tokens_cost_usd = null;
-                        usage.completion_tokens_cost_usd = null;
-                        usage.total_cost_usd = null;
+                    catch (e) {
+                        logger.warn(`Error during cost calculation loop for model '${call.modelName}':`, e); // Adjusted logging
+                        // Set costs to null in the entry if an error occurs during the loop iteration
+                        const entry = condensedEntries[call.entryIndex];
+                        // Ensure output and usage objects exist before assigning null costs on error
+                        if (entry.output && entry.output.usage) {
+                            // --- Sets null costs on the individual span on error ---
+                            entry.output.usage.prompt_tokens_cost_usd = null;
+                            entry.output.usage.completion_tokens_cost_usd = null;
+                            entry.output.usage.total_cost_usd = null;
+                            // ----------------------------------------------------
+                        }
+                        else {
+                            // Log if we can't assign null on error because the structure is missing
+                            logger.warn(`Could not assign null costs to span ${entry.span_id} (model: ${call.modelName}) on error: Missing 'output' or 'output.usage' object.`, { output: entry.output });
+                        }
                     }
                 }
-            });
+            }
             // Convert rules array to a dictionary (Record<string, Rule>)
             const rulesDict = {};
             this.rules.forEach(rule => {
@@ -488,16 +624,15 @@ class TraceClient {
                 duration: totalDuration,
                 token_counts: tokenCounts,
                 entries: condensedEntries,
-                rules: rulesDict,
-                empty_save: emptySave,
+                evaluation_runs: evaluationRuns,
                 overwrite: this.overwrite,
                 parent_trace_id: this.parentTraceId,
                 parent_name: this.parentName
             };
             try {
-                yield this.traceManager.saveTrace(traceData, emptySave);
+                yield this.traceManager.saveTrace(traceData);
                 logger.info(`Trace ${this.traceId} saved successfully.`);
-                if (!emptySave && this.enableEvaluations) {
+                if (this.enableEvaluations) {
                     try {
                         yield this.traceManager.addTraceToEvalQueue(traceData);
                         logger.info(`Trace ${this.traceId} added to evaluation queue.`);
@@ -531,7 +666,7 @@ class TraceClient {
         traceClientContext.entries.forEach(entry => {
             var _a;
             const indent = "  ".repeat((_a = entry.depth) !== null && _a !== void 0 ? _a : 0);
-            const timeStr = entry.timestamp ? `@ ${new Date(entry.timestamp * 1000).toISOString()}` : '';
+            const timeStr = entry.created_at ? `@ ${new Date(entry.created_at * 1000).toISOString()}` : '';
             const shortSpanId = entry.span_id ? `(id: ${entry.span_id.substring(0, 8)}...)` : '';
             const shortParentId = entry.parent_span_id ? `(parent: ${entry.parent_span_id.substring(0, 8)}...)` : '';
             try {
@@ -562,17 +697,6 @@ class TraceClient {
                         // Keep console.log
                         console.log(`${indent}  ${prefix} (for ${shortSpanId}): ${outputStr || 'null'}`);
                         break;
-                    case 'evaluation':
-                        let evalStr = JSON.stringify(entry.evaluation_runs);
-                        if (evalStr && evalStr.length > 200) {
-                            evalStr = evalStr.substring(0, 197) + '...';
-                        }
-                        // Keep console.log
-                        console.log(`${indent}  Evaluation (for ${shortSpanId}): ${evalStr || '[]'}`);
-                        break;
-                    default:
-                        // Keep console.log
-                        console.log(`${indent}? Unknown entry type: ${JSON.stringify(entry)}`);
                 }
             }
             catch (stringifyError) {
@@ -612,9 +736,7 @@ class TraceClient {
      * @returns Promise that resolves when the evaluation entry has been added to the trace
      */
     asyncEvaluate(scorers_1) {
-        return __awaiter(this, arguments, void 0, function* (
-        // Accept general Scorer type, but filter/check for API scorers internally
-        scorers, options = {}) {
+        return __awaiter(this, arguments, void 0, function* (scorers, options = {}) {
             if (!this.enableEvaluations) {
                 logger.warn("Evaluations are disabled. Skipping async evaluation.");
                 return;
@@ -629,7 +751,12 @@ class TraceClient {
                 logger.warn("No APIJudgmentScorers found in the provided scorers list. Skipping async evaluation as backend requires API scorers.");
                 return;
             }
-            const startTime = Date.now() / 1000; // Record start time in seconds
+            // Process rules (currently just using this.rules directly)
+            const loadedRules = this.rules; // TODO: Add ScorerWrapper-like processing if needed in TS
+            // Prevent using JudgevalScorer with rules - only APIJudgmentScorer allowed with rules
+            if (loadedRules && loadedRules.length > 0 && scorers.some(s => !(s instanceof APIJudgmentScorer))) {
+                throw new Error("Cannot use Judgeval scorers, you can only use API scorers when using rules. Please either remove rules or use only APIJudgmentScorer types.");
+            }
             // Create example structure matching Python/backend expectations
             const example = {
                 input: options.input || "",
@@ -649,6 +776,7 @@ class TraceClient {
                     logger.warn(`No current entry to record evaluation to\nStack trace: ${new Error().stack}`);
                     return;
                 }
+                const currentSpanId = currentEntry.span_id; // Get the span ID
                 // --- Create evaluation run name (similar to Python) ---
                 // Capitalize scorer names
                 const scorerNames = apiScorers.map(scorer => {
@@ -658,11 +786,9 @@ class TraceClient {
                     return name.charAt(0).toUpperCase() + name.slice(1);
                 }).join(',');
                 // Use trace name and shortened span ID (or trace ID if no span)
-                const idPart = currentEntry ? currentEntry.span_id.substring(0, 8) : this.traceId.substring(0, 8);
+                const idPart = currentSpanId ? currentSpanId.substring(0, 8) : this.traceId.substring(0, 8);
                 const evalName = `${this.name.charAt(0).toUpperCase() + this.name.slice(1)}-${idPart}-[${scorerNames}]`;
                 // --- End eval name creation ---
-                // Process rules (currently just using this.rules directly)
-                const loadedRules = this.rules; // TODO: Add ScorerWrapper-like processing if needed in TS
                 // Construct the evaluation payload
                 const evalRunPayload = {
                     organization_id: this.organizationId,
@@ -675,47 +801,18 @@ class TraceClient {
                     metadata: {}, // Matches Python tracer
                     judgment_api_key: this.apiKey,
                     override: this.overwrite, // Use trace's overwrite setting
-                    rules: loadedRules // Pass the processed rules
+                    rules: loadedRules, // Pass the processed rules
+                    trace_span_id: currentSpanId // <<< RENAMED: Assign the current span ID (matching backend)
                 };
-                // Add evaluation entry using the helper method
-                this._addEvalRun(evalRunPayload, startTime);
+                // Add evaluation entry to the trace
+                this.recordOutput(evalRunPayload);
             }
             catch (error) {
-                console.error(`Failed during asyncEvaluate execution: ${error instanceof Error ? error.message : String(error)}`);
-                // Decide if we should re-throw or just log
+                logger.error(`Failed during asyncEvaluate execution: ${error instanceof Error ? error.message : String(error)}`);
+                throw error; // Re-throw after logging
             }
         });
     }
-    /**
-     * Private helper to add an evaluation entry to the trace.
-     * This mirrors the structure of Python's add_eval_run.
-     *
-     * @param evalRunPayload The constructed payload for the evaluation.
-     * @param startTime The start time (in seconds) of the evaluation process.
-     */
-    _addEvalRun(evalRunPayload, startTime) {
-        var _a, _b;
-        const traceClientContext = getTraceClientContext();
-        const currentEntry = traceClientContext.entryStack.at(-1);
-        if (!currentEntry) {
-            logger.warn(`No current entry to record evaluation to\nStack trace: ${new Error().stack}`);
-            return;
-        }
-        const function_ = (_a = currentEntry.function) !== null && _a !== void 0 ? _a : "unknown_function";
-        const depth = (_b = currentEntry.depth) !== null && _b !== void 0 ? _b : 0;
-        const duration = Date.now() / 1000 - startTime;
-        // Add evaluation entry to the trace
-        this.addEntry({
-            type: "evaluation",
-            function: function_,
-            span_id: currentEntry.span_id, // May be undefined
-            depth: depth,
-            timestamp: Date.now() / 1000,
-            evaluation_runs: [evalRunPayload], // Embed the payload
-            duration: duration,
-            span_type: "evaluation"
-        });
-    }
     // OPTIONAL: Add a method to get the original name if needed elsewhere
     getOriginalName() {
         return this.originalName;
@@ -796,11 +893,6 @@ class Tracer {
             apiKey: this.apiKey,
             organizationId: this.organizationId,
         });
-        if (traceClient.enableMonitoring) {
-            traceClient.save(true).catch(err => {
-                logger.error(`Failed to save empty trace (${traceClient.traceId}):`, err);
-            });
-        }
         return traceClient;
     }
     *trace(name, options = {}) {