judgeval 0.0.37__py3-none-any.whl → 0.0.38__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- judgeval/common/tracer.py +132 -281
- judgeval/common/utils.py +1 -1
- judgeval/constants.py +1 -3
- judgeval/data/__init__.py +0 -2
- judgeval/data/datasets/dataset.py +2 -9
- judgeval/data/datasets/eval_dataset_client.py +1 -62
- judgeval/data/example.py +0 -1
- judgeval/data/result.py +3 -3
- judgeval/data/trace.py +4 -1
- judgeval/data/{sequence_run.py → trace_run.py} +4 -4
- judgeval/evaluation_run.py +1 -1
- judgeval/integrations/langgraph.py +187 -1768
- judgeval/judges/litellm_judge.py +1 -1
- judgeval/judges/mixture_of_judges.py +1 -1
- judgeval/judges/utils.py +1 -1
- judgeval/judgment_client.py +15 -21
- judgeval/run_evaluation.py +31 -81
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +4 -2
- judgeval-0.0.38.dist-info/METADATA +247 -0
- {judgeval-0.0.37.dist-info → judgeval-0.0.38.dist-info}/RECORD +22 -23
- judgeval/data/sequence.py +0 -50
- judgeval-0.0.37.dist-info/METADATA +0 -214
- {judgeval-0.0.37.dist-info → judgeval-0.0.38.dist-info}/WHEEL +0 -0
- {judgeval-0.0.37.dist-info → judgeval-0.0.38.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/tracer.py
CHANGED
@@ -146,7 +146,7 @@ class TraceManagerClient:
         return response.json()

-    def save_trace(self, trace_data: dict):
+    def save_trace(self, trace_data: dict, offline_mode: bool = False):
         """
         Saves a trace to the Judgment Supabase and optionally to S3 if configured.
@@ -183,7 +183,7 @@ class TraceManagerClient:
         except Exception as e:
             warnings.warn(f"Failed to save trace to S3: {str(e)}")

-        if "ui_results_url" in response.json():
+        if not offline_mode and "ui_results_url" in response.json():
             pretty_str = f"\n🔍 You can view your trace data here: [rgb(106,0,255)][link={response.json()['ui_results_url']}]View Trace[/link]\n"
             rprint(pretty_str)
@@ -314,6 +314,7 @@ class TraceClient:
         self.executed_tools = []
         self.executed_node_tools = []
         self._span_depths: Dict[str, int] = {} # NEW: To track depth of active spans
+
     def get_current_span(self):
         """Get the current span from the context var"""
         return current_span_var.get()
@@ -428,7 +429,8 @@ class TraceClient:
         # span_id_at_eval_call = current_span_var.get()
         # print(f"[TraceClient.async_evaluate] Captured span ID at eval call: {span_id_at_eval_call}")
         # Prioritize explicitly passed span_id, fallback to context var
-        ...
+        current_span_ctx_var = current_span_var.get()
+        span_id_to_use = span_id if span_id is not None else current_span_ctx_var if current_span_ctx_var is not None else self.tracer.get_current_span()
         # print(f"[TraceClient.async_evaluate] Using span_id: {span_id_to_use}")
         # --- End Modification ---
@@ -438,7 +440,7 @@ class TraceClient:
             log_results=log_results,
             project_name=self.project_name,
             eval_name=f"{self.name.capitalize()}-"
-                      f"{...
+                      f"{span_id_to_use}-" # Keep original eval name format using context var if available
                       f"[{','.join(scorer.score_type.capitalize() for scorer in scorers)}]",
             examples=[example],
             scorers=scorers,
@@ -658,11 +660,12 @@ class TraceClient:
             "entries": [span.model_dump() for span in self.trace_spans],
             "evaluation_runs": [run.model_dump() for run in self.evaluation_runs],
             "overwrite": overwrite,
+            "offline_mode": self.tracer.offline_mode,
             "parent_trace_id": self.parent_trace_id,
             "parent_name": self.parent_name
         }
         # --- Log trace data before saving ---
-        self.trace_manager_client.save_trace(trace_data)
+        self.trace_manager_client.save_trace(trace_data, offline_mode=self.tracer.offline_mode)

         # upload annotations
         # TODO: batch to the log endpoint
@@ -928,6 +931,7 @@ class Tracer:
         s3_aws_access_key_id: Optional[str] = None,
         s3_aws_secret_access_key: Optional[str] = None,
         s3_region_name: Optional[str] = None,
+        offline_mode: bool = False,
         deep_tracing: bool = True  # Deep tracing is enabled by default
     ):
         if not hasattr(self, 'initialized'):
@@ -968,6 +972,7 @@ class Tracer:
                 aws_secret_access_key=s3_aws_secret_access_key,
                 region_name=s3_region_name
             )
+            self.offline_mode: bool = offline_mode
             self.deep_tracing: bool = deep_tracing  # NEW: Store deep tracing setting

         elif hasattr(self, 'project_name') and self.project_name != project_name:
@@ -977,6 +982,12 @@ class Tracer:
                 "To use a different project name, ensure the first Tracer initialization uses the desired project name.",
                 RuntimeWarning
             )
+
+    def set_current_span(self, span_id: str):
+        self.current_span_id = span_id
+
+    def get_current_span(self) -> Optional[str]:
+        return getattr(self, 'current_span_id', None)

     def set_current_trace(self, trace: TraceClient):
         """
@@ -1263,64 +1274,94 @@ class Tracer:
         else:
             warnings.warn("No trace found (context var or fallback), skipping evaluation") # Modified warning

-
 def wrap(client: Any) -> Any:
     """
     Wraps an API client to add tracing capabilities.
     Supports OpenAI, Together, Anthropic, and Google GenAI clients.
     Patches both '.create' and Anthropic's '.stream' methods using a wrapper class.
     """
-    span_name, original_create, ...
+    span_name, original_create, original_responses_create, original_stream = _get_client_config(client)
+
+    def _record_input_and_check_streaming(span, kwargs, is_responses=False):
+        """Record input and check for streaming"""
+        is_streaming = kwargs.get("stream", False)

-
+        # Record input based on whether this is a responses endpoint
+        if is_responses:
+            span.record_input(kwargs)
+        else:
+            input_data = _format_input_data(client, **kwargs)
+            span.record_input(input_data)
+
+        # Warn about token counting limitations with streaming
+        if isinstance(client, (AsyncOpenAI, OpenAI)) and is_streaming:
+            if not kwargs.get("stream_options", {}).get("include_usage"):
+                warnings.warn(
+                    "OpenAI streaming calls don't include token counts by default. "
+                    "To enable token counting with streams, set stream_options={'include_usage': True} "
+                    "in your API call arguments.",
+                    UserWarning
+                )
+
+        return is_streaming
+
+    def _format_and_record_output(span, response, is_streaming, is_async, is_responses):
+        """Format and record the output in the span"""
+        if is_streaming:
+            output_entry = span.record_output("<pending stream>")
+            wrapper_func = _async_stream_wrapper if is_async else _sync_stream_wrapper
+            return wrapper_func(response, client, output_entry)
+        else:
+            format_func = _format_response_output_data if is_responses else _format_output_data
+            output_data = format_func(client, response)
+            span.record_output(output_data)
+            return response
+
+    def _handle_error(span, e, is_async):
+        """Handle and record errors"""
+        call_type = "async" if is_async else "sync"
+        print(f"Error during wrapped {call_type} API call ({span_name}): {e}")
+        span.record_output({"error": str(e)})
+        raise
+
+    # --- Traced Async Functions ---
     async def traced_create_async(*args, **kwargs):
-        # [Existing logic - unchanged]
         current_trace = current_trace_var.get()
         if not current_trace:
-            ...
-        else:
-            return original_create(*args, **kwargs)
-
-        is_streaming = kwargs.get("stream", False)
-
+            return await original_create(*args, **kwargs)
+
         with current_trace.span(span_name, span_type="llm") as span:
-            ...
-            # Warn about token counting limitations with streaming
-            if isinstance(client, (AsyncOpenAI, OpenAI)) and is_streaming:
-                if not kwargs.get("stream_options", {}).get("include_usage"):
-                    warnings.warn(
-                        "OpenAI streaming calls don't include token counts by default. "
-                        "To enable token counting with streams, set stream_options={'include_usage': True} "
-                        "in your API call arguments.",
-                        UserWarning
-                    )
-
+            is_streaming = _record_input_and_check_streaming(span, kwargs)
+
             try:
-                ...
-                    output_entry = span.record_output("<pending stream>")
-                    return _async_stream_wrapper(stream_iterator, client, output_entry)
-                else:
-                    awaited_response = await original_create(*args, **kwargs)
-                    output_data = _format_output_data(client, awaited_response)
-                    span.record_output(output_data)
-                    return awaited_response
+                response_or_iterator = await original_create(*args, **kwargs)
+                return _format_and_record_output(span, response_or_iterator, is_streaming, True, False)
             except Exception as e:
-                ...
+                return _handle_error(span, e, True)
+
+    # Async responses for OpenAI clients
+    async def traced_response_create_async(*args, **kwargs):
+        current_trace = current_trace_var.get()
+        if not current_trace:
+            return await original_responses_create(*args, **kwargs)
+
+        with current_trace.span(span_name, span_type="llm") as span:
+            is_streaming = _record_input_and_check_streaming(span, kwargs, is_responses=True)
+
+            try:
+                response_or_iterator = await original_responses_create(*args, **kwargs)
+                return _format_and_record_output(span, response_or_iterator, is_streaming, True, True)
+            except Exception as e:
+                return _handle_error(span, e, True)
+
+    # Function replacing .stream() for async clients
     def traced_stream_async(*args, **kwargs):
         current_trace = current_trace_var.get()
         if not current_trace or not original_stream:
             return original_stream(*args, **kwargs)
+
         original_manager = original_stream(*args, **kwargs)
-        ...
+        return _TracedAsyncStreamManagerWrapper(
             original_manager=original_manager,
             client=client,
             span_name=span_name,
@@ -1328,139 +1369,74 @@ def wrap(client: Any) -> Any:
             stream_wrapper_func=_async_stream_wrapper,
             input_kwargs=kwargs
         )
-
-
-    # --- Define Traced Sync Functions ---
+
+    # --- Traced Sync Functions ---
     def traced_create_sync(*args, **kwargs):
-        # [Existing logic - unchanged]
         current_trace = current_trace_var.get()
         if not current_trace:
-            ...
-        is_streaming = kwargs.get("stream", False)
-
+            return original_create(*args, **kwargs)
+
         with current_trace.span(span_name, span_type="llm") as span:
-            ...
-                        "To enable token counting with streams, set stream_options={'include_usage': True} "
-                        "in your API call arguments.",
-                        UserWarning
-                    )
-
-            try:
-                response_or_iterator = original_create(*args, **kwargs)
-            except Exception as e:
-                print(f"Error during wrapped sync API call ({span_name}): {e}")
-                span.record_output({"error": str(e)})
-                raise
-
-            if is_streaming:
-                output_entry = span.record_output("<pending stream>")
-                return _sync_stream_wrapper(response_or_iterator, client, output_entry)
-            else:
-                output_data = _format_output_data(client, response_or_iterator)
-                span.record_output(output_data)
-                return response_or_iterator
-
-    # --- Define Traced Sync Functions ---
+            is_streaming = _record_input_and_check_streaming(span, kwargs)
+
+            try:
+                response_or_iterator = original_create(*args, **kwargs)
+                return _format_and_record_output(span, response_or_iterator, is_streaming, False, False)
+            except Exception as e:
+                return _handle_error(span, e, False)
+
     def traced_response_create_sync(*args, **kwargs):
-        # [Existing logic - unchanged]
         current_trace = current_trace_var.get()
         if not current_trace:
-            ...
-        is_streaming = kwargs.get("stream", False)
+            return original_responses_create(*args, **kwargs)
+
         with current_trace.span(span_name, span_type="llm") as span:
-            ...
-                        "in your API call arguments.",
-                        UserWarning
-                    )
-
-            try:
-                response_or_iterator = responses_create(*args, **kwargs)
-            except Exception as e:
-                print(f"Error during wrapped sync API call ({span_name}): {e}")
-                span.record_output({"error": str(e)})
-                raise
-            if is_streaming:
-                output_entry = span.record_output("<pending stream>")
-                return _sync_stream_wrapper(response_or_iterator, client, output_entry)
-            else:
-                output_data = _format_response_output_data(client, response_or_iterator)
-                span.record_output(output_data)
-                return response_or_iterator
-
+            is_streaming = _record_input_and_check_streaming(span, kwargs, is_responses=True)
+
+            try:
+                response_or_iterator = original_responses_create(*args, **kwargs)
+                return _format_and_record_output(span, response_or_iterator, is_streaming, False, True)
+            except Exception as e:
+                return _handle_error(span, e, False)
+
     # Function replacing sync .stream()
     def traced_stream_sync(*args, **kwargs):
-        ...
+        current_trace = current_trace_var.get()
+        if not current_trace or not original_stream:
+            return original_stream(*args, **kwargs)
+
+        original_manager = original_stream(*args, **kwargs)
+        return _TracedSyncStreamManagerWrapper(
+            original_manager=original_manager,
+            client=client,
+            span_name=span_name,
+            trace_client=current_trace,
+            stream_wrapper_func=_sync_stream_wrapper,
+            input_kwargs=kwargs
+        )
+
     # --- Assign Traced Methods to Client Instance ---
-    # [Assignment logic remains the same]
     if isinstance(client, (AsyncOpenAI, AsyncTogether)):
         client.chat.completions.create = traced_create_async
-        # Wrap the Responses API endpoint for OpenAI clients
         if hasattr(client, "responses") and hasattr(client.responses, "create"):
-            ...
-            original_responses_create = client.responses.create
-            def traced_responses(*args, **kwargs):
-                # Get the current trace from contextvars
-                current_trace = current_trace_var.get()
-                # If no active trace, call the original
-                if not current_trace:
-                    return original_responses_create(*args, **kwargs)
-                # Trace this responses.create call
-                with current_trace.span(span_name, span_type="llm") as span:
-                    # Record raw input kwargs
-                    span.record_input(kwargs)
-                    # Make the actual API call
-                    response = original_responses_create(*args, **kwargs)
-                    # Record the output object
-                    span.record_output(response)
-                    return response
-            # Assign the traced wrapper
-            client.responses.create = traced_responses
+            client.responses.create = traced_response_create_async
     elif isinstance(client, AsyncAnthropic):
         client.messages.create = traced_create_async
         if original_stream:
-            ...
+            client.messages.stream = traced_stream_async
     elif isinstance(client, genai.client.AsyncClient):
         client.models.generate_content = traced_create_async
     elif isinstance(client, (OpenAI, Together)):
-        ...
+        client.chat.completions.create = traced_create_sync
+        if hasattr(client, "responses") and hasattr(client.responses, "create"):
+            client.responses.create = traced_response_create_sync
     elif isinstance(client, Anthropic):
-        ...
+        client.messages.create = traced_create_sync
+        if original_stream:
+            client.messages.stream = traced_stream_sync
     elif isinstance(client, genai.Client):
-        ...
+        client.models.generate_content = traced_create_sync
+
     return client

 # Helper functions for client-specific operations
@@ -1896,128 +1872,3 @@ class _TracedSyncStreamManagerWrapper(_BaseStreamManagerWrapper, AbstractContext
             current_span_var.reset(self._span_context_token)
             delattr(self, '_span_context_token')
         return self._original_manager.__exit__(exc_type, exc_val, exc_tb)
-
-# --- NEW Generalized Helper Function (Moved from demo) ---
-def prepare_evaluation_for_state(
-    scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
-    example: Optional[Example] = None,
-    # --- Individual components (alternative to 'example') ---
-    input: Optional[str] = None,
-    actual_output: Optional[Union[str, List[str]]] = None,
-    expected_output: Optional[Union[str, List[str]]] = None,
-    context: Optional[List[str]] = None,
-    retrieval_context: Optional[List[str]] = None,
-    tools_called: Optional[List[str]] = None,
-    expected_tools: Optional[List[str]] = None,
-    additional_metadata: Optional[Dict[str, Any]] = None,
-    # --- Other eval parameters ---
-    model: Optional[str] = None,
-    log_results: Optional[bool] = True
-) -> Optional[EvaluationConfig]:
-    """
-    Prepares an EvaluationConfig object, similar to TraceClient.async_evaluate.
-
-    Accepts either a pre-made Example object or individual components to construct one.
-    Returns the EvaluationConfig object ready to be placed in the state, or None.
-    """
-    final_example = example
-
-    # If example is not provided, try to construct one from individual parts
-    if final_example is None:
-        # Basic validation: Ensure at least actual_output is present for most scorers
-        if actual_output is None:
-            # print("[prepare_evaluation_for_state] Warning: 'actual_output' is required when 'example' is not provided. Skipping evaluation setup.")
-            return None
-        try:
-            final_example = Example(
-                input=input,
-                actual_output=actual_output,
-                expected_output=expected_output,
-                context=context,
-                retrieval_context=retrieval_context,
-                tools_called=tools_called,
-                expected_tools=expected_tools,
-                additional_metadata=additional_metadata,
-                # trace_id will be set by the handler later if needed
-            )
-            # print("[prepare_evaluation_for_state] Constructed Example from individual components.")
-        except Exception as e:
-            # print(f"[prepare_evaluation_for_state] Error constructing Example: {e}. Skipping evaluation setup.")
-            return None
-
-    # If we have a valid example (provided or constructed) and scorers
-    if final_example and scorers:
-        # TODO: Add validation like check_examples if needed here,
-        # although the handler might implicitly handle some checks via TraceClient.
-        return EvaluationConfig(
-            scorers=scorers,
-            example=final_example,
-            model=model,
-            log_results=log_results
-        )
-    elif not scorers:
-        # print("[prepare_evaluation_for_state] No scorers provided. Skipping evaluation setup.")
-        return None
-    else: # No valid example
-        # print("[prepare_evaluation_for_state] No valid Example available. Skipping evaluation setup.")
-        return None
-# --- End NEW Helper Function ---
-
-# --- NEW: Helper function to simplify adding eval config to state ---
-def add_evaluation_to_state(
-    state: Dict[str, Any], # The LangGraph state dictionary
-    scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
-    # --- Evaluation components (same as prepare_evaluation_for_state) ---
-    input: Optional[str] = None,
-    actual_output: Optional[Union[str, List[str]]] = None,
-    expected_output: Optional[Union[str, List[str]]] = None,
-    context: Optional[List[str]] = None,
-    retrieval_context: Optional[List[str]] = None,
-    tools_called: Optional[List[str]] = None,
-    expected_tools: Optional[List[str]] = None,
-    additional_metadata: Optional[Dict[str, Any]] = None,
-    # --- Other eval parameters ---
-    model: Optional[str] = None,
-    log_results: Optional[bool] = True
-) -> None:
-    """
-    Prepares an EvaluationConfig and adds it to the state dictionary
-    under the '_judgeval_eval' key if successful.
-
-    This simplifies the process of setting up evaluations within LangGraph nodes.
-
-    Args:
-        state: The LangGraph state dictionary to modify.
-        scorers: List of scorer instances.
-        input: Input for the evaluation example.
-        actual_output: Actual output for the evaluation example.
-        expected_output: Expected output for the evaluation example.
-        context: Context for the evaluation example.
-        retrieval_context: Retrieval context for the evaluation example.
-        tools_called: Tools called for the evaluation example.
-        expected_tools: Expected tools for the evaluation example.
-        additional_metadata: Additional metadata for the evaluation example.
-        model: Model name used for generation (optional).
-        log_results: Whether to log evaluation results (optional, defaults to True).
-    """
-    eval_config = prepare_evaluation_for_state(
-        scorers=scorers,
-        input=input,
-        actual_output=actual_output,
-        expected_output=expected_output,
-        context=context,
-        retrieval_context=retrieval_context,
-        tools_called=tools_called,
-        expected_tools=expected_tools,
-        additional_metadata=additional_metadata,
-        model=model,
-        log_results=log_results
-    )
-
-    if eval_config:
-        state["_judgeval_eval"] = eval_config
-        # print(f"[_judgeval_eval added to state for node]") # Optional: Log confirmation
-    ...
-    # print("[Skipped adding _judgeval_eval to state: prepare_evaluation_for_state failed]")
-# --- End NEW Helper ---
-
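Taken together, the tracer.py changes thread an `offline_mode` flag from `Tracer.__init__` through `TraceClient.save()` into `TraceManagerClient.save_trace()` (suppressing the "View Trace" console link) and refactor `wrap()` so OpenAI `responses.create` calls share the same record/format/error helpers as `chat.completions.create`. A minimal usage sketch under stated assumptions: the API key is read from the environment, and constructor parameters other than `offline_mode`, `deep_tracing`, and `project_name` keep their existing defaults.

    from openai import OpenAI
    from judgeval.common.tracer import Tracer, wrap

    tracer = Tracer(
        project_name="my-project",  # the first Tracer initialization pins the project name (see warning above)
        offline_mode=True,          # new in 0.0.38: saved trace payloads carry "offline_mode": True
                                    # and the "View Trace" link is not printed on save
    )

    client = wrap(OpenAI())  # patches chat.completions.create and, if present, responses.create on this client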
judgeval/common/utils.py
CHANGED
@@ -765,7 +765,7 @@ if __name__ == "__main__":
     # Batched single completion to multiple models
     pprint.pprint(get_completion_multiple_models(
         models=[
-            "LLAMA3_70B_INSTRUCT_TURBO", "LLAMA3_405B_INSTRUCT_TURBO", "gpt-
+            "LLAMA3_70B_INSTRUCT_TURBO", "LLAMA3_405B_INSTRUCT_TURBO", "gpt-4.1-mini"
         ],
         messages=[
             [
judgeval/constants.py
CHANGED
@@ -40,17 +40,15 @@ UNBOUNDED_SCORERS = set([APIScorer.COMPARISON]) # scorers whose scores are not
 ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
 # API URLs
 JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
-...
+JUDGMENT_TRACE_EVAL_API_URL = f"{ROOT_API}/evaluate_trace/"
 JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
 JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL = f"{ROOT_API}/datasets/insert_examples/"
-JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL = f"{ROOT_API}/datasets/insert_sequences/"
 JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull_for_judgeval/"
 JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
 JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
 JUDGMENT_DATASETS_PROJECT_STATS_API_URL = f"{ROOT_API}/datasets/fetch_stats_by_project/"
 JUDGMENT_DATASETS_INSERT_API_URL = f"{ROOT_API}/datasets/insert_examples/"
 JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
-JUDGMENT_RETRIEVE_SEQUENCE_FROM_TRACE_API_URL = f"{ROOT_API}/traces/convert_trace_to_sequence/"
 JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_experiment_run/"
 JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_names/"
 JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
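The constants change swaps the sequence-oriented endpoints for a single trace-evaluation endpoint. A quick sanity-check sketch against 0.0.38; it relies only on the names shown in the diff above.

    from judgeval import constants

    assert constants.JUDGMENT_TRACE_EVAL_API_URL.endswith("/evaluate_trace/")
    # The sequence endpoints are removed in 0.0.38:
    assert not hasattr(constants, "JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL")
    assert not hasattr(constants, "JUDGMENT_RETRIEVE_SEQUENCE_FROM_TRACE_API_URL")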
judgeval/data/__init__.py
CHANGED
@@ -2,7 +2,6 @@ from judgeval.data.example import Example, ExampleParams
 from judgeval.data.custom_example import CustomExample
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
-from judgeval.data.sequence import Sequence
 from judgeval.data.trace import Trace, TraceSpan
@@ -14,7 +13,6 @@ __all__ = [
     "create_scorer_data",
     "ScoringResult",
     "generate_scoring_result",
-    "Sequence",
     "Trace",
     "TraceSpan",
 ]
judgeval/data/datasets/dataset.py
CHANGED
@@ -7,13 +7,12 @@ import yaml
 from dataclasses import dataclass, field
 from typing import List, Union, Literal

-from judgeval.data import Example
+from judgeval.data import Example
 from judgeval.common.logger import debug, error, warning, info

 @dataclass
 class EvalDataset:
     examples: List[Example]
-    sequences: List[Sequence]
     _alias: Union[str, None] = field(default=None)
     _id: Union[str, None] = field(default=None)
     judgment_api_key: str = field(default="")
@@ -22,13 +21,11 @@ class EvalDataset:
         judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"),
         organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
         examples: List[Example] = [],
-        sequences: List[Sequence] = []
     ):
         debug(f"Initializing EvalDataset with {len(examples)} examples")
         if not judgment_api_key:
             warning("No judgment_api_key provided")
         self.examples = examples
-        self.sequences = sequences
         self._alias = None
         self._id = None
         self.judgment_api_key = judgment_api_key
@@ -223,10 +220,7 @@ class EvalDataset:
     def add_example(self, e: Example) -> None:
         self.examples = self.examples + [e]
         # TODO if we need to add rank, then we need to do it here
-
-    def add_sequence(self, s: Sequence) -> None:
-        self.sequences = self.sequences + [s]
-
+
     def save_as(self, file_type: Literal["json", "csv", "yaml"], dir_path: str, save_name: str = None) -> None:
         """
         Saves the dataset as a file. Save only the examples.
@@ -313,7 +307,6 @@ class EvalDataset:
         return (
             f"{self.__class__.__name__}("
             f"examples={self.examples}, "
-            f"sequences={self.sequences}, "
             f"_alias={self._alias}, "
             f"_id={self._id}"
             f")"