judgeval 0.0.37__py3-none-any.whl → 0.0.39__py3-none-any.whl

This diff compares the published contents of the two package versions as they appear in their public registry. It is provided for informational purposes only.
judgeval/common/tracer.py CHANGED
@@ -146,7 +146,7 @@ class TraceManagerClient:
 
         return response.json()
 
-    def save_trace(self, trace_data: dict):
+    def save_trace(self, trace_data: dict, offline_mode: bool = False):
         """
         Saves a trace to the Judgment Supabase and optionally to S3 if configured.
 
@@ -183,7 +183,7 @@ class TraceManagerClient:
         except Exception as e:
             warnings.warn(f"Failed to save trace to S3: {str(e)}")
 
-        if "ui_results_url" in response.json():
+        if not offline_mode and "ui_results_url" in response.json():
             pretty_str = f"\n🔍 You can view your trace data here: [rgb(106,0,255)][link={response.json()['ui_results_url']}]View Trace[/link]\n"
             rprint(pretty_str)
 
@@ -314,6 +314,7 @@ class TraceClient:
         self.executed_tools = []
         self.executed_node_tools = []
         self._span_depths: Dict[str, int] = {} # NEW: To track depth of active spans
+
     def get_current_span(self):
         """Get the current span from the context var"""
         return current_span_var.get()
@@ -428,7 +429,8 @@ class TraceClient:
         # span_id_at_eval_call = current_span_var.get()
         # print(f"[TraceClient.async_evaluate] Captured span ID at eval call: {span_id_at_eval_call}")
         # Prioritize explicitly passed span_id, fallback to context var
-        span_id_to_use = span_id if span_id is not None else current_span_var.get()
+        current_span_ctx_var = current_span_var.get()
+        span_id_to_use = span_id if span_id is not None else current_span_ctx_var if current_span_ctx_var is not None else self.tracer.get_current_span()
         # print(f"[TraceClient.async_evaluate] Using span_id: {span_id_to_use}")
         # --- End Modification ---
 
@@ -438,7 +440,7 @@ class TraceClient:
             log_results=log_results,
             project_name=self.project_name,
             eval_name=f"{self.name.capitalize()}-"
-                      f"{current_span_var.get()}-" # Keep original eval name format using context var if available
+                      f"{span_id_to_use}-" # Keep original eval name format using context var if available
                       f"[{','.join(scorer.score_type.capitalize() for scorer in scorers)}]",
             examples=[example],
             scorers=scorers,
@@ -658,11 +660,12 @@ class TraceClient:
             "entries": [span.model_dump() for span in self.trace_spans],
             "evaluation_runs": [run.model_dump() for run in self.evaluation_runs],
             "overwrite": overwrite,
+            "offline_mode": self.tracer.offline_mode,
             "parent_trace_id": self.parent_trace_id,
             "parent_name": self.parent_name
         }
         # --- Log trace data before saving ---
-        self.trace_manager_client.save_trace(trace_data)
+        self.trace_manager_client.save_trace(trace_data, offline_mode=self.tracer.offline_mode)
 
         # upload annotations
         # TODO: batch to the log endpoint
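Taken together, these hunks thread the new flag through the whole save path: it is stored in the payload and also passed to TraceManagerClient.save_trace, which (per the hunk at line 186) only prints the "View Trace" link when the flag is off. A minimal sketch of the resulting behavior, assuming a configured TraceManagerClient named manager (hypothetical variable):

    # Online (default): prints the "View Trace" link if the server returns one
    manager.save_trace(trace_data)

    # Offline: the trace is still saved (and mirrored to S3 if configured),
    # but the console link is suppressed
    manager.save_trace(trace_data, offline_mode=True)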
@@ -928,6 +931,7 @@ class Tracer:
         s3_aws_access_key_id: Optional[str] = None,
         s3_aws_secret_access_key: Optional[str] = None,
         s3_region_name: Optional[str] = None,
+        offline_mode: bool = False,
         deep_tracing: bool = True # Deep tracing is enabled by default
     ):
         if not hasattr(self, 'initialized'):
@@ -968,6 +972,7 @@ class Tracer:
                 aws_secret_access_key=s3_aws_secret_access_key,
                 region_name=s3_region_name
             )
+            self.offline_mode: bool = offline_mode
             self.deep_tracing: bool = deep_tracing # NEW: Store deep tracing setting
 
         elif hasattr(self, 'project_name') and self.project_name != project_name:
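With the two additions above, offline_mode is accepted at construction time and stored on the singleton. A hedged usage sketch (the project name is illustrative; other keyword arguments keep their defaults):

    from judgeval.common.tracer import Tracer

    tracer = Tracer(
        project_name="my_project",  # illustrative
        offline_mode=True,          # suppress UI links; also sent in the saved trace payload
    )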
@@ -977,6 +982,12 @@ class Tracer:
                 "To use a different project name, ensure the first Tracer initialization uses the desired project name.",
                 RuntimeWarning
             )
+
+    def set_current_span(self, span_id: str):
+        self.current_span_id = span_id
+
+    def get_current_span(self) -> Optional[str]:
+        return getattr(self, 'current_span_id', None)
 
     def set_current_trace(self, trace: TraceClient):
         """
@@ -1263,64 +1274,94 @@ class Tracer:
     else:
         warnings.warn("No trace found (context var or fallback), skipping evaluation") # Modified warning
 
-
 def wrap(client: Any) -> Any:
     """
     Wraps an API client to add tracing capabilities.
     Supports OpenAI, Together, Anthropic, and Google GenAI clients.
     Patches both '.create' and Anthropic's '.stream' methods using a wrapper class.
     """
-    span_name, original_create, responses_create, original_stream = _get_client_config(client)
+    span_name, original_create, original_responses_create, original_stream = _get_client_config(client)
+
+    def _record_input_and_check_streaming(span, kwargs, is_responses=False):
+        """Record input and check for streaming"""
+        is_streaming = kwargs.get("stream", False)
 
-    # --- Define Traced Async Functions ---
+        # Record input based on whether this is a responses endpoint
+        if is_responses:
+            span.record_input(kwargs)
+        else:
+            input_data = _format_input_data(client, **kwargs)
+            span.record_input(input_data)
+
+        # Warn about token counting limitations with streaming
+        if isinstance(client, (AsyncOpenAI, OpenAI)) and is_streaming:
+            if not kwargs.get("stream_options", {}).get("include_usage"):
+                warnings.warn(
+                    "OpenAI streaming calls don't include token counts by default. "
+                    "To enable token counting with streams, set stream_options={'include_usage': True} "
+                    "in your API call arguments.",
+                    UserWarning
+                )
+
+        return is_streaming
+
+    def _format_and_record_output(span, response, is_streaming, is_async, is_responses):
+        """Format and record the output in the span"""
+        if is_streaming:
+            output_entry = span.record_output("<pending stream>")
+            wrapper_func = _async_stream_wrapper if is_async else _sync_stream_wrapper
+            return wrapper_func(response, client, output_entry)
+        else:
+            format_func = _format_response_output_data if is_responses else _format_output_data
+            output_data = format_func(client, response)
+            span.record_output(output_data)
+            return response
+
+    def _handle_error(span, e, is_async):
+        """Handle and record errors"""
+        call_type = "async" if is_async else "sync"
+        print(f"Error during wrapped {call_type} API call ({span_name}): {e}")
+        span.record_output({"error": str(e)})
+        raise
+
+    # --- Traced Async Functions ---
     async def traced_create_async(*args, **kwargs):
-        # [Existing logic - unchanged]
         current_trace = current_trace_var.get()
         if not current_trace:
-            if asyncio.iscoroutinefunction(original_create):
-                return await original_create(*args, **kwargs)
-            else:
-                return original_create(*args, **kwargs)
-
-        is_streaming = kwargs.get("stream", False)
-
+            return await original_create(*args, **kwargs)
+
         with current_trace.span(span_name, span_type="llm") as span:
-            input_data = _format_input_data(client, **kwargs)
-            span.record_input(input_data)
-
-            # Warn about token counting limitations with streaming
-            if isinstance(client, (AsyncOpenAI, OpenAI)) and is_streaming:
-                if not kwargs.get("stream_options", {}).get("include_usage"):
-                    warnings.warn(
-                        "OpenAI streaming calls don't include token counts by default. "
-                        "To enable token counting with streams, set stream_options={'include_usage': True} "
-                        "in your API call arguments.",
-                        UserWarning
-                    )
-
+            is_streaming = _record_input_and_check_streaming(span, kwargs)
+
             try:
-                if is_streaming:
-                    stream_iterator = await original_create(*args, **kwargs)
-                    output_entry = span.record_output("<pending stream>")
-                    return _async_stream_wrapper(stream_iterator, client, output_entry)
-                else:
-                    awaited_response = await original_create(*args, **kwargs)
-                    output_data = _format_output_data(client, awaited_response)
-                    span.record_output(output_data)
-                    return awaited_response
+                response_or_iterator = await original_create(*args, **kwargs)
+                return _format_and_record_output(span, response_or_iterator, is_streaming, True, False)
             except Exception as e:
-                print(f"Error during wrapped async API call ({span_name}): {e}")
-                span.record_output({"error": str(e)})
-                raise
-
-
-    # Function replacing .stream() - NOW returns the wrapper class instance
+                return _handle_error(span, e, True)
+
+    # Async responses for OpenAI clients
+    async def traced_response_create_async(*args, **kwargs):
+        current_trace = current_trace_var.get()
+        if not current_trace:
+            return await original_responses_create(*args, **kwargs)
+
+        with current_trace.span(span_name, span_type="llm") as span:
+            is_streaming = _record_input_and_check_streaming(span, kwargs, is_responses=True)
+
+            try:
+                response_or_iterator = await original_responses_create(*args, **kwargs)
+                return _format_and_record_output(span, response_or_iterator, is_streaming, True, True)
+            except Exception as e:
+                return _handle_error(span, e, True)
+
+    # Function replacing .stream() for async clients
     def traced_stream_async(*args, **kwargs):
         current_trace = current_trace_var.get()
         if not current_trace or not original_stream:
             return original_stream(*args, **kwargs)
+
         original_manager = original_stream(*args, **kwargs)
-        wrapper_manager = _TracedAsyncStreamManagerWrapper(
+        return _TracedAsyncStreamManagerWrapper(
             original_manager=original_manager,
             client=client,
             span_name=span_name,
@@ -1328,139 +1369,74 @@ def wrap(client: Any) -> Any:
             stream_wrapper_func=_async_stream_wrapper,
             input_kwargs=kwargs
         )
-        return wrapper_manager
-
-    # --- Define Traced Sync Functions ---
+
+    # --- Traced Sync Functions ---
     def traced_create_sync(*args, **kwargs):
-        # [Existing logic - unchanged]
         current_trace = current_trace_var.get()
         if not current_trace:
-            return original_create(*args, **kwargs)
-
-        is_streaming = kwargs.get("stream", False)
-
+            return original_create(*args, **kwargs)
+
         with current_trace.span(span_name, span_type="llm") as span:
-            input_data = _format_input_data(client, **kwargs)
-            span.record_input(input_data)
-
-            # Warn about token counting limitations with streaming
-            if isinstance(client, (AsyncOpenAI, OpenAI)) and is_streaming:
-                if not kwargs.get("stream_options", {}).get("include_usage"):
-                    warnings.warn(
-                        "OpenAI streaming calls don't include token counts by default. "
-                        "To enable token counting with streams, set stream_options={'include_usage': True} "
-                        "in your API call arguments.",
-                        UserWarning
-                    )
-
-            try:
-                response_or_iterator = original_create(*args, **kwargs)
-            except Exception as e:
-                print(f"Error during wrapped sync API call ({span_name}): {e}")
-                span.record_output({"error": str(e)})
-                raise
-
-            if is_streaming:
-                output_entry = span.record_output("<pending stream>")
-                return _sync_stream_wrapper(response_or_iterator, client, output_entry)
-            else:
-                output_data = _format_output_data(client, response_or_iterator)
-                span.record_output(output_data)
-                return response_or_iterator
-
-    # --- Define Traced Sync Functions ---
+            is_streaming = _record_input_and_check_streaming(span, kwargs)
+
+            try:
+                response_or_iterator = original_create(*args, **kwargs)
+                return _format_and_record_output(span, response_or_iterator, is_streaming, False, False)
+            except Exception as e:
+                return _handle_error(span, e, False)
+
     def traced_response_create_sync(*args, **kwargs):
-        # [Existing logic - unchanged]
         current_trace = current_trace_var.get()
         if not current_trace:
-            return responses_create(*args, **kwargs)
-
-        is_streaming = kwargs.get("stream", False)
+            return original_responses_create(*args, **kwargs)
+
         with current_trace.span(span_name, span_type="llm") as span:
-            span.record_input(kwargs)
-
-            # Warn about token counting limitations with streaming
-            if isinstance(client, (AsyncOpenAI, OpenAI)) and is_streaming:
-                if not kwargs.get("stream_options", {}).get("include_usage"):
-                    warnings.warn(
-                        "OpenAI streaming calls don't include token counts by default. "
-                        "To enable token counting with streams, set stream_options={'include_usage': True} "
-                        "in your API call arguments.",
-                        UserWarning
-                    )
-
-            try:
-                response_or_iterator = responses_create(*args, **kwargs)
-            except Exception as e:
-                print(f"Error during wrapped sync API call ({span_name}): {e}")
-                span.record_output({"error": str(e)})
-                raise
-            if is_streaming:
-                output_entry = span.record_output("<pending stream>")
-                return _sync_stream_wrapper(response_or_iterator, client, output_entry)
-            else:
-                output_data = _format_response_output_data(client, response_or_iterator)
-                span.record_output(output_data)
-                return response_or_iterator
-
+            is_streaming = _record_input_and_check_streaming(span, kwargs, is_responses=True)
+
+            try:
+                response_or_iterator = original_responses_create(*args, **kwargs)
+                return _format_and_record_output(span, response_or_iterator, is_streaming, False, True)
+            except Exception as e:
+                return _handle_error(span, e, False)
+
     # Function replacing sync .stream()
     def traced_stream_sync(*args, **kwargs):
-        current_trace = current_trace_var.get()
-        if not current_trace or not original_stream:
-            return original_stream(*args, **kwargs)
-        original_manager = original_stream(*args, **kwargs)
-        wrapper_manager = _TracedSyncStreamManagerWrapper(
-            original_manager=original_manager,
-            client=client,
-            span_name=span_name,
-            trace_client=current_trace,
-            stream_wrapper_func=_sync_stream_wrapper,
-            input_kwargs=kwargs
-        )
-        return wrapper_manager
-
-
+        current_trace = current_trace_var.get()
+        if not current_trace or not original_stream:
+            return original_stream(*args, **kwargs)
+
+        original_manager = original_stream(*args, **kwargs)
+        return _TracedSyncStreamManagerWrapper(
+            original_manager=original_manager,
+            client=client,
+            span_name=span_name,
+            trace_client=current_trace,
+            stream_wrapper_func=_sync_stream_wrapper,
+            input_kwargs=kwargs
+        )
+
     # --- Assign Traced Methods to Client Instance ---
-    # [Assignment logic remains the same]
     if isinstance(client, (AsyncOpenAI, AsyncTogether)):
         client.chat.completions.create = traced_create_async
-        # Wrap the Responses API endpoint for OpenAI clients
         if hasattr(client, "responses") and hasattr(client.responses, "create"):
-            # Capture the original responses.create
-            original_responses_create = client.responses.create
-            def traced_responses(*args, **kwargs):
-                # Get the current trace from contextvars
-                current_trace = current_trace_var.get()
-                # If no active trace, call the original
-                if not current_trace:
-                    return original_responses_create(*args, **kwargs)
-                # Trace this responses.create call
-                with current_trace.span(span_name, span_type="llm") as span:
-                    # Record raw input kwargs
-                    span.record_input(kwargs)
-                    # Make the actual API call
-                    response = original_responses_create(*args, **kwargs)
-                    # Record the output object
-                    span.record_output(response)
-                    return response
-            # Assign the traced wrapper
-            client.responses.create = traced_responses
+            client.responses.create = traced_response_create_async
     elif isinstance(client, AsyncAnthropic):
         client.messages.create = traced_create_async
         if original_stream:
-            client.messages.stream = traced_stream_async
+            client.messages.stream = traced_stream_async
     elif isinstance(client, genai.client.AsyncClient):
         client.models.generate_content = traced_create_async
     elif isinstance(client, (OpenAI, Together)):
-        client.chat.completions.create = traced_create_sync
-        client.responses.create = traced_response_create_sync
+        client.chat.completions.create = traced_create_sync
+        if hasattr(client, "responses") and hasattr(client.responses, "create"):
+            client.responses.create = traced_response_create_sync
     elif isinstance(client, Anthropic):
-        client.messages.create = traced_create_sync
-        if original_stream:
-            client.messages.stream = traced_stream_sync
+        client.messages.create = traced_create_sync
+        if original_stream:
+            client.messages.stream = traced_stream_sync
     elif isinstance(client, genai.Client):
-        client.models.generate_content = traced_create_sync
-
+        client.models.generate_content = traced_create_sync
+
     return client
 
 # Helper functions for client-specific operations
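For orientation, the refactor does not change how wrap is used: it still patches the client instance in place and returns it. A hedged usage sketch (model name and prompt are illustrative):

    from openai import OpenAI
    from judgeval.common.tracer import wrap

    client = wrap(OpenAI())  # patches chat.completions.create, and responses.create when present

    # Inside an active trace this call is recorded as an "llm" span;
    # with no active trace it falls through to the original method.
    response = client.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[{"role": "user", "content": "Hello"}],
    )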
@@ -1896,128 +1872,3 @@ class _TracedSyncStreamManagerWrapper(_BaseStreamManagerWrapper, AbstractContext
         current_span_var.reset(self._span_context_token)
         delattr(self, '_span_context_token')
         return self._original_manager.__exit__(exc_type, exc_val, exc_tb)
-
-# --- NEW Generalized Helper Function (Moved from demo) ---
-def prepare_evaluation_for_state(
-    scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
-    example: Optional[Example] = None,
-    # --- Individual components (alternative to 'example') ---
-    input: Optional[str] = None,
-    actual_output: Optional[Union[str, List[str]]] = None,
-    expected_output: Optional[Union[str, List[str]]] = None,
-    context: Optional[List[str]] = None,
-    retrieval_context: Optional[List[str]] = None,
-    tools_called: Optional[List[str]] = None,
-    expected_tools: Optional[List[str]] = None,
-    additional_metadata: Optional[Dict[str, Any]] = None,
-    # --- Other eval parameters ---
-    model: Optional[str] = None,
-    log_results: Optional[bool] = True
-) -> Optional[EvaluationConfig]:
-    """
-    Prepares an EvaluationConfig object, similar to TraceClient.async_evaluate.
-
-    Accepts either a pre-made Example object or individual components to construct one.
-    Returns the EvaluationConfig object ready to be placed in the state, or None.
-    """
-    final_example = example
-
-    # If example is not provided, try to construct one from individual parts
-    if final_example is None:
-        # Basic validation: Ensure at least actual_output is present for most scorers
-        if actual_output is None:
-            # print("[prepare_evaluation_for_state] Warning: 'actual_output' is required when 'example' is not provided. Skipping evaluation setup.")
-            return None
-        try:
-            final_example = Example(
-                input=input,
-                actual_output=actual_output,
-                expected_output=expected_output,
-                context=context,
-                retrieval_context=retrieval_context,
-                tools_called=tools_called,
-                expected_tools=expected_tools,
-                additional_metadata=additional_metadata,
-                # trace_id will be set by the handler later if needed
-            )
-            # print("[prepare_evaluation_for_state] Constructed Example from individual components.")
-        except Exception as e:
-            # print(f"[prepare_evaluation_for_state] Error constructing Example: {e}. Skipping evaluation setup.")
-            return None
-
-    # If we have a valid example (provided or constructed) and scorers
-    if final_example and scorers:
-        # TODO: Add validation like check_examples if needed here,
-        # although the handler might implicitly handle some checks via TraceClient.
-        return EvaluationConfig(
-            scorers=scorers,
-            example=final_example,
-            model=model,
-            log_results=log_results
-        )
-    elif not scorers:
-        # print("[prepare_evaluation_for_state] No scorers provided. Skipping evaluation setup.")
-        return None
-    else: # No valid example
-        # print("[prepare_evaluation_for_state] No valid Example available. Skipping evaluation setup.")
-        return None
-# --- End NEW Helper Function ---
-
-# --- NEW: Helper function to simplify adding eval config to state ---
-def add_evaluation_to_state(
-    state: Dict[str, Any], # The LangGraph state dictionary
-    scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
-    # --- Evaluation components (same as prepare_evaluation_for_state) ---
-    input: Optional[str] = None,
-    actual_output: Optional[Union[str, List[str]]] = None,
-    expected_output: Optional[Union[str, List[str]]] = None,
-    context: Optional[List[str]] = None,
-    retrieval_context: Optional[List[str]] = None,
-    tools_called: Optional[List[str]] = None,
-    expected_tools: Optional[List[str]] = None,
-    additional_metadata: Optional[Dict[str, Any]] = None,
-    # --- Other eval parameters ---
-    model: Optional[str] = None,
-    log_results: Optional[bool] = True
-) -> None:
-    """
-    Prepares an EvaluationConfig and adds it to the state dictionary
-    under the '_judgeval_eval' key if successful.
-
-    This simplifies the process of setting up evaluations within LangGraph nodes.
-
-    Args:
-        state: The LangGraph state dictionary to modify.
-        scorers: List of scorer instances.
-        input: Input for the evaluation example.
-        actual_output: Actual output for the evaluation example.
-        expected_output: Expected output for the evaluation example.
-        context: Context for the evaluation example.
-        retrieval_context: Retrieval context for the evaluation example.
-        tools_called: Tools called for the evaluation example.
-        expected_tools: Expected tools for the evaluation example.
-        additional_metadata: Additional metadata for the evaluation example.
-        model: Model name used for generation (optional).
-        log_results: Whether to log evaluation results (optional, defaults to True).
-    """
-    eval_config = prepare_evaluation_for_state(
-        scorers=scorers,
-        input=input,
-        actual_output=actual_output,
-        expected_output=expected_output,
-        context=context,
-        retrieval_context=retrieval_context,
-        tools_called=tools_called,
-        expected_tools=expected_tools,
-        additional_metadata=additional_metadata,
-        model=model,
-        log_results=log_results
-    )
-
-    if eval_config:
-        state["_judgeval_eval"] = eval_config
-        # print(f"[_judgeval_eval added to state for node]") # Optional: Log confirmation
-
-    # print("[Skipped adding _judgeval_eval to state: prepare_evaluation_for_state failed]")
-# --- End NEW Helper ---
-
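With the LangGraph state helpers removed from this module, trace-time evaluation goes through TraceClient.async_evaluate (whose span-id resolution changed above). A hedged sketch of that surviving path, assuming an active trace_client and the example-based call form described in the removed docstring (scorer choice is illustrative):

    from judgeval.data import Example
    from judgeval.scorers import AnswerRelevancyScorer  # illustrative scorer

    trace_client.async_evaluate(
        scorers=[AnswerRelevancyScorer(threshold=0.7)],
        example=Example(input="What is the capital of France?", actual_output="Paris"),
        model="gpt-4.1-mini",
        log_results=True,
    )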
judgeval/common/utils.py CHANGED
@@ -765,7 +765,7 @@ if __name__ == "__main__":
     # Batched single completion to multiple models
     pprint.pprint(get_completion_multiple_models(
         models=[
-            "LLAMA3_70B_INSTRUCT_TURBO", "LLAMA3_405B_INSTRUCT_TURBO", "gpt-4o-mini"
+            "LLAMA3_70B_INSTRUCT_TURBO", "LLAMA3_405B_INSTRUCT_TURBO", "gpt-4.1-mini"
         ],
         messages=[
             [
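For reference, a self-contained sketch of the batched demo call after the model swap (message content is illustrative; per the nested-list shape in the hunk, one message list is assumed per model):

    import pprint
    from judgeval.common.utils import get_completion_multiple_models

    pprint.pprint(get_completion_multiple_models(
        models=["LLAMA3_70B_INSTRUCT_TURBO", "LLAMA3_405B_INSTRUCT_TURBO", "gpt-4.1-mini"],
        messages=[
            [{"role": "user", "content": "What is the capital of France?"}],
        ] * 3,  # one copy per model
    ))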
judgeval/constants.py CHANGED
@@ -40,17 +40,15 @@ UNBOUNDED_SCORERS = set([APIScorer.COMPARISON]) # scorers whose scores are not
 ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
 # API URLs
 JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
-JUDGMENT_SEQUENCE_EVAL_API_URL = f"{ROOT_API}/evaluate_sequence/"
+JUDGMENT_TRACE_EVAL_API_URL = f"{ROOT_API}/evaluate_trace/"
 JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
 JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL = f"{ROOT_API}/datasets/insert_examples/"
-JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL = f"{ROOT_API}/datasets/insert_sequences/"
 JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull_for_judgeval/"
 JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
 JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
 JUDGMENT_DATASETS_PROJECT_STATS_API_URL = f"{ROOT_API}/datasets/fetch_stats_by_project/"
 JUDGMENT_DATASETS_INSERT_API_URL = f"{ROOT_API}/datasets/insert_examples/"
 JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
-JUDGMENT_RETRIEVE_SEQUENCE_FROM_TRACE_API_URL = f"{ROOT_API}/traces/convert_trace_to_sequence/"
 JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_experiment_run/"
 JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_names/"
 JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
@@ -61,6 +59,7 @@ JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
 JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
 JUDGMENT_TRACES_ADD_ANNOTATION_API_URL = f"{ROOT_API}/traces/add_annotation/"
 JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
+JUDGMENT_GET_EVAL_STATUS_API_URL = f"{ROOT_API}/get_evaluation_status/"
 # RabbitMQ
 RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
 RABBITMQ_PORT = os.getenv("RABBITMQ_PORT", 5672)
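All of these endpoints, including the new JUDGMENT_GET_EVAL_STATUS_API_URL, are composed from ROOT_API, so they follow the JUDGMENT_API_URL override. A small sketch (the self-hosted URL is hypothetical; the override must be set before judgeval.constants is first imported, since the URLs are computed at import time):

    import os
    os.environ["JUDGMENT_API_URL"] = "https://judgment.internal.example.com"

    from judgeval import constants
    # constants.JUDGMENT_GET_EVAL_STATUS_API_URL
    #   -> "https://judgment.internal.example.com/get_evaluation_status/"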
judgeval/data/__init__.py CHANGED
@@ -2,7 +2,6 @@ from judgeval.data.example import Example, ExampleParams
 from judgeval.data.custom_example import CustomExample
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
-from judgeval.data.sequence import Sequence
 from judgeval.data.trace import Trace, TraceSpan
 
 
@@ -14,7 +13,6 @@ __all__ = [
     "create_scorer_data",
     "ScoringResult",
     "generate_scoring_result",
-    "Sequence",
     "Trace",
     "TraceSpan",
 ]
@@ -7,13 +7,12 @@ import yaml
 from dataclasses import dataclass, field
 from typing import List, Union, Literal
 
-from judgeval.data import Example, Sequence
+from judgeval.data import Example
 from judgeval.common.logger import debug, error, warning, info
 
 @dataclass
 class EvalDataset:
     examples: List[Example]
-    sequences: List[Sequence]
     _alias: Union[str, None] = field(default=None)
     _id: Union[str, None] = field(default=None)
     judgment_api_key: str = field(default="")
@@ -22,13 +21,11 @@ class EvalDataset:
         judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"),
         organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
         examples: List[Example] = [],
-        sequences: List[Sequence] = []
     ):
         debug(f"Initializing EvalDataset with {len(examples)} examples")
         if not judgment_api_key:
             warning("No judgment_api_key provided")
         self.examples = examples
-        self.sequences = sequences
         self._alias = None
         self._id = None
         self.judgment_api_key = judgment_api_key
@@ -223,10 +220,7 @@ class EvalDataset:
     def add_example(self, e: Example) -> None:
         self.examples = self.examples + [e]
         # TODO if we need to add rank, then we need to do it here
-
-    def add_sequence(self, s: Sequence) -> None:
-        self.sequences = self.sequences + [s]
-
+
     def save_as(self, file_type: Literal["json", "csv", "yaml"], dir_path: str, save_name: str = None) -> None:
         """
         Saves the dataset as a file. Save only the examples.
@@ -313,7 +307,6 @@ class EvalDataset:
         return (
             f"{self.__class__.__name__}("
             f"examples={self.examples}, "
-            f"sequences={self.sequences}, "
             f"_alias={self._alias}, "
             f"_id={self._id}"
             f")"