judgeval 0.0.31__py3-none-any.whl → 0.0.32__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between those package versions.
judgeval/__init__.py CHANGED
@@ -1,10 +1,12 @@
  # Import key components that should be publicly accessible
  from judgeval.clients import client, together_client
  from judgeval.judgment_client import JudgmentClient
+ from judgeval.version_check import check_latest_version
+ check_latest_version()
 
  __all__ = [
      # Clients
      'client',
      'together_client',
      'JudgmentClient',
- ]
+ ]
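
With the two added lines, simply importing the package now starts the update check in the background. A minimal sketch of the observable behavior (see judgeval/version_check.py later in this diff for the implementation):

    import judgeval  # also spawns a daemon thread that asks PyPI for the latest version
    # The import returns immediately; a notice prints later only if a newer release exists.
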
judgeval/common/tracer.py CHANGED
@@ -11,11 +11,12 @@ import time
  import uuid
  import warnings
  import contextvars
+ import sys
  from contextlib import contextmanager
  from dataclasses import dataclass, field
  from datetime import datetime
  from http import HTTPStatus
- from typing import Any, Dict, Generator, List, Literal, Optional, Tuple, TypeAlias, Union, Callable, Awaitable
+ from typing import Any, Dict, Generator, List, Literal, Optional, Tuple, TypeAlias, Union, Callable, Awaitable, Set
  from rich import print as rprint
 
  # Third-party imports
@@ -27,6 +28,7 @@ from rich import print as rprint
  from openai import OpenAI, AsyncOpenAI
  from together import Together, AsyncTogether
  from anthropic import Anthropic, AsyncAnthropic
+ from google import genai
 
  # Local application/library-specific imports
  from judgeval.constants import (
@@ -50,10 +52,11 @@ import concurrent.futures
 
  # Define context variables for tracking the current trace and the current span within a trace
  current_trace_var = contextvars.ContextVar('current_trace', default=None)
- current_span_var = contextvars.ContextVar('current_span', default=None) # NEW: ContextVar for the active span name
+ current_span_var = contextvars.ContextVar('current_span', default=None) # ContextVar for the active span name
+ in_traced_function_var = contextvars.ContextVar('in_traced_function', default=False) # Track if we're in a traced function
 
  # Define type aliases for better code readability and maintainability
- ApiClient: TypeAlias = Union[OpenAI, Together, Anthropic, AsyncOpenAI, AsyncAnthropic, AsyncTogether] # Supported API clients
+ ApiClient: TypeAlias = Union[OpenAI, Together, Anthropic, AsyncOpenAI, AsyncAnthropic, AsyncTogether, genai.Client, genai.client.AsyncClient] # Supported API clients
  TraceEntryType = Literal['enter', 'exit', 'output', 'input', 'evaluation'] # Valid trace entry types
  SpanType = Literal['span', 'tool', 'llm', 'evaluation', 'chain']
  @dataclass
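
The new in_traced_function_var follows the standard contextvars re-entrancy-guard pattern: the flag is isolated per thread and per asyncio task, so concurrent traces cannot clobber each other's state. A minimal standalone sketch of the pattern (names here are illustrative, not judgeval APIs):

    import contextvars
    import functools

    _in_traced = contextvars.ContextVar('in_traced', default=False)

    def guard(func):
        """Run func's instrumentation at most once per call stack, per context."""
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if _in_traced.get():              # nested call: instrumentation already active
                return func(*args, **kwargs)
            token = _in_traced.set(True)      # mark this context as being traced
            try:
                return func(*args, **kwargs)  # real code would open a span here
            finally:
                _in_traced.reset(token)       # restore the previous flag value
        return wrapper
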
@@ -888,6 +891,13 @@ class TraceClient:
      "parent_trace_id": self.parent_trace_id,
      "parent_name": self.parent_name
  }
+ # --- Log trace data before saving ---
+ try:
+     rprint(f"[TraceClient.save] Saving trace data for trace_id {self.trace_id}:")
+     rprint(json.dumps(trace_data, indent=2))
+ except Exception as log_e:
+     rprint(f"[TraceClient.save] Error logging trace data: {log_e}")
+ # --- End logging ---
  self.trace_manager_client.save_trace(trace_data)
 
  return self.trace_id, trace_data
@@ -910,7 +920,8 @@ class Tracer:
  rules: Optional[List[Rule]] = None, # Added rules parameter
  organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
  enable_monitoring: bool = os.getenv("JUDGMENT_MONITORING", "true").lower() == "true",
- enable_evaluations: bool = os.getenv("JUDGMENT_EVALUATIONS", "true").lower() == "true"
+ enable_evaluations: bool = os.getenv("JUDGMENT_EVALUATIONS", "true").lower() == "true",
+ deep_tracing: bool = True # NEW: Enable deep tracing by default
  ):
  if not hasattr(self, 'initialized'):
      if not api_key:
@@ -927,6 +938,7 @@ class Tracer:
  self.initialized: bool = True
  self.enable_monitoring: bool = enable_monitoring
  self.enable_evaluations: bool = enable_evaluations
+ self.deep_tracing: bool = deep_tracing # NEW: Store deep tracing setting
  elif hasattr(self, 'project_name') and self.project_name != project_name:
      warnings.warn(
          f"Attempting to initialize Tracer with project_name='{project_name}' but it was already initialized with "
@@ -941,12 +953,52 @@ class Tracer:
  """
  current_trace_var.set(trace)
 
- def get_current_trace(self):
+ def get_current_trace(self) -> Optional[TraceClient]:
      """
      Get the current trace context from contextvars
      """
      return current_trace_var.get()
 
+ def _apply_deep_tracing(self, func, span_type="span"):
+     """
+     Apply deep tracing to all functions in the same module as the given function.
+
+     Args:
+         func: The function being traced
+         span_type: Type of span to use for traced functions
+
+     Returns:
+         A tuple of (module, original_functions_dict) where original_functions_dict
+         contains the original functions that were replaced with traced versions.
+     """
+     module = inspect.getmodule(func)
+     if not module:
+         return None, {}
+
+     # Save original functions
+     original_functions = {}
+
+     # Find all functions in the module
+     for name, obj in inspect.getmembers(module, inspect.isfunction):
+         # Skip already wrapped functions
+         if hasattr(obj, '_judgment_traced'):
+             continue
+
+         # Create a traced version of the function
+         # Always use default span type "span" for child functions
+         traced_func = _create_deep_tracing_wrapper(obj, self, "span")
+
+         # Mark the function as traced to avoid double wrapping
+         traced_func._judgment_traced = True
+
+         # Save the original function
+         original_functions[name] = obj
+
+         # Replace with traced version
+         setattr(module, name, traced_func)
+
+     return module, original_functions
+
  @contextmanager
  def trace(
      self,
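
_apply_deep_tracing is plain module-level monkey patching: save each function object, swap in a wrapped version, and (in the observe wrappers further below) put the originals back after the call returns. The save/patch/restore cycle, reduced to a standalone sketch with a generic wrap factory standing in for _create_deep_tracing_wrapper:

    import inspect
    import types
    from typing import Callable, Dict

    def patch_module(module: types.ModuleType,
                     wrap: Callable[[Callable], Callable]) -> Dict[str, Callable]:
        """Replace every module-level function with wrap(func); return the originals."""
        originals: Dict[str, Callable] = {}
        for name, obj in inspect.getmembers(module, inspect.isfunction):
            if getattr(obj, '_wrapped', False):   # skip functions already patched
                continue
            traced = wrap(obj)
            traced._wrapped = True                # guard against double wrapping
            originals[name] = obj
            setattr(module, name, traced)
        return originals

    def restore_module(module: types.ModuleType, originals: Dict[str, Callable]) -> None:
        """Undo patch_module by reinstating the saved function objects."""
        for name, obj in originals.items():
            setattr(module, name, obj)

One limitation worth noting: only names resolved through the module's globals at call time are intercepted; references captured before patching (aliases, callbacks passed elsewhere) still point at the originals.
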
@@ -992,14 +1044,8 @@ class Tracer:
  finally:
      # Reset the context variable
      current_trace_var.reset(token)
-
- def get_current_trace(self) -> Optional[TraceClient]:
-     """
-     Get the current trace context from contextvars
-     """
-     return current_trace_var.get()
-
- def observe(self, func=None, *, name=None, span_type: SpanType = "span", project_name: str = None, overwrite: bool = False):
+
+ def observe(self, func=None, *, name=None, span_type: SpanType = "span", project_name: str = None, overwrite: bool = False, deep_tracing: bool = None):
  """
  Decorator to trace function execution with detailed entry/exit information.
 
@@ -1009,20 +1055,37 @@ class Tracer:
  span_type: Type of span (default "span")
  project_name: Optional project name override
  overwrite: Whether to overwrite existing traces
+ deep_tracing: Whether to enable deep tracing for this function and all nested calls.
+     If None, uses the tracer's default setting.
  """
  # If monitoring is disabled, return the function as is
  if not self.enable_monitoring:
      return func if func else lambda f: f
 
  if func is None:
-     return lambda f: self.observe(f, name=name, span_type=span_type, project_name=project_name, overwrite=overwrite)
+     return lambda f: self.observe(f, name=name, span_type=span_type, project_name=project_name,
+                                   overwrite=overwrite, deep_tracing=deep_tracing)
 
  # Use provided name or fall back to function name
  span_name = name or func.__name__
 
+ # Store custom attributes on the function object
+ func._judgment_span_name = span_name
+ func._judgment_span_type = span_type
+
+ # Use the provided deep_tracing value or fall back to the tracer's default
+ use_deep_tracing = deep_tracing if deep_tracing is not None else self.deep_tracing
+
  if asyncio.iscoroutinefunction(func):
      @functools.wraps(func)
      async def async_wrapper(*args, **kwargs):
+         # Check if we're already in a traced function
+         if in_traced_function_var.get():
+             return await func(*args, **kwargs)
+
+         # Set in_traced_function_var to True
+         token = in_traced_function_var.set(True)
+
          # Get current trace from context
          current_trace = current_trace_var.get()
 
@@ -1057,9 +1120,18 @@ class Tracer:
      'kwargs': kwargs
  })
 
+ # If deep tracing is enabled, apply monkey patching
+ if use_deep_tracing:
+     module, original_functions = self._apply_deep_tracing(func, span_type)
+
  # Execute function
  result = await func(*args, **kwargs)
 
+ # Restore original functions if deep tracing was enabled
+ if use_deep_tracing and module and 'original_functions' in locals():
+     for name, obj in original_functions.items():
+         setattr(module, name, obj)
+
  # Record output
  span.record_output(result)
 
@@ -1069,29 +1141,52 @@ class Tracer:
  finally:
      # Reset trace context (span context resets automatically)
      current_trace_var.reset(trace_token)
+     # Reset in_traced_function_var
+     in_traced_function_var.reset(token)
  else:
      # Already have a trace context, just create a span in it
      # The span method handles current_span_var
-     with current_trace.span(span_name, span_type=span_type) as span: # MODIFIED: Use span_name directly
-         # Record inputs
-         span.record_input({
-             'args': str(args),
-             'kwargs': kwargs
-         })
-
-         # Execute function
-         result = await func(*args, **kwargs)
-
-         # Record output
-         span.record_output(result)
+
+     try:
+         with current_trace.span(span_name, span_type=span_type) as span: # MODIFIED: Use span_name directly
+             # Record inputs
+             span.record_input({
+                 'args': str(args),
+                 'kwargs': kwargs
+             })
+
+             # If deep tracing is enabled, apply monkey patching
+             if use_deep_tracing:
+                 module, original_functions = self._apply_deep_tracing(func, span_type)
+
+             # Execute function
+             result = await func(*args, **kwargs)
+
+             # Restore original functions if deep tracing was enabled
+             if use_deep_tracing and module and 'original_functions' in locals():
+                 for name, obj in original_functions.items():
+                     setattr(module, name, obj)
+
+             # Record output
+             span.record_output(result)
 
          return result
-
+     finally:
+         # Reset in_traced_function_var
+         in_traced_function_var.reset(token)
+
  return async_wrapper
  else:
-     # Non-async function implementation remains unchanged
+     # Non-async function implementation with deep tracing
      @functools.wraps(func)
      def wrapper(*args, **kwargs):
+         # Check if we're already in a traced function
+         if in_traced_function_var.get():
+             return func(*args, **kwargs)
+
+         # Set in_traced_function_var to True
+         token = in_traced_function_var.set(True)
+
          # Get current trace from context
          current_trace = current_trace_var.get()
 
@@ -1126,9 +1221,18 @@ class Tracer:
      'kwargs': kwargs
  })
 
+ # If deep tracing is enabled, apply monkey patching
+ if use_deep_tracing:
+     module, original_functions = self._apply_deep_tracing(func, span_type)
+
  # Execute function
  result = func(*args, **kwargs)
 
+ # Restore original functions if deep tracing was enabled
+ if use_deep_tracing and module and 'original_functions' in locals():
+     for name, obj in original_functions.items():
+         setattr(module, name, obj)
+
  # Record output
  span.record_output(result)
 
@@ -1138,24 +1242,40 @@ class Tracer:
  finally:
      # Reset trace context (span context resets automatically)
      current_trace_var.reset(trace_token)
+     # Reset in_traced_function_var
+     in_traced_function_var.reset(token)
  else:
      # Already have a trace context, just create a span in it
      # The span method handles current_span_var
-     with current_trace.span(span_name, span_type=span_type) as span: # MODIFIED: Use span_name directly
-         # Record inputs
-         span.record_input({
-             'args': str(args),
-             'kwargs': kwargs
-         })
-
-         # Execute function
-         result = func(*args, **kwargs)
-
-         # Record output
-         span.record_output(result)
+
+     try:
+         with current_trace.span(span_name, span_type=span_type) as span: # MODIFIED: Use span_name directly
+             # Record inputs
+             span.record_input({
+                 'args': str(args),
+                 'kwargs': kwargs
+             })
+
+             # If deep tracing is enabled, apply monkey patching
+             if use_deep_tracing:
+                 module, original_functions = self._apply_deep_tracing(func, span_type)
+
+             # Execute function
+             result = func(*args, **kwargs)
+
+             # Restore original functions if deep tracing was enabled
+             if use_deep_tracing and module and 'original_functions' in locals():
+                 for name, obj in original_functions.items():
+                     setattr(module, name, obj)
+
+             # Record output
+             span.record_output(result)
 
          return result
-
+     finally:
+         # Reset in_traced_function_var
+         in_traced_function_var.reset(token)
+
  return wrapper
 
  def score(self, func=None, scorers: List[Union[APIJudgmentScorer, JudgevalScorer]] = None, model: str = None, log_results: bool = True, *, name: str = None, span_type: SpanType = "span"):
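
Taken together, these decorator changes make deep tracing opt-out per function. A hedged usage sketch (project name and functions are illustrative; assumes JUDGMENT_API_KEY and JUDGMENT_ORG_ID are set in the environment):

    from judgeval.common.tracer import Tracer

    judgment = Tracer(project_name="my_project")   # deep_tracing defaults to True

    def lookup(city: str) -> str:
        # Undecorated module-level helper: with deep tracing on, it is
        # temporarily patched and recorded as a child span automatically.
        return f"weather in {city}"

    @judgment.observe(span_type="tool")
    def get_weather(city: str) -> str:
        return lookup(city)                        # lookup appears in the trace

    @judgment.observe(span_type="tool", deep_tracing=False)
    def get_weather_shallow(city: str) -> str:
        return lookup(city)                        # lookup is not traced here
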
@@ -1206,7 +1326,7 @@ def wrap(client: Any) -> Any:
  span_name, original_create = _get_client_config(client)
 
  # Handle async clients differently than synchronous clients (need an async function for async clients)
- if (isinstance(client, (AsyncOpenAI, AsyncAnthropic, AsyncTogether))):
+ if (isinstance(client, (AsyncOpenAI, AsyncAnthropic, AsyncTogether, genai.client.AsyncClient))):
      async def traced_create(*args, **kwargs):
          # Get the current trace from contextvars
          current_trace = current_trace_var.get()
@@ -1265,6 +1385,8 @@ def wrap(client: Any) -> Any:
      client.chat.completions.create = traced_create
  elif isinstance(client, (Anthropic, AsyncAnthropic)):
      client.messages.create = traced_create
+ elif isinstance(client, (genai.Client, genai.client.AsyncClient)):
+     client.models.generate_content = traced_create
 
  return client
 
@@ -1290,6 +1412,8 @@ def _get_client_config(client: ApiClient) -> tuple[str, callable]:
      return "TOGETHER_API_CALL", client.chat.completions.create
  elif isinstance(client, (Anthropic, AsyncAnthropic)):
      return "ANTHROPIC_API_CALL", client.messages.create
+ elif isinstance(client, (genai.Client, genai.client.AsyncClient)):
+     return "GOOGLE_API_CALL", client.models.generate_content
  raise ValueError(f"Unsupported client type: {type(client)}")
 
  def _format_input_data(client: ApiClient, **kwargs) -> dict:
@@ -1303,6 +1427,11 @@ def _format_input_data(client: ApiClient, **kwargs) -> dict:
      "model": kwargs.get("model"),
      "messages": kwargs.get("messages"),
  }
+ elif isinstance(client, (genai.Client, genai.client.AsyncClient)):
+     return {
+         "model": kwargs.get("model"),
+         "contents": kwargs.get("contents")
+     }
  # Anthropic requires additional max_tokens parameter
  return {
      "model": kwargs.get("model"),
@@ -1330,6 +1459,15 @@ def _format_output_data(client: ApiClient, response: Any) -> dict:
          "total_tokens": response.usage.total_tokens
      }
  }
+ elif isinstance(client, (genai.Client, genai.client.AsyncClient)):
+     return {
+         "content": response.candidates[0].content.parts[0].text,
+         "usage": {
+             "prompt_tokens": response.usage_metadata.prompt_token_count,
+             "completion_tokens": response.usage_metadata.candidates_token_count,
+             "total_tokens": response.usage_metadata.total_token_count
+         }
+     }
  # Anthropic has a different response structure
  return {
      "content": response.content[0].text,
@@ -1340,29 +1478,88 @@ def _format_output_data(client: ApiClient, response: Any) -> dict:
      }
  }
 
- # Add a global context-preserving gather function
- # async def trace_gather(*coroutines, return_exceptions=False): # REMOVED
- #     """ # REMOVED
- #     A wrapper around asyncio.gather that ensures the trace context # REMOVED
- #     is available within the gathered coroutines using contextvars.copy_context. # REMOVED
- #     """ # REMOVED
- #     # Get the original asyncio.gather (if we patched it) # REMOVED
- #     original_gather = getattr(asyncio, "_original_gather", asyncio.gather) # REMOVED
- #     # REMOVED
- #     # Use contextvars.copy_context() to ensure context propagation # REMOVED
- #     ctx = contextvars.copy_context() # REMOVED
- #     # REMOVED
- #     # Wrap the gather call within the copied context # REMOVED
- #     return await ctx.run(original_gather, *coroutines, return_exceptions=return_exceptions) # REMOVED
-
- # Store the original gather and apply the patch *once*
- # global _original_gather_stored # REMOVED
- # if not globals().get('_original_gather_stored'): # REMOVED
- #     # Check if asyncio.gather is already our wrapper to prevent double patching # REMOVED
- #     if asyncio.gather.__name__ != 'trace_gather': # REMOVED
- #         asyncio._original_gather = asyncio.gather # REMOVED
- #         asyncio.gather = trace_gather # REMOVED
- #         _original_gather_stored = True # REMOVED
+ # Add a new function for deep tracing at the module level
+ def _create_deep_tracing_wrapper(func, tracer, span_type="span"):
+     """
+     Creates a wrapper for a function that automatically traces it when called within a traced function.
+     This enables deep tracing without requiring explicit @observe decorators on every function.
+
+     Args:
+         func: The function to wrap
+         tracer: The Tracer instance
+         span_type: Type of span (default "span")
+
+     Returns:
+         A wrapped function that will be traced when called
+     """
+     # Skip wrapping if the function is not callable or is a built-in
+     if not callable(func) or isinstance(func, type) or func.__module__ == 'builtins':
+         return func
+
+     # Get function name for the span - check for custom name set by @observe
+     func_name = getattr(func, '_judgment_span_name', func.__name__)
+
+     # Check for custom span_type set by @observe
+     func_span_type = getattr(func, '_judgment_span_type', "span")
+
+     # Store original function to prevent losing reference
+     original_func = func
+
+     # Create appropriate wrapper based on whether the function is async or not
+     if asyncio.iscoroutinefunction(func):
+         @functools.wraps(func)
+         async def async_deep_wrapper(*args, **kwargs):
+             # Get current trace from context
+             current_trace = current_trace_var.get()
+
+             # If no trace context, just call the function
+             if not current_trace:
+                 return await original_func(*args, **kwargs)
+
+             # Create a span for this function call - use custom span_type if available
+             with current_trace.span(func_name, span_type=func_span_type) as span:
+                 # Record inputs
+                 span.record_input({
+                     'args': str(args),
+                     'kwargs': kwargs
+                 })
+
+                 # Execute function
+                 result = await original_func(*args, **kwargs)
+
+                 # Record output
+                 span.record_output(result)
+
+             return result
+
+         return async_deep_wrapper
+     else:
+         @functools.wraps(func)
+         def deep_wrapper(*args, **kwargs):
+             # Get current trace from context
+             current_trace = current_trace_var.get()
+
+             # If no trace context, just call the function
+             if not current_trace:
+                 return original_func(*args, **kwargs)
+
+             # Create a span for this function call - use custom span_type if available
+             with current_trace.span(func_name, span_type=func_span_type) as span:
+                 # Record inputs
+                 span.record_input({
+                     'args': str(args),
+                     'kwargs': kwargs
+                 })
+
+                 # Execute function
+                 result = original_func(*args, **kwargs)
+
+                 # Record output
+                 span.record_output(result)
+
+             return result
+
+         return deep_wrapper
 
  # Add the new TraceThreadPoolExecutor class
  class TraceThreadPoolExecutor(concurrent.futures.ThreadPoolExecutor):
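
_create_deep_tracing_wrapper cooperates with observe through ordinary function attributes: observe stamps _judgment_span_name and _judgment_span_type onto the decorated function, and the deep-tracing wrapper reads them back with getattr defaults. The pattern in isolation (attribute names here are illustrative):

    import functools

    def tag(name=None, span_type="span"):
        """Decorator factory that stamps metadata, as observe() does."""
        def deco(func):
            func._span_name = name or func.__name__
            func._span_type = span_type
            return func
        return deco

    def wrap_with_metadata(func):
        """A later wrapper recovers the metadata without importing the decorator."""
        span_name = getattr(func, '_span_name', func.__name__)
        span_type = getattr(func, '_span_type', "span")

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            print(f"entering {span_name} ({span_type})")   # stand-in for span creation
            return func(*args, **kwargs)
        return wrapper
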
judgeval/constants.py CHANGED
@@ -43,7 +43,7 @@ JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
  JUDGMENT_SEQUENCE_EVAL_API_URL = f"{ROOT_API}/evaluate_sequence/"
  JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
  JUDGMENT_DATASETS_APPEND_API_URL = f"{ROOT_API}/datasets/insert_examples/"
- JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull/"
+ JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull_for_judgeval/"
  JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
  JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
  JUDGMENT_DATASETS_PROJECT_STATS_API_URL = f"{ROOT_API}/datasets/fetch_stats_by_project/"
judgeval/data/datasets/dataset.py CHANGED
@@ -7,12 +7,13 @@ import yaml
  from dataclasses import dataclass, field
  from typing import List, Union, Literal
 
- from judgeval.data import Example
+ from judgeval.data import Example, Sequence
  from judgeval.common.logger import debug, error, warning, info
 
  @dataclass
  class EvalDataset:
      examples: List[Example]
+     sequences: List[Sequence]
      _alias: Union[str, None] = field(default=None)
      _id: Union[str, None] = field(default=None)
      judgment_api_key: str = field(default="")
@@ -21,11 +22,13 @@ class EvalDataset:
  judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"),
  organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
  examples: List[Example] = [],
+ sequences: List[Sequence] = []
  ):
  debug(f"Initializing EvalDataset with {len(examples)} examples")
  if not judgment_api_key:
      warning("No judgment_api_key provided")
  self.examples = examples
+ self.sequences = sequences
  self._alias = None
  self._id = None
  self.judgment_api_key = judgment_api_key
@@ -309,6 +312,7 @@ class EvalDataset:
  return (
      f"{self.__class__.__name__}("
      f"examples={self.examples}, "
+     f"sequences={self.sequences}, "
      f"_alias={self._alias}, "
      f"_id={self._id}"
      f")"
judgeval/data/datasets/eval_dataset_client.py CHANGED
@@ -13,7 +13,7 @@ from judgeval.constants import (
      JUDGMENT_DATASETS_INSERT_API_URL,
      JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
  )
- from judgeval.data import Example
+ from judgeval.data import Example, Sequence
  from judgeval.data.datasets import EvalDataset
 
 
@@ -201,8 +201,8 @@ class EvalDatasetClient:
 
  info(f"Successfully pulled dataset with alias '{alias}'")
  payload = response.json()
-
  dataset.examples = [Example(**e) for e in payload.get("examples", [])]
+ dataset.sequences = [Sequence(**s) for s in payload.get("sequences", [])]
  dataset._alias = payload.get("alias")
  dataset._id = payload.get("id")
  progress.update(
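
With sequences now a first-class dataset field, a pulled payload round-trips both lists. Constructing a dataset locally might look like this (Example fields are illustrative; the exact required fields are defined elsewhere in judgeval.data):

    from judgeval.data import Example, Sequence
    from judgeval.data.datasets import EvalDataset

    example = Example(input="What is 2 + 2?", actual_output="4")   # illustrative fields
    sequence = Sequence(items=[example])

    dataset = EvalDataset(examples=[example], sequences=[sequence])
    print(dataset)   # repr now reports sequences=... alongside examples=...

Note that both examples and sequences use mutable list defaults in the __init__ signature, so passing explicit lists is safer than relying on the shared defaults.
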
judgeval/data/sequence.py CHANGED
@@ -16,6 +16,9 @@ class Sequence(BaseModel):
  scorers: Optional[Any] = None
  parent_sequence_id: Optional[str] = None
  sequence_order: Optional[int] = 0
+ root_sequence_id: Optional[str] = None
+ inputs: Optional[str] = None
+ output: Optional[str] = None
 
  @field_validator("scorers")
  def validate_scorer(cls, v):
@@ -30,28 +33,21 @@ class Sequence(BaseModel):
          raise ValueError(f"Failed to load implementation for scorer {scorer}: {str(e)}")
      return loaded_scorers
 
- @model_validator(mode='after')
- def set_parent_sequence_ids(self) -> "Sequence":
-     """Recursively set the parent_sequence_id for all nested Sequences."""
-     for item in self.items:
-         if isinstance(item, Sequence):
-             item.parent_sequence_id = self.sequence_id
-             # Recurse into deeper nested sequences
-             item.set_parent_sequence_ids()
-     return self
+ @model_validator(mode="after")
+ def populate_sequence_metadata(self) -> "Sequence":
+     """Recursively set parent_sequence_id, root_sequence_id, and sequence_order."""
+     # If root_sequence_id isn't already set, assign it to self
+     if self.root_sequence_id is None:
+         self.root_sequence_id = self.sequence_id
 
- @model_validator(mode='after')
- def set_parent_and_order(self) -> "Sequence":
-     """Set parent_sequence_id and sequence_order for all items."""
      for idx, item in enumerate(self.items):
-         # Set sequence_order for both Example and Sequence objects
          item.sequence_order = idx
-
          if isinstance(item, Sequence):
              item.parent_sequence_id = self.sequence_id
-             item.set_parent_and_order() # Recurse for nested sequences
+             item.root_sequence_id = self.root_sequence_id
+             item.populate_sequence_metadata()
      return self
-
+
  class Config:
      arbitrary_types_allowed = True
 
judgeval/data/sequence_run.py CHANGED
@@ -21,6 +21,7 @@ class SequenceRun(BaseModel):
  metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
  judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
  rules (Optional[List[Rule]]): Rules to evaluate against scoring results
+ append (Optional[bool]): Whether to append to existing evaluation results
  """
 
  # The user will specify whether they want log_results when they call run_eval
@@ -33,6 +34,7 @@ class SequenceRun(BaseModel):
  aggregator: Optional[str] = None
  metadata: Optional[Dict[str, Any]] = None
  trace_span_id: Optional[str] = None
+ append: Optional[bool] = False
  # API Key will be "" until user calls client.run_eval(), then API Key will be set
  judgment_api_key: Optional[str] = ""
  override: Optional[bool] = False
judgeval/judgment_client.py CHANGED
@@ -93,16 +93,47 @@ class JudgmentClient(metaclass=SingletonMeta):
      self,
      sequences: List[Sequence],
      model: Union[str, List[str], JudgevalJudge],
+     scorers: List[Union[ScorerWrapper, JudgevalScorer]],
      aggregator: Optional[str] = None,
      project_name: str = "default_project",
      eval_run_name: str = "default_eval_sequence",
      use_judgment: bool = True,
      log_results: bool = True,
+     append: bool = False,
      override: bool = False,
      ignore_errors: bool = True,
      rules: Optional[List[Rule]] = None
  ) -> List[ScoringResult]:
  try:
+     loaded_scorers = []
+     for scorer in scorers:
+         try:
+             if isinstance(scorer, ScorerWrapper):
+                 loaded_scorers.append(scorer.load_implementation())
+             else:
+                 loaded_scorers.append(scorer)
+         except Exception as e:
+             raise ValueError(f"Failed to load implementation for scorer {scorer}: {str(e)}")
+
+     def get_all_sequences(root: Sequence) -> List[Sequence]:
+         all_sequences = [root]
+
+         for item in root.items:
+             if isinstance(item, Sequence):
+                 all_sequences.extend(get_all_sequences(item))
+
+         return all_sequences
+
+     def flatten_sequence_list(sequences: List[Sequence]) -> List[Sequence]:
+         flattened = []
+         for seq in sequences:
+             flattened.extend(get_all_sequences(seq))
+         return flattened
+
+     flattened_sequences = flatten_sequence_list(sequences)
+     for sequence in flattened_sequences:
+         sequence.scorers = loaded_scorers
+
      if rules:
          loaded_rules = []
          for rule in rules:
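
get_all_sequences walks each sequence depth-first with the parent emitted before its children, and flatten_sequence_list concatenates those walks, so every nested sequence receives the loaded scorers exactly once. Schematically (assuming minimal Sequence constructions as above):

    leaf = Sequence(items=[])
    mid = Sequence(items=[leaf])
    top = Sequence(items=[mid])

    # Internally, flatten_sequence_list([top]) yields [top, mid, leaf]; the loop
    # that follows assigns .scorers = loaded_scorers to all three, not just top.
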
@@ -134,10 +165,10 @@ class JudgmentClient(metaclass=SingletonMeta):
          model=model,
          aggregator=aggregator,
          log_results=log_results,
+         append=append,
          judgment_api_key=self.judgment_api_key,
          organization_id=self.organization_id
      )
-
      return run_sequence_eval(sequence_run, override, ignore_errors, use_judgment)
  except ValueError as e:
      raise ValueError(f"Please check your SequenceRun object, one or more fields are invalid: \n{str(e)}")
@@ -244,98 +275,6 @@ class JudgmentClient(metaclass=SingletonMeta):
          raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
      except Exception as e:
          raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
-
- def evaluate_dataset(
-     self,
-     dataset: EvalDataset,
-     scorers: List[Union[ScorerWrapper, JudgevalScorer]],
-     model: Union[str, List[str], JudgevalJudge],
-     aggregator: Optional[str] = None,
-     metadata: Optional[Dict[str, Any]] = None,
-     project_name: str = "",
-     eval_run_name: str = "",
-     log_results: bool = True,
-     use_judgment: bool = True,
-     rules: Optional[List[Rule]] = None
- ) -> List[ScoringResult]:
-     """
-     Executes an evaluation of a `EvalDataset` using one or more `Scorer`s
-
-     Args:
-         dataset (EvalDataset): The dataset containing examples to evaluate
-         scorers (List[Union[ScorerWrapper, JudgevalScorer]]): A list of scorers to use for evaluation
-         model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
-         aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
-         metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run
-         project_name (str): The name of the project the evaluation results belong to
-         eval_run_name (str): A name for this evaluation run
-         log_results (bool): Whether to log the results to the Judgment API
-         use_judgment (bool): Whether to use Judgment API for evaluation
-         rules (Optional[List[Rule]]): Rules to evaluate against scoring results
-
-     Returns:
-         List[ScoringResult]: The results of the evaluation
-     """
-     try:
-         # Load appropriate implementations for all scorers
-         loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = []
-         for scorer in scorers:
-             try:
-                 if isinstance(scorer, ScorerWrapper):
-                     loaded_scorers.append(scorer.load_implementation(use_judgment=use_judgment))
-                 else:
-                     loaded_scorers.append(scorer)
-             except Exception as e:
-                 raise ValueError(f"Failed to load implementation for scorer {scorer}: {str(e)}")
-
-         # Prevent using JudgevalScorer with rules - only APIJudgmentScorer allowed with rules
-         if rules and any(isinstance(scorer, JudgevalScorer) for scorer in loaded_scorers):
-             raise ValueError("Cannot use Judgeval scorers (only API scorers) when using rules. Please either remove rules or use only APIJudgmentScorer types.")
-
-         # Convert ScorerWrapper in rules to their implementations
-         loaded_rules = None
-         if rules:
-             loaded_rules = []
-             for rule in rules:
-                 try:
-                     processed_conditions = []
-                     for condition in rule.conditions:
-                         # Convert metric if it's a ScorerWrapper
-                         if isinstance(condition.metric, ScorerWrapper):
-                             try:
-                                 condition_copy = condition.model_copy()
-                                 condition_copy.metric = condition.metric.load_implementation(use_judgment=use_judgment)
-                                 processed_conditions.append(condition_copy)
-                             except Exception as e:
-                                 raise ValueError(f"Failed to convert ScorerWrapper to implementation in rule '{rule.name}', condition metric '{condition.metric}': {str(e)}")
-                         else:
-                             processed_conditions.append(condition)
-
-                     # Create new rule with processed conditions
-                     new_rule = rule.model_copy()
-                     new_rule.conditions = processed_conditions
-                     loaded_rules.append(new_rule)
-                 except Exception as e:
-                     raise ValueError(f"Failed to process rule '{rule.name}': {str(e)}")
-
-         evaluation_run = EvaluationRun(
-             log_results=log_results,
-             project_name=project_name,
-             eval_name=eval_run_name,
-             examples=dataset.examples,
-             scorers=loaded_scorers,
-             model=model,
-             aggregator=aggregator,
-             metadata=metadata,
-             judgment_api_key=self.judgment_api_key,
-             rules=loaded_rules,
-             organization_id=self.organization_id
-         )
-         return run_eval(evaluation_run)
-     except ValueError as e:
-         raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
-     except Exception as e:
-         raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
 
  def create_dataset(self) -> EvalDataset:
      return self.eval_dataset_client.create_dataset()
judgeval/run_evaluation.py CHANGED
@@ -336,7 +336,7 @@ def check_examples(examples: List[Example], scorers: List[APIJudgmentScorer]) ->
 
  def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_errors: bool = True, async_execution: bool = False) -> List[ScoringResult]:
      # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-     if not override and sequence_run.log_results:
+     if not override and sequence_run.log_results and not sequence_run.append:
          check_eval_run_name_exists(
              sequence_run.eval_name,
              sequence_run.project_name,
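
The new append flag flows from JudgmentClient.run_sequence_eval through SequenceRun down to this guard, so appending to an existing run skips the duplicate-name check instead of raising. A hedged call sketch (scorer class and model are illustrative; assumes JUDGMENT_API_KEY and JUDGMENT_ORG_ID are set):

    from judgeval import JudgmentClient
    from judgeval.scorers import FaithfulnessScorer   # assumed available API scorer

    client = JudgmentClient()
    results = client.run_sequence_eval(
        sequences=[sequence],                # e.g. the Sequence built earlier
        model="gpt-4o-mini",                 # judge model; value illustrative
        scorers=[FaithfulnessScorer(threshold=0.5)],
        eval_run_name="nightly_sequences",
        append=True,                         # add to the existing run of this name
    )
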
judgeval/version_check.py ADDED
@@ -0,0 +1,22 @@
+ import importlib.metadata
+ import requests
+ import threading
+
+ def check_latest_version(package_name: str = "judgeval"):
+     def _check():
+         try:
+             current_version = importlib.metadata.version(package_name)
+             response = requests.get(f"https://pypi.org/pypi/{package_name}/json", timeout=2)
+             latest_version = response.json()["info"]["version"]
+
+             if current_version != latest_version:
+                 print(
+                     f"\033[93mUPDATE AVAILABLE:\033[0m You are using '{package_name}=={current_version}', "
+                     f"but the latest version is '{latest_version}'. While this version is still supported, "
+                     f"we recommend upgrading to avoid potential issues or missing features: "
+                     f"`pip install --upgrade {package_name}`"
+                 )
+         except Exception:
+             pass
+
+     threading.Thread(target=_check, daemon=True).start()
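
The check is deliberately fail-soft: it runs on a daemon thread so it can neither delay nor outlive the process, caps the PyPI request at two seconds, and swallows every exception, so an offline machine imports judgeval without warnings or errors. The helper also works for any installed distribution:

    from judgeval.version_check import check_latest_version

    check_latest_version()             # background check for judgeval itself
    check_latest_version("requests")   # package_name may be any installed package
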
judgeval-0.0.32.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: judgeval
- Version: 0.0.31
+ Version: 0.0.32
  Summary: Judgeval Package
  Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
  Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
judgeval-0.0.32.dist-info/RECORD CHANGED
@@ -1,25 +1,26 @@
- judgeval/__init__.py,sha256=dtXxsCmI4eEsZdGSUMy8P_pA0bc2-OSGAgb2C__yJoA,252
+ judgeval/__init__.py,sha256=x9HWt4waJwJMAqTuJSg2MezF9Zg-macEjeU-ajbly-8,330
  judgeval/clients.py,sha256=6VQmEqmfCngUdS2MuPBIpHvtDFqOENm8-_BmMvjLyRQ,944
- judgeval/constants.py,sha256=XTqijsuuLEhUBXTjzNJVsee5U_Gl14ULLO5uQVW_nEE,5398
+ judgeval/constants.py,sha256=_XmVAkebMyGrDvvanAVlMgVd4p6MLHdEVsTQFI0kz1k,5411
  judgeval/evaluation_run.py,sha256=WGzx-Ug2qhSmunFo8NrmSstBRsOUc5KpKq0Lc51rqsM,6739
- judgeval/judgment_client.py,sha256=FncHkjyFx2vfXv4cu4DzbOO0ideHNOWtHVbc8pSXNxk,29754
+ judgeval/judgment_client.py,sha256=k0q2s5A0RkhF9ElD9o-KWN10H36t3Of2PrvNF-silf8,26141
  judgeval/rules.py,sha256=B0ZL0pn72D4Jnlr0zMQ6CPHi7D8AQQRariXCVsiCMiI,20542
- judgeval/run_evaluation.py,sha256=2Mv1iLthJeFQZSVhjLOcJKRZ52Sy6OxLb2KyQ_yVwnA,28484
+ judgeval/run_evaluation.py,sha256=hnEY8QckEviXYNJutf-6tLFq2DWCzqWV1EVyPvrVXyA,28512
+ judgeval/version_check.py,sha256=bvJEidB7rAeXozoUbN9Yb97QOR_s2hgvpvj74jJ5HlY,943
  judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
  judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
  judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
- judgeval/common/tracer.py,sha256=9Qga-7rLFlQK-oM5eK1O_8Mn1SewIrPtFwWbSZFtSII,59651
+ judgeval/common/tracer.py,sha256=owRRfIZXPUOVCCn0macygnf18mcp8am1eULGnZXD0Kk,68876
  judgeval/common/utils.py,sha256=LUQV5JfDr6wj7xHAJoNq-gofNZ6mjXbeKrGKzBME1KM,33533
  judgeval/data/__init__.py,sha256=xuKx_KCVHGp6CXvQuVmKl3v7pJp-qDaz0NccKxwjtO0,481
  judgeval/data/custom_example.py,sha256=QRBqiRiZS8UgVeTRHY0r1Jzm6yAYsyg6zmHxQGxdiQs,739
  judgeval/data/example.py,sha256=cJrmPGLel_P2sy1UaRvuVSAi35EnA9XMR11Lhp4aDLo,5930
  judgeval/data/result.py,sha256=Gb9tiSDsk1amXgh0cFG6JmlW_BMKxS2kuTwNA0rrHjA,3184
  judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,3273
- judgeval/data/sequence.py,sha256=DlQUjyWQJB6iNmiftDZ9N6C-nPtrOC1e0JZ57U00zZk,2387
- judgeval/data/sequence_run.py,sha256=GrnYSZBcZmt4tKQYA_1v09MFB8n3ccrkOJd4qyweHMg,1987
+ judgeval/data/sequence.py,sha256=Fkk2HJGnPboH-Fvwgxub_ryG0eUXa3cbsj7ZD0qkeBo,2204
+ judgeval/data/sequence_run.py,sha256=RmYjfWKMWg-pcF5PLeiWfrhuDkjDZi5VEmAIEXN3Ib0,2104
  judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
- judgeval/data/datasets/dataset.py,sha256=AFYjksV_wXx5CqFYJsl3aN8yZ6hC50O1myRuOJ8s8_E,12867
- judgeval/data/datasets/eval_dataset_client.py,sha256=xzXlBJRBEEmwsB79_eepm0Da-Bz8yRodX7ttk-u-BxU,14986
+ judgeval/data/datasets/dataset.py,sha256=dhLo30hvpmmOK2R6O5wDs_neawUJ4lS8bb4S42SufNQ,13034
+ judgeval/data/datasets/eval_dataset_client.py,sha256=xjj66BO9Es9IxXqzQe1RT_e0kpeKlt7OrhRoSuj4KHM,15085
  judgeval/integrations/langgraph.py,sha256=J-cQfFP52TjJewdSTe-fcsUC4HDvjNbXoxmbmF0SgiE,11743
  judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
  judgeval/judges/base_judge.py,sha256=ch_S7uBB7lyv44Lf1d7mIGFpveOO58zOkkpImKgd9_4,994
@@ -90,7 +91,7 @@ judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py
  judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py,sha256=Qk7lwHgRPYeGoxTOyclAh1VfGItfvHJ6l1t7Nk3SWFM,20927
  judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
  judgeval/utils/alerts.py,sha256=O19Xj7DA0YVjl8PWiuH4zfdZeu3yiLVvHfY8ah2wG0g,2759
- judgeval-0.0.31.dist-info/METADATA,sha256=g9288fIE7NDwXuqUylqCV0mby5hAY7yEztR8TOn5sNk,5418
- judgeval-0.0.31.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- judgeval-0.0.31.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
- judgeval-0.0.31.dist-info/RECORD,,
+ judgeval-0.0.32.dist-info/METADATA,sha256=RJzqlHJwfYiOXEcyEEO5WQBM0DC1zQDuoN-Plix6U38,5418
+ judgeval-0.0.32.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ judgeval-0.0.32.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+ judgeval-0.0.32.dist-info/RECORD,,