judgeval 0.0.40__py3-none-any.whl → 0.0.41__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in the public registry.
judgeval/common/tracer.py CHANGED
@@ -5,7 +5,6 @@ Tracing system for judgeval that allows for function tracing using decorators.
  import asyncio
  import functools
  import inspect
- import json
  import os
  import site
  import sysconfig
@@ -16,6 +15,7 @@ import uuid
  import warnings
  import contextvars
  import sys
+ import json
  from contextlib import contextmanager, asynccontextmanager, AbstractAsyncContextManager, AbstractContextManager # Import context manager bases
  from dataclasses import dataclass, field
  from datetime import datetime
@@ -29,20 +29,16 @@ from typing import (
  Literal,
  Optional,
  Tuple,
- Type,
- TypeVar,
  Union,
  AsyncGenerator,
  TypeAlias,
- Set
  )
  from rich import print as rprint
- import types # <--- Add this import
+ import types

  # Third-party imports
  import requests
  from litellm import cost_per_token as _original_cost_per_token
- from pydantic import BaseModel
  from rich import print as rprint
  from openai import OpenAI, AsyncOpenAI
  from together import Together, AsyncTogether
@@ -64,8 +60,7 @@ from judgeval.data import Example, Trace, TraceSpan, TraceUsage
  from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
  from judgeval.rules import Rule
  from judgeval.evaluation_run import EvaluationRun
- from judgeval.data.result import ScoringResult
- from judgeval.common.utils import validate_api_key
+ from judgeval.common.utils import ExcInfo, validate_api_key
  from judgeval.common.exceptions import JudgmentAPIError

  # Standard library imports needed for the new class
@@ -307,7 +302,7 @@ class TraceClient:
  tracer: Optional["Tracer"],
  trace_id: Optional[str] = None,
  name: str = "default",
- project_name: str = "default_project",
+ project_name: str = None,
  overwrite: bool = False,
  rules: Optional[List[Rule]] = None,
  enable_monitoring: bool = True,
@@ -317,7 +312,7 @@ class TraceClient:
  ):
  self.name = name
  self.trace_id = trace_id or str(uuid.uuid4())
- self.project_name = project_name
+ self.project_name = project_name or str(uuid.uuid4())
  self.overwrite = overwrite
  self.tracer = tracer
  self.rules = rules or []
@@ -507,6 +502,28 @@ class TraceClient:
  span = self.span_id_to_span[current_span_id]
  span.agent_name = agent_name

+ def record_state_before(self, state: dict):
+ """Records the agent's state before a tool execution on the current span.
+
+ Args:
+ state: A dictionary representing the agent's state.
+ """
+ current_span_id = current_span_var.get()
+ if current_span_id:
+ span = self.span_id_to_span[current_span_id]
+ span.state_before = state
+
+ def record_state_after(self, state: dict):
+ """Records the agent's state after a tool execution on the current span.
+
+ Args:
+ state: A dictionary representing the agent's state.
+ """
+ current_span_id = current_span_var.get()
+ if current_span_id:
+ span = self.span_id_to_span[current_span_id]
+ span.state_after = state
+
  async def _update_coroutine(self, span: TraceSpan, coroutine: Any, field: str):
  """Helper method to update the output of a trace entry once the coroutine completes"""
  try:
@@ -540,7 +557,7 @@ class TraceClient:
  # Removed else block - original didn't have one
  return None # Return None if no span_id found

- def record_error(self, error: Any):
+ def record_error(self, error: Dict[str, Any]):
  current_span_id = current_span_var.get()
  if current_span_id:
  span = self.span_id_to_span[current_span_id]
@@ -579,7 +596,7 @@ class TraceClient:
  "project_name": self.project_name,
  "created_at": datetime.utcfromtimestamp(self.start_time).isoformat(),
  "duration": total_duration,
- "entries": [span.model_dump() for span in self.trace_spans],
+ "trace_spans": [span.model_dump() for span in self.trace_spans],
  "evaluation_runs": [run.model_dump() for run in self.evaluation_runs],
  "overwrite": overwrite,
  "offline_mode": self.tracer.offline_mode,
@@ -599,7 +616,7 @@ class TraceClient:
  def delete(self):
  return self.trace_manager_client.delete_trace(self.trace_id)

- def _capture_exception_for_trace(current_trace: Optional['TraceClient'], exc_info: Tuple[Optional[type], Optional[BaseException], Optional[types.TracebackType]]):
+ def _capture_exception_for_trace(current_trace: Optional['TraceClient'], exc_info: ExcInfo):
  if not current_trace:
  return

@@ -609,6 +626,27 @@ def _capture_exception_for_trace(current_trace: Optional['TraceClient'], exc_inf
  "message": str(exc_value) if exc_value else "No exception message",
  "traceback": traceback.format_tb(exc_traceback_obj) if exc_traceback_obj else []
  }
+
+ # This is where we specially handle exceptions that we might want to collect additional data for.
+ # When we do this, always try checking the module from sys.modules instead of importing. This will
+ # Let us support a wider range of exceptions without needing to import them for all clients.
+
+ # Most clients (requests, httpx, urllib) support the standard format of exposing error.request.url and error.response.status_code
+ # The alternative is to hand select libraries we want from sys.modules and check for them:
+ # As an example: requests_module = sys.modules.get("requests", None) // then do things with requests_module;
+
+ # General HTTP Like errors
+ try:
+ url = getattr(getattr(exc_value, "request", None), "url", None)
+ status_code = getattr(getattr(exc_value, "response", None), "status_code", None)
+ if status_code:
+ formatted_exception["http"] = {
+ "url": url if url else "Unknown URL",
+ "status_code": status_code if status_code else None,
+ }
+ except Exception as e:
+ pass
+
  current_trace.record_error(formatted_exception)
  class _DeepTracer:
  _instance: Optional["_DeepTracer"] = None
@@ -907,7 +945,7 @@ class Tracer:
  def __init__(
  self,
  api_key: str = os.getenv("JUDGMENT_API_KEY"),
- project_name: str = "default_project",
+ project_name: str = None,
  rules: Optional[List[Rule]] = None, # Added rules parameter
  organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
  enable_monitoring: bool = os.getenv("JUDGMENT_MONITORING", "true").lower() == "true",
@@ -935,7 +973,7 @@ class Tracer:
  raise ValueError("S3 bucket name must be provided when use_s3 is True")

  self.api_key: str = api_key
- self.project_name: str = project_name
+ self.project_name: str = project_name or str(uuid.uuid4())
  self.organization_id: str = organization_id
  self._current_trace: Optional[str] = None
  self._active_trace_client: Optional[TraceClient] = None # Add active trace client attribute
@@ -1068,32 +1106,92 @@ class Tracer:

  rprint(f"[bold]{label}:[/bold] {msg}")

- def identify(self, identifier: str):
+ def identify(self, identifier: str, track_state: bool = False, track_attributes: Optional[List[str]] = None, field_mappings: Optional[Dict[str, str]] = None):
  """
- Class decorator that associates a class with a custom identifier.
+ Class decorator that associates a class with a custom identifier and enables state tracking.

  This decorator creates a mapping between the class name and the provided
  identifier, which can be useful for tagging, grouping, or referencing
- classes in a standardized way.
+ classes in a standardized way. It also enables automatic state capture
+ for instances of the decorated class when used with tracing.

  Args:
- identifier: The identifier to associate with the decorated class
-
- Returns:
- A decorator function that registers the class with the given identifier
+ identifier: The identifier to associate with the decorated class.
+ This will be used as the instance name in traces.
+ track_state: Whether to automatically capture the state (attributes)
+ of instances before and after function execution. Defaults to False.
+ track_attributes: Optional list of specific attribute names to track.
+ If None, all non-private attributes (not starting with '_')
+ will be tracked when track_state=True.
+ field_mappings: Optional dictionary mapping internal attribute names to
+ display names in the captured state. For example:
+ {"system_prompt": "instructions"} will capture the
+ 'instructions' attribute as 'system_prompt' in the state.

  Example:
- @tracer.identify(identifier="user_model")
+ @tracer.identify(identifier="user_model", track_state=True, track_attributes=["name", "age"], field_mappings={"system_prompt": "instructions"})
  class User:
  # Class implementation
  """
  def decorator(cls):
  class_name = cls.__name__
- self.class_identifiers[class_name] = identifier
+ self.class_identifiers[class_name] = {
+ "identifier": identifier,
+ "track_state": track_state,
+ "track_attributes": track_attributes,
+ "field_mappings": field_mappings or {}
+ }
  return cls

  return decorator

+ def _capture_instance_state(self, instance: Any, class_config: Dict[str, Any]) -> Dict[str, Any]:
+ """
+ Capture the state of an instance based on class configuration.
+ Args:
+ instance: The instance to capture the state of.
+ class_config: Configuration dictionary for state capture,
+ expected to contain 'track_attributes' and 'field_mappings'.
+ """
+ track_attributes = class_config.get('track_attributes')
+ field_mappings = class_config.get('field_mappings')
+
+ if track_attributes:
+
+ state = {attr: getattr(instance, attr, None) for attr in track_attributes}
+ else:
+
+ state = {k: v for k, v in instance.__dict__.items() if not k.startswith('_')}
+
+ if field_mappings:
+ state['field_mappings'] = field_mappings
+
+ return state
+
+
+ def _get_instance_state_if_tracked(self, args):
+ """
+ Extract instance state if the instance should be tracked.
+
+ Returns the captured state dict if tracking is enabled, None otherwise.
+ """
+ if args and hasattr(args[0], '__class__'):
+ instance = args[0]
+ class_name = instance.__class__.__name__
+ if (class_name in self.class_identifiers and
+ isinstance(self.class_identifiers[class_name], dict) and
+ self.class_identifiers[class_name].get('track_state', False)):
+ return self._capture_instance_state(instance, self.class_identifiers[class_name])
+
+ def _conditionally_capture_and_record_state(self, trace_client_instance: TraceClient, args: tuple, is_before: bool):
+ """Captures instance state if tracked and records it via the trace_client."""
+ state = self._get_instance_state_if_tracked(args)
+ if state:
+ if is_before:
+ trace_client_instance.record_state_before(state)
+ else:
+ trace_client_instance.record_state_after(state)
+
  def observe(self, func=None, *, name=None, span_type: SpanType = "span", project_name: str = None, overwrite: bool = False, deep_tracing: bool = None):
  """
  Decorator to trace function execution with detailed entry/exit information.
@@ -1171,6 +1269,9 @@ class Tracer:
  span.record_input(inputs)
  if agent_name:
  span.record_agent_name(agent_name)
+
+ # Capture state before execution
+ self._conditionally_capture_and_record_state(span, args, is_before=True)

  if use_deep_tracing:
  with _DeepTracer():
@@ -1181,7 +1282,10 @@ class Tracer:
  except Exception as e:
  _capture_exception_for_trace(current_trace, sys.exc_info())
  raise e
-
+
+ # Capture state after execution
+ self._conditionally_capture_and_record_state(span, args, is_before=False)
+
  # Record output
  span.record_output(result)
  return result
@@ -1199,6 +1303,9 @@ class Tracer:
  if agent_name:
  span.record_agent_name(agent_name)

+ # Capture state before execution
+ self._conditionally_capture_and_record_state(span, args, is_before=True)
+
  if use_deep_tracing:
  with _DeepTracer():
  result = await func(*args, **kwargs)
@@ -1208,6 +1315,9 @@ class Tracer:
  except Exception as e:
  _capture_exception_for_trace(current_trace, sys.exc_info())
  raise e
+
+ # Capture state after execution
+ self._conditionally_capture_and_record_state(span, args, is_before=False)

  span.record_output(result)
  return result
@@ -1258,6 +1368,9 @@ class Tracer:
  span.record_input(inputs)
  if agent_name:
  span.record_agent_name(agent_name)
+ # Capture state before execution
+ self._conditionally_capture_and_record_state(span, args, is_before=True)
+
  if use_deep_tracing:
  with _DeepTracer():
  result = func(*args, **kwargs)
@@ -1267,6 +1380,10 @@ class Tracer:
  except Exception as e:
  _capture_exception_for_trace(current_trace, sys.exc_info())
  raise e
+
+ # Capture state after execution
+ self._conditionally_capture_and_record_state(span, args, is_before=False)
+

  # Record output
  span.record_output(result)
@@ -1286,6 +1403,9 @@ class Tracer:
  if agent_name:
  span.record_agent_name(agent_name)

+ # Capture state before execution
+ self._conditionally_capture_and_record_state(span, args, is_before=True)
+
  if use_deep_tracing:
  with _DeepTracer():
  result = func(*args, **kwargs)
@@ -1296,6 +1416,9 @@ class Tracer:
  _capture_exception_for_trace(current_trace, sys.exc_info())
  raise e

+ # Capture state after execution
+ self._conditionally_capture_and_record_state(span, args, is_before=False)
+
  span.record_output(result)
  return result

@@ -1369,13 +1492,6 @@ def wrap(client: Any) -> Any:
  span.record_usage(usage)
  return response

- def _handle_error(span, e, is_async):
- """Handle and record errors"""
- call_type = "async" if is_async else "sync"
- print(f"Error during wrapped {call_type} API call ({span_name}): {e}")
- span.record_output({"error": str(e)})
- raise
-
  # --- Traced Async Functions ---
  async def traced_create_async(*args, **kwargs):
  current_trace = current_trace_var.get()
@@ -1389,7 +1505,8 @@ def wrap(client: Any) -> Any:
  response_or_iterator = await original_create(*args, **kwargs)
  return _format_and_record_output(span, response_or_iterator, is_streaming, True, False)
  except Exception as e:
- return _handle_error(span, e, True)
+ _capture_exception_for_trace(span, sys.exc_info())
+ raise e

  # Async responses for OpenAI clients
  async def traced_response_create_async(*args, **kwargs):
@@ -1404,7 +1521,8 @@ def wrap(client: Any) -> Any:
  response_or_iterator = await original_responses_create(*args, **kwargs)
  return _format_and_record_output(span, response_or_iterator, is_streaming, True, True)
  except Exception as e:
- return _handle_error(span, e, True)
+ _capture_exception_for_trace(span, sys.exc_info())
+ raise e

  # Function replacing .stream() for async clients
  def traced_stream_async(*args, **kwargs):
@@ -1435,7 +1553,8 @@ def wrap(client: Any) -> Any:
  response_or_iterator = original_create(*args, **kwargs)
  return _format_and_record_output(span, response_or_iterator, is_streaming, False, False)
  except Exception as e:
- return _handle_error(span, e, False)
+ _capture_exception_for_trace(span, sys.exc_info())
+ raise e

  def traced_response_create_sync(*args, **kwargs):
  current_trace = current_trace_var.get()
@@ -1449,7 +1568,8 @@ def wrap(client: Any) -> Any:
  response_or_iterator = original_responses_create(*args, **kwargs)
  return _format_and_record_output(span, response_or_iterator, is_streaming, False, True)
  except Exception as e:
- return _handle_error(span, e, False)
+ _capture_exception_for_trace(span, sys.exc_info())
+ raise e

  # Function replacing sync .stream()
  def traced_stream_sync(*args, **kwargs):
@@ -1990,10 +2110,12 @@ def get_instance_prefixed_name(instance, class_name, class_identifiers):
  Otherwise, returns None.
  """
  if class_name in class_identifiers:
- attr = class_identifiers[class_name]
+ class_config = class_identifiers[class_name]
+ attr = class_config['identifier']
+
  if hasattr(instance, attr):
  instance_name = getattr(instance, attr)
  return instance_name
  else:
- raise Exception(f"Attribute {class_identifiers[class_name]} does not exist for {class_name}. Check your identify() decorator.")
+ raise Exception(f"Attribute {attr} does not exist for {class_name}. Check your identify() decorator.")
  return None
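
The identify() and state-tracking changes above add track_state, track_attributes, and field_mappings to the class decorator and record state_before/state_after on the active span around each traced call. A minimal usage sketch based only on the signatures shown in this diff; the agent class, its attributes, and the project name are invented for illustration, and JUDGMENT_API_KEY/JUDGMENT_ORG_ID are assumed to be set in the environment:

    from judgeval.common.tracer import Tracer

    tracer = Tracer(project_name="state-tracking-demo")  # project_name now falls back to a random UUID when omitted

    @tracer.identify(
        identifier="name",                   # instance attribute used as the agent name in traces
        track_state=True,                    # capture instance attributes before/after each traced call
        track_attributes=["name", "notes"],  # restrict capture to these attributes (default: all non-private)
    )
    class ResearchAgent:
        def __init__(self):
            self.name = "researcher"
            self.notes = []

        @tracer.observe
        def take_note(self, note: str) -> int:
            # state_before/state_after for this span are recorded automatically
            self.notes.append(note)
            return len(self.notes)

field_mappings works the same way: it is stored with the captured state so that an internal attribute (e.g. 'instructions') can be surfaced under a different key (e.g. 'system_prompt'), per the docstring above.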
judgeval/common/utils.py CHANGED
@@ -12,9 +12,10 @@ NOTE: any function beginning with 'a', e.g. 'afetch_together_api_response', is a
  import asyncio
  import concurrent.futures
  import os
+ from types import TracebackType
  import requests
  import pprint
- from typing import Any, Dict, List, Literal, Mapping, Optional, Union
+ from typing import Any, Dict, List, Literal, Mapping, Optional, TypeAlias, Union

  # Third-party imports
  import litellm
@@ -782,3 +783,6 @@ if __name__ == "__main__":
  ]
  ]
  ))
+
+ ExcInfo: TypeAlias = tuple[type[BaseException], BaseException, TracebackType]
+ OptExcInfo: TypeAlias = ExcInfo | tuple[None, None, None]
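
utils.py now exposes the ExcInfo/OptExcInfo aliases used by _capture_exception_for_trace above (written with PEP 604/613 syntax, so Python 3.10+). A small illustrative sketch; the summarize helper is hypothetical:

    import sys
    from judgeval.common.utils import ExcInfo

    def summarize(exc_info: ExcInfo) -> str:
        exc_type, exc_value, _tb = exc_info
        return f"{exc_type.__name__}: {exc_value}"

    try:
        1 / 0
    except ZeroDivisionError:
        print(summarize(sys.exc_info()))  # ZeroDivisionError: division by zero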
judgeval/data/datasets/dataset.py CHANGED
@@ -5,14 +5,15 @@ import json
  import os
  import yaml
  from dataclasses import dataclass, field
- from typing import List, Union, Literal
+ from typing import List, Union, Literal, Optional

- from judgeval.data import Example
+ from judgeval.data import Example, Trace
  from judgeval.common.logger import debug, error, warning, info

  @dataclass
  class EvalDataset:
  examples: List[Example]
+ traces: List[Trace]
  _alias: Union[str, None] = field(default=None)
  _id: Union[str, None] = field(default=None)
  judgment_api_key: str = field(default="")
@@ -20,12 +21,13 @@ class EvalDataset:
  def __init__(self,
  judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"),
  organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
- examples: List[Example] = [],
+ examples: Optional[List[Example]] = None,
+ traces: Optional[List[Trace]] = None
  ):
- debug(f"Initializing EvalDataset with {len(examples)} examples")
  if not judgment_api_key:
  warning("No judgment_api_key provided")
- self.examples = examples
+ self.examples = examples or []
+ self.traces = traces or []
  self._alias = None
  self._id = None
  self.judgment_api_key = judgment_api_key
@@ -218,8 +220,11 @@ class EvalDataset:
  self.add_example(e)

  def add_example(self, e: Example) -> None:
- self.examples = self.examples + [e]
+ self.examples.append(e)
  # TODO if we need to add rank, then we need to do it here
+
+ def add_trace(self, t: Trace) -> None:
+ self.traces.append(t)

  def save_as(self, file_type: Literal["json", "csv", "yaml"], dir_path: str, save_name: str = None) -> None:
  """
@@ -307,6 +312,7 @@ class EvalDataset:
  return (
  f"{self.__class__.__name__}("
  f"examples={self.examples}, "
+ f"traces={self.traces}, "
  f"_alias={self._alias}, "
  f"_id={self._id}"
  f")"
judgeval/data/datasets/eval_dataset_client.py CHANGED
@@ -13,7 +13,7 @@ from judgeval.constants import (
  JUDGMENT_DATASETS_INSERT_API_URL,
  JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
  )
- from judgeval.data import Example
+ from judgeval.data import Example, Trace
  from judgeval.data.datasets import EvalDataset


@@ -58,6 +58,7 @@ class EvalDatasetClient:
  "dataset_alias": alias,
  "project_name": project_name,
  "examples": [e.to_dict() for e in dataset.examples],
+ "traces": [t.model_dump() for t in dataset.traces],
  "overwrite": overwrite,
  }
  try:
@@ -202,6 +203,7 @@ class EvalDatasetClient:
  info(f"Successfully pulled dataset with alias '{alias}'")
  payload = response.json()
  dataset.examples = [Example(**e) for e in payload.get("examples", [])]
+ dataset.traces = [Trace(**t) for t in payload.get("traces", [])]
  dataset._alias = payload.get("alias")
  dataset._id = payload.get("id")
  progress.update(
judgeval/data/trace.py CHANGED
@@ -33,6 +33,8 @@ class TraceSpan(BaseModel):
  additional_metadata: Optional[Dict[str, Any]] = None
  has_evaluation: Optional[bool] = False
  agent_name: Optional[str] = None
+ state_before: Optional[Dict[str, Any]] = None
+ state_after: Optional[Dict[str, Any]] = None

  def model_dump(self, **kwargs):
  return {
@@ -50,7 +52,9 @@ class TraceSpan(BaseModel):
  "span_type": self.span_type,
  "usage": self.usage.model_dump() if self.usage else None,
  "has_evaluation": self.has_evaluation,
- "agent_name": self.agent_name
+ "agent_name": self.agent_name,
+ "state_before": self.state_before,
+ "state_after": self.state_after
  }

  def print_span(self):
@@ -113,7 +117,7 @@ class Trace(BaseModel):
  name: str
  created_at: str
  duration: float
- entries: List[TraceSpan]
+ trace_spans: List[TraceSpan]
  overwrite: bool = False
  offline_mode: bool = False
  rules: Optional[Dict[str, Any]] = None
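
Note the rename in the Trace model: the list of spans is now trace_spans rather than entries (run_evaluation.py below is updated accordingly). A one-line migration sketch, assuming an already-fetched Trace object named trace:

    # Before 0.0.41 this field was `trace.entries`
    for span in trace.trace_spans:
        print(span.span_type, span.state_before, span.state_after)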
judgeval/judgment_client.py CHANGED
@@ -63,7 +63,15 @@ class SingletonMeta(type):
  return cls._instances[cls]

  class JudgmentClient(metaclass=SingletonMeta):
- def __init__(self, judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"), organization_id: str = os.getenv("JUDGMENT_ORG_ID")):
+ def __init__(self, judgment_api_key: Optional[str] = os.getenv("JUDGMENT_API_KEY"), organization_id: Optional[str] = os.getenv("JUDGMENT_ORG_ID")):
+ # Check if API key is None
+ if judgment_api_key is None:
+ raise ValueError("JUDGMENT_API_KEY cannot be None. Please provide a valid API key or set the JUDGMENT_API_KEY environment variable.")
+
+ # Check if organization ID is None
+ if organization_id is None:
+ raise ValueError("JUDGMENT_ORG_ID cannot be None. Please provide a valid organization ID or set the JUDGMENT_ORG_ID environment variable.")
+
  self.judgment_api_key = judgment_api_key
  self.organization_id = organization_id
  self.eval_dataset_client = EvalDatasetClient(judgment_api_key, organization_id)
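
JudgmentClient now rejects missing credentials at construction time. A sketch of the new behavior; the import path follows the RECORD listing below and the values are placeholders:

    import os
    from judgeval.judgment_client import JudgmentClient

    client = JudgmentClient(
        judgment_api_key=os.getenv("JUDGMENT_API_KEY"),  # None -> ValueError("JUDGMENT_API_KEY cannot be None. ...")
        organization_id=os.getenv("JUDGMENT_ORG_ID"),    # None -> ValueError("JUDGMENT_ORG_ID cannot be None. ...")
    )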
judgeval/run_evaluation.py CHANGED
@@ -1,6 +1,7 @@
  import asyncio
  import requests
  import time
+ import json
  import sys
  import itertools
  import threading
@@ -362,14 +363,26 @@ def check_examples(examples: List[Example], scorers: List[Union[APIJudgmentScore
  """
  Checks if the example contains the necessary parameters for the scorer.
  """
+ prompt_user = False
  for scorer in scorers:
  for example in examples:
  missing_params = []
  for param in scorer.required_params:
  if getattr(example, param.value) is None:
- missing_params.append(f"'{param.value}'")
+ missing_params.append(f"{param.value}")
  if missing_params:
- print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
+ rprint(f"[yellow]⚠️ WARNING:[/yellow] Example is missing required parameters for scorer [bold]{scorer.score_type.value}[/bold]")
+ rprint(f"Missing parameters: {', '.join(missing_params)}")
+ rprint(f"Example: {json.dumps(example.model_dump(), indent=2)}")
+ rprint("-"*40)
+ prompt_user = True
+
+ if prompt_user:
+ user_input = input("Do you want to continue? (y/n)")
+ if user_input.lower() != "y":
+ sys.exit(0)
+ else:
+ rprint("[green]Continuing...[/green]")

  def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: bool = True, function: Optional[Callable] = None, tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None, examples: Optional[List[Example]] = None) -> List[ScoringResult]:
  # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
@@ -407,7 +420,7 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
  for i, trace in enumerate(tracer.traces):
  # We set the root-level trace span with the expected tools of the Trace
  trace = Trace(**trace)
- trace.entries[0].expected_tools = examples[i].expected_tools
+ trace.trace_spans[0].expected_tools = examples[i].expected_tools
  new_traces.append(trace)
  trace_run.traces = new_traces
  tracer.traces = []
@@ -894,6 +907,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
  f"Processing evaluation '{evaluation_run.eval_name}': "
  )
  else:
+ check_examples(evaluation_run.examples, evaluation_run.scorers)
  if judgment_scorers:
  # Execute evaluation using Judgment API
  info("Starting API evaluation")
judgeval/scorers/judgeval_scorer.py CHANGED
@@ -12,7 +12,7 @@ from judgeval.common.logger import debug, info, warning, error
  from judgeval.judges import JudgevalJudge
  from judgeval.judges.utils import create_judge
  from judgeval.constants import UNBOUNDED_SCORERS
-
+ from judgeval.data.example import ExampleParams
  class JudgevalScorer:
  """
  Base class for scorers in `judgeval`.
@@ -39,6 +39,7 @@ class JudgevalScorer:
  evaluation_cost: Optional[float] = None # The cost of running the scorer
  verbose_logs: Optional[str] = None # The verbose logs of the scorer
  additional_metadata: Optional[Dict] = None # Additional metadata for the scorer
+ required_params: Optional[List[ExampleParams]] = None # The required parameters for the scorer
  error: Optional[str] = None
  success: Optional[bool] = None

@@ -51,6 +52,7 @@ class JudgevalScorer:
  reason: Optional[str] = None,
  success: Optional[bool] = None,
  evaluation_model: Optional[str] = None,
+ required_params: Optional[List[ExampleParams]] = None,
  strict_mode: bool = False,
  async_mode: bool = True,
  verbose_mode: bool = True,
@@ -87,6 +89,7 @@ class JudgevalScorer:
  self.evaluation_cost = evaluation_cost
  self.verbose_logs = verbose_logs
  self.additional_metadata = additional_metadata
+ self.required_params = required_params

  def _add_model(self, model: Optional[Union[str, List[str], JudgevalJudge]] = None):
  """
judgeval/scorers/prompt_scorer.py CHANGED
@@ -30,6 +30,7 @@ from typing import List, Optional, Tuple, Any, Mapping
  from pydantic import BaseModel, model_serializer, Field

  from judgeval.data import Example
+ from judgeval.data.example import ExampleParams
  from judgeval.scorers import JudgevalScorer
  from judgeval.scorers.utils import (
  scorer_progress_meter,
@@ -64,6 +65,7 @@ class PromptScorer(JudgevalScorer, BaseModel):
  async_mode: bool = True,
  strict_mode: bool = False,
  verbose_mode: bool = False,
+ required_params: Optional[List[ExampleParams]] = None,
  ):
  # Initialize BaseModel first
  BaseModel.__init__(
@@ -85,6 +87,7 @@ class PromptScorer(JudgevalScorer, BaseModel):
  async_mode=async_mode,
  strict_mode=strict_mode,
  verbose_mode=verbose_mode,
+ required_params=required_params,
  )

  def score_example(
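
The required_params hook added to JudgevalScorer and threaded through PromptScorer is what check_examples() in run_evaluation.py reads: any example whose attribute named by param.value is None triggers the warning/confirmation prompt. A hedged sketch of that relationship; only the ExampleParams enum and the param.value lookup come from this diff, and the member names used here are assumptions:

    from judgeval.data.example import Example, ExampleParams

    # A scorer advertises what it needs, e.g. by passing this list as
    # required_params to JudgevalScorer.__init__ / PromptScorer.__init__.
    required = [ExampleParams.INPUT, ExampleParams.ACTUAL_OUTPUT]  # member names assumed

    example = Example(input="What is 2 + 2?", actual_output=None)
    missing = [p.value for p in required if getattr(example, p.value) is None]
    print(missing)  # ['actual_output'] -- exactly what check_examples() would flag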
{judgeval-0.0.40.dist-info → judgeval-0.0.41.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: judgeval
- Version: 0.0.40
+ Version: 0.0.41
  Summary: Judgeval Package
  Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
  Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -37,11 +37,11 @@ Description-Content-Type: text/markdown

  <br>

- ## [🌐 Landing Page](https://www.judgmentlabs.ai/) • [📚 Docs](https://judgment.mintlify.app/getting_started) • [🚀 Demos](https://www.youtube.com/@AlexShan-j3o)
+ ## [🌐 Landing Page](https://www.judgmentlabs.ai/) • [📚 Docs](https://docs.judgmentlabs.ai/introduction) • [🚀 Demos](https://www.youtube.com/@AlexShan-j3o)

  [![X](https://img.shields.io/badge/-X/Twitter-000?logo=x&logoColor=white)](https://x.com/JudgmentLabs)
  [![LinkedIn](https://custom-icon-badges.demolab.com/badge/LinkedIn%20-0A66C2?logo=linkedin-white&logoColor=fff)](https://www.linkedin.com/company/judgmentlabs)
- [![Discord](https://img.shields.io/badge/-Discord-5865F2?logo=discord&logoColor=white)](https://discord.gg/FMxHkYTtFE)
+ [![Discord](https://img.shields.io/badge/-Discord-5865F2?logo=discord&logoColor=white)](https://discord.gg/ZCnSXYug)

  </div>

@@ -56,19 +56,28 @@ We support tracing agents built with LangGraph, OpenAI SDK, Anthropic, ... and a
  Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).

  ## 📋 Table of Contents
- * [ Features](#-features)
- * [🔍 Tracing](#-tracing)
- * [🧪 Evals](#-evals)
- * [📡 Monitoring](#-monitoring)
- * [📊 Datasets](#-datasets)
- * [💡 Insights](#-insights)
- * [🛠️ Installation](#️-installation)
- * [🏁 Get Started](#-get-started)
- * [🏢 Self-Hosting](#-self-hosting)
- * [📚 Cookbooks](#-cookbooks)
- * [💻 Development with Cursor](#-development-with-cursor)
- * [ Star Us on GitHub](#-star-us-on-github)
- * [❤️ Contributors](#️-contributors)
+ - [🌐 Landing Page • 📚 Docs • 🚀 Demos](#-landing-page----docs---demos)
+ - [Judgeval: open-source testing, monitoring, and optimization for AI agents](#judgeval-open-source-testing-monitoring-and-optimization-for-ai-agents)
+ - [📋 Table of Contents](#-table-of-contents)
+ - [ Features](#-features)
+ - [🛠️ Installation](#️-installation)
+ - [🏁 Get Started](#-get-started)
+ - [🛰️ Tracing](#️-tracing)
+ - [📝 Offline Evaluations](#-offline-evaluations)
+ - [📡 Online Evaluations](#-online-evaluations)
+ - [🏢 Self-Hosting](#-self-hosting)
+ - [Key Features](#key-features)
+ - [Getting Started](#getting-started)
+ - [📚 Cookbooks](#-cookbooks)
+ - [Sample Agents](#sample-agents)
+ - [💰 LangGraph Financial QA Agent](#-langgraph-financial-qa-agent)
+ - [✈️ OpenAI Travel Agent](#️-openai-travel-agent)
+ - [Custom Evaluators](#custom-evaluators)
+ - [🔍 PII Detection](#-pii-detection)
+ - [📧 Cold Email Generation](#-cold-email-generation)
+ - [💻 Development with Cursor](#-development-with-cursor)
+ - [⭐ Star Us on GitHub](#-star-us-on-github)
+ - [❤️ Contributors](#️-contributors)

  <!-- Created by https://github.com/ekalinin/github-markdown-toc -->

{judgeval-0.0.40.dist-info → judgeval-0.0.41.dist-info}/RECORD RENAMED
@@ -2,27 +2,27 @@ judgeval/__init__.py,sha256=x9HWt4waJwJMAqTuJSg2MezF9Zg-macEjeU-ajbly-8,330
  judgeval/clients.py,sha256=EiTmvvWksTPyWIuMC9jz06SPY2vFzokIJUIGoScpisA,989
  judgeval/constants.py,sha256=xuO-Und5c0-K3yTRn2fAkwyY2uTf8b7dGd39CPVqkSQ,5661
  judgeval/evaluation_run.py,sha256=KNGtaGAwD18pDNOKF7PCMlLnQe9SpRLTs0XWFMrCiLc,6684
- judgeval/judgment_client.py,sha256=TkYNCzuy5toIqvsgCSGO4WyKfUSgEM_gX2pbQqWCFJo,24481
+ judgeval/judgment_client.py,sha256=JO3AkU-disPHQVK5g1SM-bs_EUSy8QZ3AaAj_Q2ag6s,24968
  judgeval/rules.py,sha256=jkh1cXXcUf8oRY7xJUZfcQBYWn_rjUW4GvrhRt15PeU,20265
- judgeval/run_evaluation.py,sha256=Kg2iFrpVq-rGMfQokM5s_LJ4BSqGNxjQUxnxfaiTOZ4,49135
+ judgeval/run_evaluation.py,sha256=MshtOGvWm_eGj2JamEtiMWvPjdCwrKTp9WcAUrBm2Fs,49673
  judgeval/version_check.py,sha256=bvJEidB7rAeXozoUbN9Yb97QOR_s2hgvpvj74jJ5HlY,943
  judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
  judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
  judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
  judgeval/common/s3_storage.py,sha256=W8wq9S7qJZdqdBR4sk3aEZ4K3-pz40DOoolOJrWs9Vo,3768
- judgeval/common/tracer.py,sha256=0HwWwbvnT9Z686q9ppqUB54xR-GKqlBnkXf29c76eow,84425
- judgeval/common/utils.py,sha256=w1SjpDtB1DTJapFSAvLzr_a3gGI45iacEoxIUnQXx4Q,34087
+ judgeval/common/tracer.py,sha256=rYNmyB3Z955xfnKmlase6gub8Xf5xz6nQefONs_Td5U,90870
+ judgeval/common/utils.py,sha256=sWdHfqgiF6AnKTQNmeUBfoEsddXgInI5M24t2-QYexk,34271
  judgeval/data/__init__.py,sha256=GX_GloDtBB35mv3INWbSTP2r9cwCU2IeIYjzRT0SAd8,530
  judgeval/data/custom_example.py,sha256=QRBqiRiZS8UgVeTRHY0r1Jzm6yAYsyg6zmHxQGxdiQs,739
  judgeval/data/example.py,sha256=jcK78ff-TKNl9Qtxvbd1g61crpo-s4fWHaqyMIbQNq0,6877
  judgeval/data/result.py,sha256=KfU9lhAKG_Xo2eGDm2uKVVRZpf177IDASg1cIwedJwE,3184
  judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,3273
  judgeval/data/tool.py,sha256=eEEvGDNNYWhcQiI6cjDv3rO1VoOJJS5LWGS76Gb_gtY,1813
- judgeval/data/trace.py,sha256=ETZEb_MJfv4vWr2y_uZ7FfIua7GrV6jgSUVjjURAdlQ,4602
+ judgeval/data/trace.py,sha256=S9IQunatke-Kcxi2-qXg3CtbmxBk8VGBDJzWshx7zJg,4798
  judgeval/data/trace_run.py,sha256=fiB5Z5il9U9XqvksdA2DbLNd96U_Wrz8K00RuFJBy38,2324
  judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
- judgeval/data/datasets/dataset.py,sha256=oU9hvZTifK2x8em3FhL3oIqgHOByfJWH6C_9rIKnL5g,12773
- judgeval/data/datasets/eval_dataset_client.py,sha256=3RBfkaMrkudjnmY_qFwY4I-2mOPE3XK4WxkfSweLB-Q,15016
+ judgeval/data/datasets/dataset.py,sha256=pq9-A1mg2Brpjg1TufDU_eLo9sQhX0nw-UTGaf3jCXA,12952
+ judgeval/data/datasets/eval_dataset_client.py,sha256=LJ1bf1sZAC4ZBCRTQ1Y4VrJuNSslYBQ1y9YKuhYxwqY,15176
  judgeval/integrations/langgraph.py,sha256=L9zPPWVLGL2HWuwHPqM5Kic4S7EfQ_Y1Y3YKBJNfGCA,23004
  judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
  judgeval/judges/base_judge.py,sha256=ch_S7uBB7lyv44Lf1d7mIGFpveOO58zOkkpImKgd9_4,994
@@ -33,8 +33,8 @@ judgeval/judges/utils.py,sha256=vL-15_udU94JHUAiyrAvHAKMj6Fqypg01ek4YH5zVCM,2687
  judgeval/scorers/__init__.py,sha256=VKPveyGCv5Rc0YtuT7iAxSv-M5EuikqAVeaGNnYMuWE,1340
  judgeval/scorers/api_scorer.py,sha256=NQ_CrrUPhSUk1k2Q8rKpCG_TU2FT32sFEqvb-Yi54B0,2688
  judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
- judgeval/scorers/judgeval_scorer.py,sha256=id4s72vn4pWKbjZDnWKHGlc4kqyUkrFqdlX1SgyDj6c,7027
- judgeval/scorers/prompt_scorer.py,sha256=9MGSG2OVvX2i2CUZmXg0i3rJHQKMe2dMBdMDcnjp8mg,11845
+ judgeval/scorers/judgeval_scorer.py,sha256=_qtXzl5aa1FH_50kVPnRfiwyCtuXPKyrGU71_3pOrBw,7288
+ judgeval/scorers/prompt_scorer.py,sha256=Uf_QZhytd78cInKZv8wr66Angz5sxLklP5hEEcoabq4,12001
  judgeval/scorers/score.py,sha256=h4eVlbItqG8R0nQgSgeyicYSIraZV9MvV-RRaFu46mg,18762
  judgeval/scorers/utils.py,sha256=iHQVTlIANbmCTXz9kTeSdOytgUZ_T74Re61ajqsk_WQ,6827
  judgeval/scorers/judgeval_scorers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -62,7 +62,7 @@ judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py,sha256
  judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
  judgeval/utils/alerts.py,sha256=O19Xj7DA0YVjl8PWiuH4zfdZeu3yiLVvHfY8ah2wG0g,2759
  judgeval/utils/data_utils.py,sha256=pB4GBWi8XoM2zSR2NlLXH5kqcQ029BVhDxaVKkdmiBY,1860
- judgeval-0.0.40.dist-info/METADATA,sha256=pAFVIDRiMlCOrbfQ0-epidECcUHl_fFuiLPgGnhDJYo,56712
- judgeval-0.0.40.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- judgeval-0.0.40.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
- judgeval-0.0.40.dist-info/RECORD,,
+ judgeval-0.0.41.dist-info/METADATA,sha256=-sO68MUEmN3s4ji7Vf1gTuPv60R7Ny6bMcuuKlFSSI8,57358
+ judgeval-0.0.41.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ judgeval-0.0.41.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+ judgeval-0.0.41.dist-info/RECORD,,