judgeval 0.0.38__py3-none-any.whl → 0.0.40__py3-none-any.whl

This diff reflects the changes between two publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
judgeval/constants.py CHANGED
@@ -28,6 +28,8 @@ class APIScorer(str, Enum):
     GROUNDEDNESS = "groundedness"
     DERAILMENT = "derailment"
     TOOL_ORDER = "tool_order"
+    CLASSIFIER = "classifier"
+    TOOL_DEPENDENCY = "tool_dependency"
     @classmethod
     def _missing_(cls, value):
         # Handle case-insensitive lookup
@@ -59,6 +61,7 @@ JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
 JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
 JUDGMENT_TRACES_ADD_ANNOTATION_API_URL = f"{ROOT_API}/traces/add_annotation/"
 JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
+JUDGMENT_GET_EVAL_STATUS_API_URL = f"{ROOT_API}/get_evaluation_status/"
 # RabbitMQ
 RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
 RABBITMQ_PORT = os.getenv("RABBITMQ_PORT", 5672)
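The two new scorer identifiers are plain string-valued enum members, so they behave like the existing ones. A minimal sketch of using them (the case-insensitive behaviour is implied by the `_missing_` comment above, not shown in full here):

from judgeval.constants import APIScorer

print(APIScorer.CLASSIFIER.value)        # "classifier"
print(APIScorer.TOOL_DEPENDENCY.value)   # "tool_dependency"
# The _missing_ hook above is documented as a case-insensitive lookup,
# so APIScorer("TOOL_DEPENDENCY") should resolve to APIScorer.TOOL_DEPENDENCY.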
judgeval/data/__init__.py CHANGED
@@ -2,7 +2,7 @@ from judgeval.data.example import Example, ExampleParams
 from judgeval.data.custom_example import CustomExample
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
-from judgeval.data.trace import Trace, TraceSpan
+from judgeval.data.trace import Trace, TraceSpan, TraceUsage


 __all__ = [
@@ -15,4 +15,5 @@ __all__ = [
     "generate_scoring_result",
     "Trace",
     "TraceSpan",
+    "TraceUsage"
 ]
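With this export added, the usage model can be imported alongside the other data models, e.g. `from judgeval.data import Trace, TraceSpan, TraceUsage`.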
judgeval/data/example.py CHANGED
@@ -8,6 +8,7 @@ from uuid import uuid4
 from pydantic import BaseModel, Field, field_validator
 from enum import Enum
 from datetime import datetime
+from judgeval.data.tool import Tool
 import time


@@ -31,19 +32,19 @@ class Example(BaseModel):
     retrieval_context: Optional[List[str]] = None
     additional_metadata: Optional[Dict[str, Any]] = None
     tools_called: Optional[List[str]] = None
-    expected_tools: Optional[List[Dict[str, Any]]] = None
+    expected_tools: Optional[List[Tool]] = None
     name: Optional[str] = None
     example_id: str = Field(default_factory=lambda: str(uuid4()))
     example_index: Optional[int] = None
-    timestamp: Optional[str] = None
+    created_at: Optional[str] = None
     trace_id: Optional[str] = None

     def __init__(self, **data):
         if 'example_id' not in data:
             data['example_id'] = str(uuid4())
         # Set timestamp if not provided
-        if 'timestamp' not in data:
-            data['timestamp'] = datetime.now().strftime("%Y%m%d_%H%M%S")
+        if 'created_at' not in data:
+            data['created_at'] = datetime.now().isoformat()
         super().__init__(**data)

     @field_validator('input', mode='before')
@@ -82,17 +83,17 @@ class Example(BaseModel):
                 raise ValueError(f"All items in expected_output must be strings but got {v}")
         return v

-    @field_validator('expected_tools', mode='before')
+    @field_validator('expected_tools')
     @classmethod
     def validate_expected_tools(cls, v):
         if v is not None:
             if not isinstance(v, list):
-                raise ValueError(f"Expected tools must be a list of dictionaries or None but got {v} of type {type(v)}")
+                raise ValueError(f"Expected tools must be a list of Tools or None but got {v} of type {type(v)}")

-            # Check that each item in the list is a dictionary
+            # Check that each item in the list is a Tool
             for i, item in enumerate(v):
-                if not isinstance(item, dict):
-                    raise ValueError(f"Expected tools must be a list of dictionaries, but item at index {i} is {item} of type {type(item)}")
+                if not isinstance(item, Tool):
+                    raise ValueError(f"Expected tools must be a list of Tools, but item at index {i} is {item} of type {type(item)}")

         return v

@@ -122,9 +123,9 @@ class Example(BaseModel):
             raise ValueError(f"Example index must be an integer or None but got {v} of type {type(v)}")
         return v

-    @field_validator('timestamp', mode='before')
+    @field_validator('created_at', mode='before')
     @classmethod
-    def validate_timestamp(cls, v):
+    def validate_created_at(cls, v):
         if v is not None and not isinstance(v, str):
             raise ValueError(f"Timestamp must be a string or None but got {v} of type {type(v)}")
         return v
@@ -149,7 +150,7 @@ class Example(BaseModel):
            "name": self.name,
            "example_id": self.example_id,
            "example_index": self.example_index,
-           "timestamp": self.timestamp,
+           "created_at": self.created_at,
        }

    def __str__(self):
@@ -165,5 +166,5 @@ class Example(BaseModel):
            f"name={self.name}, "
            f"example_id={self.example_id}, "
            f"example_index={self.example_index}, "
-           f"timestamp={self.timestamp}, "
+           f"created_at={self.created_at}, "
        )
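With this change, `expected_tools` is validated as a list of `Tool` models rather than plain dictionaries, and the old `timestamp` field is replaced by an ISO-8601 `created_at`. A minimal sketch of constructing an Example under the new schema (field values are illustrative):

from judgeval.data.example import Example
from judgeval.data.tool import Tool

example = Example(
    input="What is the weather in Paris?",
    actual_output="It is 20 degrees and sunny.",
    tools_called=["get_weather"],
    expected_tools=[Tool(tool_name="get_weather", parameters={"city": "Paris"})],
)
print(example.created_at)  # ISO-8601 string, set in __init__ when not provided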
judgeval/data/tool.py ADDED
@@ -0,0 +1,47 @@
+from pydantic import BaseModel, field_validator
+from typing import Dict, Any, Optional, List
+import warnings
+
+class Tool(BaseModel):
+    tool_name: str
+    parameters: Optional[Dict[str, Any]] = None
+    agent_name: Optional[str] = None
+    result_dependencies: Optional[List[Dict[str, Any]]] = None
+    action_dependencies: Optional[List[Dict[str, Any]]] = None
+    require_all: Optional[bool] = None
+
+    @field_validator('tool_name')
+    def validate_tool_name(cls, v):
+        if not v:
+            warnings.warn("Tool name is empty or None", UserWarning)
+        return v
+
+    @field_validator('parameters')
+    def validate_parameters(cls, v):
+        if v is not None and not isinstance(v, dict):
+            warnings.warn(f"Parameters should be a dictionary, got {type(v)}", UserWarning)
+        return v
+
+    @field_validator('agent_name')
+    def validate_agent_name(cls, v):
+        if v is not None and not isinstance(v, str):
+            warnings.warn(f"Agent name should be a string, got {type(v)}", UserWarning)
+        return v
+
+    @field_validator('result_dependencies')
+    def validate_result_dependencies(cls, v):
+        if v is not None and not isinstance(v, list):
+            warnings.warn(f"Result dependencies should be a list, got {type(v)}", UserWarning)
+        return v
+
+    @field_validator('action_dependencies')
+    def validate_action_dependencies(cls, v):
+        if v is not None and not isinstance(v, list):
+            warnings.warn(f"Action dependencies should be a list, got {type(v)}", UserWarning)
+        return v
+
+    @field_validator('require_all')
+    def validate_require_all(cls, v):
+        if v is not None and not isinstance(v, bool):
+            warnings.warn(f"Require all should be a boolean, got {type(v)}", UserWarning)
+        return v
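The validators on the new `Tool` model emit `UserWarning`s rather than raising, so a questionable value such as an empty tool name still constructs. A minimal sketch of that behaviour (values are illustrative):

import warnings
from judgeval.data.tool import Tool

tool = Tool(tool_name="get_weather", parameters={"city": "Paris"})
print(tool.model_dump())

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    Tool(tool_name="")  # empty name: the validator warns instead of raising
    print([str(w.message) for w in caught])  # ["Tool name is empty or None"]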
judgeval/data/trace.py CHANGED
@@ -1,39 +1,56 @@
 from pydantic import BaseModel
 from typing import Optional, Dict, Any, List
 from judgeval.evaluation_run import EvaluationRun
+from judgeval.data.tool import Tool
 import json
 from datetime import datetime, timezone

+class TraceUsage(BaseModel):
+    prompt_tokens: Optional[int] = None
+    completion_tokens: Optional[int] = None
+    total_tokens: Optional[int] = None
+    prompt_tokens_cost_usd: Optional[float] = None
+    completion_tokens_cost_usd: Optional[float] = None
+    total_cost_usd: Optional[float] = None
+    model_name: Optional[str] = None
+
 class TraceSpan(BaseModel):
     span_id: str
     trace_id: str
-    function: Optional[str] = None
+    function: str
     depth: int
     created_at: Optional[Any] = None
     parent_span_id: Optional[str] = None
     span_type: Optional[str] = "span"
     inputs: Optional[Dict[str, Any]] = None
+    error: Optional[Dict[str, Any]] = None
     output: Optional[Any] = None
+    usage: Optional[TraceUsage] = None
     duration: Optional[float] = None
     annotation: Optional[List[Dict[str, Any]]] = None
     evaluation_runs: Optional[List[EvaluationRun]] = []
-    expected_tools: Optional[List[Dict[str, Any]]] = None
+    expected_tools: Optional[List[Tool]] = None
     additional_metadata: Optional[Dict[str, Any]] = None
+    has_evaluation: Optional[bool] = False
+    agent_name: Optional[str] = None

     def model_dump(self, **kwargs):
         return {
             "span_id": self.span_id,
             "trace_id": self.trace_id,
             "depth": self.depth,
-            # "created_at": datetime.fromtimestamp(self.created_at).isoformat(),
             "created_at": datetime.fromtimestamp(self.created_at, tz=timezone.utc).isoformat(),
-            "inputs": self._serialize_inputs(),
-            "output": self._serialize_output(),
+            "inputs": self._serialize_value(self.inputs),
+            "output": self._serialize_value(self.output),
+            "error": self._serialize_value(self.error),
             "evaluation_runs": [run.model_dump() for run in self.evaluation_runs] if self.evaluation_runs else [],
             "parent_span_id": self.parent_span_id,
             "function": self.function,
             "duration": self.duration,
-            "span_type": self.span_type
+            "span_type": self.span_type,
+            "usage": self.usage.model_dump() if self.usage else None,
+            "has_evaluation": self.has_evaluation,
+            "agent_name": self.agent_name
         }

     def print_span(self):
@@ -41,30 +58,6 @@ class TraceSpan(BaseModel):
         indent = " " * self.depth
         parent_info = f" (parent_id: {self.parent_span_id})" if self.parent_span_id else ""
         print(f"{indent}→ {self.function} (id: {self.span_id}){parent_info}")
-
-    def _serialize_inputs(self) -> dict:
-        """Helper method to serialize input data safely."""
-        if self.inputs is None:
-            return {}
-
-        serialized_inputs = {}
-        for key, value in self.inputs.items():
-            if isinstance(value, BaseModel):
-                serialized_inputs[key] = value.model_dump()
-            elif isinstance(value, (list, tuple)):
-                # Handle lists/tuples of arguments
-                serialized_inputs[key] = [
-                    item.model_dump() if isinstance(item, BaseModel)
-                    else None if not self._is_json_serializable(item)
-                    else item
-                    for item in value
-                ]
-            else:
-                if self._is_json_serializable(value):
-                    serialized_inputs[key] = value
-                else:
-                    serialized_inputs[key] = self.safe_stringify(value, self.function)
-        return serialized_inputs

     def _is_json_serializable(self, obj: Any) -> bool:
         """Helper method to check if an object is JSON serializable."""
@@ -87,15 +80,11 @@ class TraceSpan(BaseModel):
             return repr(output)
         except (TypeError, OverflowError, ValueError):
             pass
-
-        warnings.warn(
-            f"Output for function {function_name} is not JSON serializable and could not be converted to string. Setting to None."
-        )
         return None

-    def _serialize_output(self) -> Any:
-        """Helper method to serialize output data safely."""
-        if self.output is None:
+    def _serialize_value(self, value: Any) -> Any:
+        """Helper method to deep serialize a value safely supporting Pydantic Models / regular PyObjects."""
+        if value is None:
             return None

         def serialize_value(value):
@@ -116,8 +105,8 @@ class TraceSpan(BaseModel):
            # Fallback to safe stringification
            return self.safe_stringify(value, self.function)

-        # Start serialization with the top-level output
-        return serialize_value(self.output)
+        # Start serialization with the top-level value
+        return serialize_value(value)

 class Trace(BaseModel):
     trace_id: str
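`TraceSpan.model_dump` now serializes token usage, error payloads, and the agent name alongside the existing fields, and the former `_serialize_inputs`/`_serialize_output` pair is collapsed into a single `_serialize_value` helper. A rough sketch of the resulting payload (field values are illustrative; `created_at` is a Unix timestamp, as implied by `datetime.fromtimestamp` above):

import time
from judgeval.data import TraceSpan, TraceUsage

span = TraceSpan(
    span_id="span-1",
    trace_id="trace-1",
    function="call_llm",   # now a required field
    depth=0,
    created_at=time.time(),
    usage=TraceUsage(prompt_tokens=12, completion_tokens=30, total_tokens=42, model_name="gpt-4.1"),
)
payload = span.model_dump()
# payload["usage"] is the TraceUsage dict; payload["error"] is None and
# payload["has_evaluation"] is False when those fields are left at their defaults.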
@@ -1,4 +1,3 @@
-
 from pydantic import BaseModel
 from typing import List, Optional, Dict, Any, Union, Callable
 from judgeval.data import Trace
@@ -22,6 +21,7 @@ class TraceRun(BaseModel):
         judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
         rules (Optional[List[Rule]]): Rules to evaluate against scoring results
         append (Optional[bool]): Whether to append to existing evaluation results
+        tools (Optional[List[Dict[str, Any]]]): List of tools to use for evaluation
     """

     # The user will specify whether they want log_results when they call run_eval
@@ -40,6 +40,7 @@ class TraceRun(BaseModel):
     judgment_api_key: Optional[str] = ""
     override: Optional[bool] = False
     rules: Optional[List[Rule]] = None
+    tools: Optional[List[Dict[str, Any]]] = None

     class Config:
         arbitrary_types_allowed = True
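The new `tools` field is typed only as `Optional[List[Dict[str, Any]]]`, so no dictionary shape is enforced here; the client code further below simply forwards its `tools` keyword into this field. An illustrative (assumed) shape mirroring the `Tool` model's field names:

tools = [{"tool_name": "get_weather", "parameters": {"city": "Paris"}}]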
@@ -1,5 +1,5 @@
 from typing import List, Optional, Dict, Any, Union
-from pydantic import BaseModel, field_validator
+from pydantic import BaseModel, field_validator, Field

 from judgeval.data import Example, CustomExample
 from judgeval.scorers import JudgevalScorer, APIJudgmentScorer
@@ -27,12 +27,12 @@ class EvaluationRun(BaseModel):
     # The user will specify whether they want log_results when they call run_eval
     log_results: bool = False # NOTE: log_results has to be set first because it is used to validate project_name and eval_name
     organization_id: Optional[str] = None
-    project_name: Optional[str] = None
-    eval_name: Optional[str] = None
+    project_name: Optional[str] = Field(default=None, validate_default=True)
+    eval_name: Optional[str] = Field(default=None, validate_default=True)
     examples: Union[List[Example], List[CustomExample]]
     scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
     model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1"
-    aggregator: Optional[str] = None
+    aggregator: Optional[str] = Field(default=None, validate_default=True)
     metadata: Optional[Dict[str, Any]] = None
     trace_span_id: Optional[str] = None
     # API Key will be "" until user calls client.run_eval(), then API Key will be set
@@ -96,9 +96,6 @@ class EvaluationRun(BaseModel):
     def validate_scorers(cls, v):
         if not v:
             raise ValueError("Scorers cannot be empty.")
-        for s in v:
-            if not isinstance(s, APIJudgmentScorer) and not isinstance(s, JudgevalScorer):
-                raise ValueError(f"Invalid type for Scorer: {type(s)}")
         return v

     @field_validator('model')
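Switching these fields to `Field(default=None, validate_default=True)` means their validators now run even when the default `None` is used; in pydantic v2, field validators are otherwise skipped for defaults. A small, self-contained sketch of that mechanism (this is not the actual EvaluationRun class, just an illustration of the pydantic behaviour):

from typing import Optional
from pydantic import BaseModel, Field, field_validator

class Run(BaseModel):
    log_results: bool = False
    # validate_default=True forces the validator below to run for the default None too
    project_name: Optional[str] = Field(default=None, validate_default=True)

    @field_validator("project_name")
    @classmethod
    def check_project_name(cls, v, info):
        if info.data.get("log_results") and v is None:
            raise ValueError("project_name is required when log_results is True")
        return v

Run()  # fine: log_results defaults to False, validator still runs on the default
try:
    Run(log_results=True)  # the default None for project_name is now validated too
except Exception as err:
    print(type(err).__name__)  # ValidationError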
@@ -5,6 +5,7 @@ import os
 from uuid import uuid4
 from typing import Optional, List, Dict, Any, Union, Callable
 import requests
+import asyncio

 from judgeval.constants import ROOT_API
 from judgeval.data.datasets import EvalDataset, EvalDatasetClient
@@ -121,7 +122,8 @@ class JudgmentClient(metaclass=SingletonMeta):
         ignore_errors: bool = True,
         rules: Optional[List[Rule]] = None,
         function: Optional[Callable] = None,
-        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None
+        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
+        tools: Optional[List[Dict[str, Any]]] = None
     ) -> List[ScoringResult]:
         try:

@@ -151,6 +153,7 @@ class JudgmentClient(metaclass=SingletonMeta):
                 append=append,
                 judgment_api_key=self.judgment_api_key,
                 organization_id=self.organization_id,
+                tools=tools
             )
             return run_trace_eval(trace_run, override, ignore_errors, function, tracer, examples)
         except ValueError as e:
@@ -173,7 +176,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         ignore_errors: bool = True,
         async_execution: bool = False,
         rules: Optional[List[Rule]] = None
-    ) -> List[ScoringResult]:
+    ) -> Union[List[ScoringResult], asyncio.Task]:
         """
         Executes an evaluation of `Example`s using one or more `Scorer`s

@@ -494,7 +497,9 @@ class JudgmentClient(metaclass=SingletonMeta):
         override: bool = False,
         rules: Optional[List[Rule]] = None,
         function: Optional[Callable] = None,
-        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None
+        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        async_execution: bool = False
     ) -> None:
         """
         Asserts a test by running the evaluation and checking the results for success
@@ -512,6 +517,14 @@ class JudgmentClient(metaclass=SingletonMeta):
             override (bool): Whether to override an existing evaluation run with the same name
             rules (Optional[List[Rule]]): Rules to evaluate against scoring results
         """
+
+        # Check for enable_param_checking and tools
+        for scorer in scorers:
+            if hasattr(scorer, "kwargs") and scorer.kwargs is not None:
+                if scorer.kwargs.get("enable_param_checking") is True:
+                    if not tools:
+                        raise ValueError(f"You must provide the 'tools' argument to assert_test when using a scorer with enable_param_checking=True. If you do not want to do param checking, explicitly set enable_param_checking=False for the {scorer.__name__} scorer.")
+
         # Validate that exactly one of examples or test_file is provided
         if (examples is None and test_file is None) or (examples is not None and test_file is not None):
             raise ValueError("Exactly one of 'examples' or 'test_file' must be provided, but not both")
@@ -529,7 +542,8 @@ class JudgmentClient(metaclass=SingletonMeta):
                 rules=rules,
                 function=function,
                 tracer=tracer,
-                test_file=test_file
+                test_file=test_file,
+                tools=tools
             )
         else:
             results = self.run_evaluation(
@@ -542,7 +556,14 @@ class JudgmentClient(metaclass=SingletonMeta):
                 project_name=project_name,
                 eval_run_name=eval_run_name,
                 override=override,
-                rules=rules
+                rules=rules,
+                async_execution=async_execution
             )

-        assert_test(results)
+        if async_execution:
+            # 'results' is an asyncio.Task here, awaiting it gives List[ScoringResult]
+            actual_results = asyncio.run(results)
+            assert_test(actual_results) # Call the synchronous imported function
+        else:
+            # 'results' is already List[ScoringResult] here (synchronous path)
+            assert_test(results) # Call the synchronous imported function
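Taken together, `assert_test` now accepts `tools` (required whenever a scorer is configured with `enable_param_checking=True`, per the check above) and an `async_execution` flag, in which case `run_evaluation` returns an `asyncio.Task` that `assert_test` resolves with `asyncio.run`. A hedged usage sketch; the scorer choice, example values, and import paths are assumptions, and running it requires valid Judgment API credentials:

from judgeval import JudgmentClient          # assumed public import path
from judgeval.data import Example
from judgeval.scorers import FaithfulnessScorer   # placeholder for whichever scorer you run

client = JudgmentClient()

example = Example(
    input="What's the weather in Paris?",
    actual_output="It is 20 degrees and sunny.",
    retrieval_context=["Paris forecast: 20C, clear skies."],
    tools_called=["get_weather"],
)

client.assert_test(
    examples=[example],
    scorers=[FaithfulnessScorer(threshold=0.5)],
    project_name="demo-project",
    eval_run_name="demo-run",
    # Required whenever a scorer sets enable_param_checking=True; the dict shape is illustrative.
    tools=[{"tool_name": "get_weather", "parameters": {"city": "Paris"}}],
    async_execution=True,  # run_evaluation returns an asyncio.Task; assert_test resolves it
)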