judgeval 0.0.39__py3-none-any.whl → 0.0.40__py3-none-any.whl
This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
- judgeval/clients.py +6 -4
- judgeval/common/tracer.py +361 -236
- judgeval/constants.py +2 -0
- judgeval/data/__init__.py +2 -1
- judgeval/data/example.py +7 -7
- judgeval/data/tool.py +29 -1
- judgeval/data/trace.py +26 -38
- judgeval/data/trace_run.py +2 -1
- judgeval/evaluation_run.py +4 -7
- judgeval/judgment_client.py +25 -6
- judgeval/run_evaluation.py +50 -16
- judgeval/scorers/__init__.py +4 -1
- judgeval/scorers/judgeval_scorer.py +8 -0
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +4 -0
- judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +124 -0
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +20 -0
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +1 -1
- judgeval/scorers/prompt_scorer.py +5 -164
- judgeval/scorers/score.py +15 -15
- judgeval-0.0.40.dist-info/METADATA +1441 -0
- {judgeval-0.0.39.dist-info → judgeval-0.0.40.dist-info}/RECORD +23 -21
- judgeval-0.0.39.dist-info/METADATA +0 -247
- {judgeval-0.0.39.dist-info → judgeval-0.0.40.dist-info}/WHEEL +0 -0
- {judgeval-0.0.39.dist-info → judgeval-0.0.40.dist-info}/licenses/LICENSE.md +0 -0
judgeval/constants.py
CHANGED
@@ -28,6 +28,8 @@ class APIScorer(str, Enum):
     GROUNDEDNESS = "groundedness"
     DERAILMENT = "derailment"
     TOOL_ORDER = "tool_order"
+    CLASSIFIER = "classifier"
+    TOOL_DEPENDENCY = "tool_dependency"
     @classmethod
     def _missing_(cls, value):
         # Handle case-insensitive lookup
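Usage sketch of the two new enum members (not part of the diff itself); the last assertion assumes the existing `_missing_` hook lowercases unknown values, which the diff only hints at via its comment.

    from judgeval.constants import APIScorer

    assert APIScorer.CLASSIFIER.value == "classifier"
    assert APIScorer.TOOL_DEPENDENCY.value == "tool_dependency"
    # Case-insensitive lookup via _missing_ (assumed behavior):
    assert APIScorer("Tool_Dependency") is APIScorer.TOOL_DEPENDENCY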
judgeval/data/__init__.py
CHANGED
@@ -2,7 +2,7 @@ from judgeval.data.example import Example, ExampleParams
 from judgeval.data.custom_example import CustomExample
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
-from judgeval.data.trace import Trace, TraceSpan
+from judgeval.data.trace import Trace, TraceSpan, TraceUsage


 __all__ = [
@@ -15,4 +15,5 @@ __all__ = [
     "generate_scoring_result",
     "Trace",
     "TraceSpan",
+    "TraceUsage"
 ]
judgeval/data/example.py
CHANGED
@@ -36,15 +36,15 @@ class Example(BaseModel):
     name: Optional[str] = None
     example_id: str = Field(default_factory=lambda: str(uuid4()))
     example_index: Optional[int] = None
-    timestamp: Optional[str] = None
+    created_at: Optional[str] = None
     trace_id: Optional[str] = None

     def __init__(self, **data):
         if 'example_id' not in data:
             data['example_id'] = str(uuid4())
         # Set timestamp if not provided
-        if 'timestamp' not in data:
-            data['timestamp'] = datetime.now().isoformat()
+        if 'created_at' not in data:
+            data['created_at'] = datetime.now().isoformat()
         super().__init__(**data)

     @field_validator('input', mode='before')
@@ -123,9 +123,9 @@ class Example(BaseModel):
             raise ValueError(f"Example index must be an integer or None but got {v} of type {type(v)}")
         return v

-    @field_validator('timestamp', mode='before')
+    @field_validator('created_at', mode='before')
     @classmethod
-    def validate_timestamp(cls, v):
+    def validate_created_at(cls, v):
         if v is not None and not isinstance(v, str):
             raise ValueError(f"Timestamp must be a string or None but got {v} of type {type(v)}")
         return v
@@ -150,7 +150,7 @@ class Example(BaseModel):
             "name": self.name,
             "example_id": self.example_id,
             "example_index": self.example_index,
-            "timestamp": self.timestamp,
+            "created_at": self.created_at,
         }

     def __str__(self):
@@ -166,5 +166,5 @@ class Example(BaseModel):
             f"name={self.name}, "
             f"example_id={self.example_id}, "
             f"example_index={self.example_index}, "
-            f"timestamp={self.timestamp}, "
+            f"created_at={self.created_at}, "
         )
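A minimal sketch of the renamed field (illustrative, not from the diff); it assumes `Example` still accepts `input` and `actual_output` as plain strings, as the validators in this file suggest.

    from judgeval.data import Example

    ex = Example(input="What is the capital of France?", actual_output="Paris")
    print(ex.created_at)  # ISO-8601 string auto-filled in __init__ when not provided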
judgeval/data/tool.py
CHANGED
@@ -1,10 +1,14 @@
 from pydantic import BaseModel, field_validator
-from typing import Dict, Any, Optional
+from typing import Dict, Any, Optional, List
 import warnings

 class Tool(BaseModel):
     tool_name: str
     parameters: Optional[Dict[str, Any]] = None
+    agent_name: Optional[str] = None
+    result_dependencies: Optional[List[Dict[str, Any]]] = None
+    action_dependencies: Optional[List[Dict[str, Any]]] = None
+    require_all: Optional[bool] = None

     @field_validator('tool_name')
     def validate_tool_name(cls, v):
@@ -16,4 +20,28 @@ class Tool(BaseModel):
     def validate_parameters(cls, v):
         if v is not None and not isinstance(v, dict):
             warnings.warn(f"Parameters should be a dictionary, got {type(v)}", UserWarning)
+        return v
+
+    @field_validator('agent_name')
+    def validate_agent_name(cls, v):
+        if v is not None and not isinstance(v, str):
+            warnings.warn(f"Agent name should be a string, got {type(v)}", UserWarning)
+        return v
+
+    @field_validator('result_dependencies')
+    def validate_result_dependencies(cls, v):
+        if v is not None and not isinstance(v, list):
+            warnings.warn(f"Result dependencies should be a list, got {type(v)}", UserWarning)
+        return v
+
+    @field_validator('action_dependencies')
+    def validate_action_dependencies(cls, v):
+        if v is not None and not isinstance(v, list):
+            warnings.warn(f"Action dependencies should be a list, got {type(v)}", UserWarning)
+        return v
+
+    @field_validator('require_all')
+    def validate_require_all(cls, v):
+        if v is not None and not isinstance(v, bool):
+            warnings.warn(f"Require all should be a boolean, got {type(v)}", UserWarning)
         return v
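A minimal construction sketch of the expanded Tool model (not from the diff); the field names come from the diff above, but the dependency dict shapes and values are illustrative only since the diff does not specify them.

    from judgeval.data.tool import Tool

    tool = Tool(
        tool_name="search_flights",
        parameters={"origin": "SFO", "destination": "JFK"},
        agent_name="travel_agent",                            # new in 0.0.40
        result_dependencies=[{"tool_name": "get_airports"}],  # new; dict shape is illustrative
        action_dependencies=[],                               # new
        require_all=True,                                     # new
    )
    print(tool.model_dump())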
judgeval/data/trace.py
CHANGED
@@ -5,36 +5,52 @@ from judgeval.data.tool import Tool
 import json
 from datetime import datetime, timezone

+class TraceUsage(BaseModel):
+    prompt_tokens: Optional[int] = None
+    completion_tokens: Optional[int] = None
+    total_tokens: Optional[int] = None
+    prompt_tokens_cost_usd: Optional[float] = None
+    completion_tokens_cost_usd: Optional[float] = None
+    total_cost_usd: Optional[float] = None
+    model_name: Optional[str] = None
+
 class TraceSpan(BaseModel):
     span_id: str
     trace_id: str
-    function: …
+    function: str
     depth: int
     created_at: Optional[Any] = None
     parent_span_id: Optional[str] = None
     span_type: Optional[str] = "span"
     inputs: Optional[Dict[str, Any]] = None
+    error: Optional[Dict[str, Any]] = None
     output: Optional[Any] = None
+    usage: Optional[TraceUsage] = None
     duration: Optional[float] = None
     annotation: Optional[List[Dict[str, Any]]] = None
     evaluation_runs: Optional[List[EvaluationRun]] = []
     expected_tools: Optional[List[Tool]] = None
     additional_metadata: Optional[Dict[str, Any]] = None
+    has_evaluation: Optional[bool] = False
+    agent_name: Optional[str] = None

     def model_dump(self, **kwargs):
         return {
             "span_id": self.span_id,
             "trace_id": self.trace_id,
             "depth": self.depth,
-            # "created_at": datetime.fromtimestamp(self.created_at).isoformat(),
             "created_at": datetime.fromtimestamp(self.created_at, tz=timezone.utc).isoformat(),
-            "inputs": self.…
-            "output": self.…
+            "inputs": self._serialize_value(self.inputs),
+            "output": self._serialize_value(self.output),
+            "error": self._serialize_value(self.error),
             "evaluation_runs": [run.model_dump() for run in self.evaluation_runs] if self.evaluation_runs else [],
             "parent_span_id": self.parent_span_id,
             "function": self.function,
             "duration": self.duration,
-            "span_type": self.span_type
+            "span_type": self.span_type,
+            "usage": self.usage.model_dump() if self.usage else None,
+            "has_evaluation": self.has_evaluation,
+            "agent_name": self.agent_name
         }

     def print_span(self):
@@ -42,30 +58,6 @@ class TraceSpan(BaseModel):
         indent = " " * self.depth
         parent_info = f" (parent_id: {self.parent_span_id})" if self.parent_span_id else ""
         print(f"{indent}→ {self.function} (id: {self.span_id}){parent_info}")
-
-    def _serialize_inputs(self) -> dict:
-        """Helper method to serialize input data safely."""
-        if self.inputs is None:
-            return {}
-
-        serialized_inputs = {}
-        for key, value in self.inputs.items():
-            if isinstance(value, BaseModel):
-                serialized_inputs[key] = value.model_dump()
-            elif isinstance(value, (list, tuple)):
-                # Handle lists/tuples of arguments
-                serialized_inputs[key] = [
-                    item.model_dump() if isinstance(item, BaseModel)
-                    else None if not self._is_json_serializable(item)
-                    else item
-                    for item in value
-                ]
-            else:
-                if self._is_json_serializable(value):
-                    serialized_inputs[key] = value
-                else:
-                    serialized_inputs[key] = self.safe_stringify(value, self.function)
-        return serialized_inputs

     def _is_json_serializable(self, obj: Any) -> bool:
         """Helper method to check if an object is JSON serializable."""
@@ -88,15 +80,11 @@ class TraceSpan(BaseModel):
             return repr(output)
         except (TypeError, OverflowError, ValueError):
             pass
-
-        warnings.warn(
-            f"Output for function {function_name} is not JSON serializable and could not be converted to string. Setting to None."
-        )
         return None

-    def …
-        """Helper method to serialize …
-        if …
+    def _serialize_value(self, value: Any) -> Any:
+        """Helper method to deep serialize a value safely supporting Pydantic Models / regular PyObjects."""
+        if value is None:
             return None

         def serialize_value(value):
@@ -117,8 +105,8 @@ class TraceSpan(BaseModel):
             # Fallback to safe stringification
             return self.safe_stringify(value, self.function)

-        # Start serialization with the top-level …
-        return serialize_value(…
+        # Start serialization with the top-level value
+        return serialize_value(value)

 class Trace(BaseModel):
     trace_id: str
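A minimal sketch (not from the diff) of attaching the new TraceUsage to a span; all values are illustrative, and it assumes `created_at` is a Unix timestamp, which is what `model_dump()` implies via `datetime.fromtimestamp(...)`.

    import time
    from judgeval.data import TraceSpan, TraceUsage

    usage = TraceUsage(
        prompt_tokens=120,
        completion_tokens=48,
        total_tokens=168,
        prompt_tokens_cost_usd=0.00012,
        completion_tokens_cost_usd=0.00010,
        total_cost_usd=0.00022,
        model_name="gpt-4.1",
    )
    span = TraceSpan(
        span_id="span-1",
        trace_id="trace-1",
        function="call_llm",
        depth=0,
        created_at=time.time(),   # converted to UTC ISO-8601 in model_dump()
        usage=usage,              # new field
        agent_name="travel_agent" # new field
    )
    print(span.model_dump()["usage"])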
judgeval/data/trace_run.py
CHANGED
@@ -1,4 +1,3 @@
-
 from pydantic import BaseModel
 from typing import List, Optional, Dict, Any, Union, Callable
 from judgeval.data import Trace
@@ -22,6 +21,7 @@ class TraceRun(BaseModel):
         judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
         rules (Optional[List[Rule]]): Rules to evaluate against scoring results
         append (Optional[bool]): Whether to append to existing evaluation results
+        tools (Optional[List[Dict[str, Any]]]): List of tools to use for evaluation
     """

     # The user will specify whether they want log_results when they call run_eval
@@ -40,6 +40,7 @@ class TraceRun(BaseModel):
     judgment_api_key: Optional[str] = ""
     override: Optional[bool] = False
     rules: Optional[List[Rule]] = None
+    tools: Optional[List[Dict[str, Any]]] = None

     class Config:
         arbitrary_types_allowed = True
judgeval/evaluation_run.py
CHANGED
@@ -1,5 +1,5 @@
 from typing import List, Optional, Dict, Any, Union
-from pydantic import BaseModel, field_validator
+from pydantic import BaseModel, field_validator, Field

 from judgeval.data import Example, CustomExample
 from judgeval.scorers import JudgevalScorer, APIJudgmentScorer
@@ -27,12 +27,12 @@ class EvaluationRun(BaseModel):
     # The user will specify whether they want log_results when they call run_eval
     log_results: bool = False # NOTE: log_results has to be set first because it is used to validate project_name and eval_name
     organization_id: Optional[str] = None
-    project_name: Optional[str] = None
-    eval_name: Optional[str] = None
+    project_name: Optional[str] = Field(default=None, validate_default=True)
+    eval_name: Optional[str] = Field(default=None, validate_default=True)
     examples: Union[List[Example], List[CustomExample]]
     scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
     model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1"
-    aggregator: Optional[str] = None
+    aggregator: Optional[str] = Field(default=None, validate_default=True)
     metadata: Optional[Dict[str, Any]] = None
     trace_span_id: Optional[str] = None
     # API Key will be "" until user calls client.run_eval(), then API Key will be set
@@ -96,9 +96,6 @@ class EvaluationRun(BaseModel):
     def validate_scorers(cls, v):
         if not v:
             raise ValueError("Scorers cannot be empty.")
-        for s in v:
-            if not isinstance(s, APIJudgmentScorer) and not isinstance(s, JudgevalScorer):
-                raise ValueError(f"Invalid type for Scorer: {type(s)}")
        return v

     @field_validator('model')
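A generic pydantic v2 sketch of why the switch to Field(default=None, validate_default=True) matters here: without it, a field validator is skipped when the field keeps its default. The model and validator below are illustrative, not judgeval's actual code.

    from typing import Optional
    from pydantic import BaseModel, Field, field_validator

    class Run(BaseModel):
        log_results: bool = False
        project_name: Optional[str] = Field(default=None, validate_default=True)

        @field_validator("project_name")
        @classmethod
        def require_name_when_logging(cls, v, info):
            # log_results is declared first, so it is already available in info.data
            if info.data.get("log_results") and v is None:
                raise ValueError("project_name is required when log_results=True")
            return v

    Run(log_results=True)  # raises, because the default None is now validated too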
judgeval/judgment_client.py
CHANGED
@@ -5,6 +5,7 @@ import os
 from uuid import uuid4
 from typing import Optional, List, Dict, Any, Union, Callable
 import requests
+import asyncio

 from judgeval.constants import ROOT_API
 from judgeval.data.datasets import EvalDataset, EvalDatasetClient
@@ -121,7 +122,8 @@ class JudgmentClient(metaclass=SingletonMeta):
         ignore_errors: bool = True,
         rules: Optional[List[Rule]] = None,
         function: Optional[Callable] = None,
-        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None
+        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
+        tools: Optional[List[Dict[str, Any]]] = None
     ) -> List[ScoringResult]:
         try:

@@ -151,6 +153,7 @@ class JudgmentClient(metaclass=SingletonMeta):
                 append=append,
                 judgment_api_key=self.judgment_api_key,
                 organization_id=self.organization_id,
+                tools=tools
             )
             return run_trace_eval(trace_run, override, ignore_errors, function, tracer, examples)
         except ValueError as e:
@@ -173,7 +176,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         ignore_errors: bool = True,
         async_execution: bool = False,
         rules: Optional[List[Rule]] = None
-    ) -> List[ScoringResult]:
+    ) -> Union[List[ScoringResult], asyncio.Task]:
         """
         Executes an evaluation of `Example`s using one or more `Scorer`s

@@ -480,7 +483,7 @@ class JudgmentClient(metaclass=SingletonMeta):

         return response.json()["slug"]

-    …
+    def assert_test(
         self,
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
         examples: Optional[List[Example]] = None,
@@ -495,6 +498,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         rules: Optional[List[Rule]] = None,
         function: Optional[Callable] = None,
         tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
+        tools: Optional[List[Dict[str, Any]]] = None,
         async_execution: bool = False
     ) -> None:
         """
@@ -513,6 +517,14 @@ class JudgmentClient(metaclass=SingletonMeta):
            override (bool): Whether to override an existing evaluation run with the same name
            rules (Optional[List[Rule]]): Rules to evaluate against scoring results
        """
+
+        # Check for enable_param_checking and tools
+        for scorer in scorers:
+            if hasattr(scorer, "kwargs") and scorer.kwargs is not None:
+                if scorer.kwargs.get("enable_param_checking") is True:
+                    if not tools:
+                        raise ValueError(f"You must provide the 'tools' argument to assert_test when using a scorer with enable_param_checking=True. If you do not want to do param checking, explicitly set enable_param_checking=False for the {scorer.__name__} scorer.")
+
        # Validate that exactly one of examples or test_file is provided
        if (examples is None and test_file is None) or (examples is not None and test_file is not None):
            raise ValueError("Exactly one of 'examples' or 'test_file' must be provided, but not both")
@@ -530,10 +542,11 @@ class JudgmentClient(metaclass=SingletonMeta):
                rules=rules,
                function=function,
                tracer=tracer,
-                test_file=test_file
+                test_file=test_file,
+                tools=tools
            )
        else:
-            results = …
+            results = self.run_evaluation(
                examples=examples,
                scorers=scorers,
                model=model,
@@ -547,4 +560,10 @@ class JudgmentClient(metaclass=SingletonMeta):
                async_execution=async_execution
            )

-        …
+        if async_execution:
+            # 'results' is an asyncio.Task here, awaiting it gives List[ScoringResult]
+            actual_results = asyncio.run(results)
+            assert_test(actual_results) # Call the synchronous imported function
+        else:
+            # 'results' is already List[ScoringResult] here (synchronous path)
+            assert_test(results) # Call the synchronous imported function
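A heavily hedged sketch of the new `tools` argument, which assert_test now requires when a scorer sets enable_param_checking=True. The `JudgmentClient` import path, the ToolDependencyScorer constructor, and the Example fields are assumptions; only the assert_test parameters themselves come from the diff.

    from judgeval import JudgmentClient              # assumed import path
    from judgeval.data import Example
    from judgeval.data.tool import Tool
    from judgeval.scorers import ToolDependencyScorer  # constructor signature not shown in the diff

    client = JudgmentClient()
    client.assert_test(
        scorers=[ToolDependencyScorer()],                            # assumed default constructor
        examples=[Example(input="Book me a flight to NYC")],
        tools=[Tool(tool_name="search_flights").model_dump()],       # tools is List[Dict[str, Any]]
    )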
judgeval/run_evaluation.py
CHANGED
@@ -204,9 +204,9 @@ def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResul
     )
     return results

-def check_experiment_type(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str, …
+def check_experiment_type(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str, is_trace: bool) -> None:
     """
-    Checks if the current experiment, if one exists, has the same type (examples of …
+    Checks if the current experiment, if one exists, has the same type (examples of traces)
     """
     try:
         response = requests.post(
@@ -220,7 +220,7 @@ def check_experiment_type(eval_name: str, project_name: str, judgment_api_key: s
                 "eval_name": eval_name,
                 "project_name": project_name,
                 "judgment_api_key": judgment_api_key,
-                "…
+                "is_trace": is_trace
             },
             verify=True
         )
@@ -382,7 +382,7 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
     )

     if trace_run.append:
-        # Check that the current experiment, if one exists, has the same type (examples …
+        # Check that the current experiment, if one exists, has the same type (examples or traces)
         check_experiment_type(
             trace_run.eval_name,
             trace_run.project_name,
@@ -390,13 +390,18 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
             trace_run.organization_id,
             True
         )
-
     if function and tracer:
         new_traces: List[Trace] = []
         tracer.offline_mode = True
+        tracer.traces = []
         for example in examples:
             if example.input:
-                …
+                if isinstance(example.input, str):
+                    result = run_with_spinner("Running agent function: ", function, example.input)
+                elif isinstance(example.input, dict):
+                    result = run_with_spinner("Running agent function: ", function, **example.input)
+                else:
+                    raise ValueError(f"Input must be string or dict, got {type(example.input)}")
             else:
                 result = run_with_spinner("Running agent function: ", function)
         for i, trace in enumerate(tracer.traces):
@@ -405,6 +410,7 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
             trace.entries[0].expected_tools = examples[i].expected_tools
             new_traces.append(trace)
         trace_run.traces = new_traces
+        tracer.traces = []

     # Execute evaluation using Judgment API
     info("Starting API evaluation")
@@ -423,7 +429,7 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
     debug("Processing API results")
     # TODO: allow for custom scorer on traces
     if trace_run.log_results:
-        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["…
+        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["agent_results"], trace_run)
         rprint(pretty_str)

     return scoring_results
@@ -504,7 +510,8 @@ async def _poll_evaluation_until_complete(eval_name: str, project_name: str, jud
            info(f"Polling for evaluation '{eval_name}' in project '{project_name}' (attempt {poll_count})")

            # Check status
-            response = …
+            response = await asyncio.to_thread(
+                requests.get,
                JUDGMENT_GET_EVAL_STATUS_API_URL,
                headers={
                    "Content-Type": "application/json",
@@ -531,7 +538,8 @@ async def _poll_evaluation_until_complete(eval_name: str, project_name: str, jud
            # If complete, get results and return
            if status == "completed" or status == "complete":
                info(f"Evaluation '{eval_name}' reported as completed, fetching and verifying results...")
-                results_response = …
+                results_response = await asyncio.to_thread(
+                    requests.post,
                    JUDGMENT_EVAL_FETCH_API_URL,
                    headers={
                        "Content-Type": "application/json",
@@ -723,7 +731,18 @@ class SpinnerWrappedTask:

    def __await__(self):
        async def _spin_and_await():
-            …
+            # self.task resolves to (scoring_results, pretty_str_to_print)
+            task_result_tuple = await await_with_spinner(self.task, self.message)
+
+            # Unpack the tuple
+            scoring_results, pretty_str_to_print = task_result_tuple
+
+            # Print the pretty string if it exists, after spinner is cleared
+            if pretty_str_to_print:
+                rprint(pretty_str_to_print)
+
+            # Return only the scoring_results to the original awaiter
+            return scoring_results
        return _spin_and_await().__await__()

    # Proxy all Task attributes and methods to the underlying task
@@ -756,7 +775,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
    )

    if evaluation_run.append:
-        # Check that the current experiment, if one exists, has the same type (examples of …
+        # Check that the current experiment, if one exists, has the same type (examples of traces)
        check_experiment_type(
            evaluation_run.eval_name,
            evaluation_run.project_name,
@@ -769,8 +788,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
    debug("Initializing examples with IDs and timestamps")
    for idx, example in enumerate(evaluation_run.examples):
        example.example_index = idx # Set numeric index
-        example.…
-        with example_logging_context(example.timestamp, example.example_id):
+        with example_logging_context(example.created_at, example.example_id):
            debug(f"Initialized example {example.example_id} (index: {example.example_index})")
            debug(f"Input: {example.input}")
            debug(f"Actual output: {example.actual_output}")
@@ -824,7 +842,8 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
        payload = evaluation_run.model_dump(warnings=False)

        # Send the evaluation to the queue
-        response = …
+        response = await asyncio.to_thread(
+            requests.post,
            JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
            headers={
                "Content-Type": "application/json",
@@ -843,13 +862,28 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
        info(f"Successfully added evaluation '{evaluation_run.eval_name}' to queue")

        # Poll until the evaluation is complete
-        …
+        results = await _poll_evaluation_until_complete(
            eval_name=evaluation_run.eval_name,
            project_name=evaluation_run.project_name,
            judgment_api_key=evaluation_run.judgment_api_key,
            organization_id=evaluation_run.organization_id,
            original_examples=evaluation_run.examples # Pass the original examples
        )
+
+        pretty_str_to_print = None
+        if evaluation_run.log_results and results: # Ensure results exist before logging
+            send_results = [scoring_result.model_dump(warnings=False) for scoring_result in results]
+            try:
+                # Run the blocking log_evaluation_results in a separate thread
+                pretty_str_to_print = await asyncio.to_thread(
+                    log_evaluation_results,
+                    send_results,
+                    evaluation_run
+                )
+            except Exception as e:
+                error(f"Error logging results after async evaluation: {str(e)}")
+
+        return results, pretty_str_to_print

        # Create a regular task
        task = asyncio.create_task(_async_evaluation_workflow())
@@ -895,7 +929,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
    # We should be removing local scorers soon
    info("Starting local evaluation")
    for example in evaluation_run.examples:
-        with example_logging_context(example.…
+        with example_logging_context(example.created_at, example.example_id):
            debug(f"Processing example {example.example_id}: {example.input}")

            results: List[ScoringResult] = asyncio.run(
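A generic sketch of the pattern this file adopts for status polling and queueing: running a blocking `requests` call on a worker thread via `asyncio.to_thread` so the coroutine does not block the event loop. The URL and headers below are placeholders, not judgeval endpoints.

    import asyncio
    import requests

    async def poll_status(url: str, api_key: str) -> dict:
        # requests.get runs in a thread; the event loop stays free while it blocks
        response = await asyncio.to_thread(
            requests.get,
            url,
            headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
        )
        response.raise_for_status()
        return response.json()

    # asyncio.run(poll_status("https://example.com/status", "placeholder-key"))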
judgeval/scorers/__init__.py
CHANGED
@@ -1,6 +1,6 @@
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.scorers.judgeval_scorer import JudgevalScorer
-from judgeval.scorers.prompt_scorer import PromptScorer
+from judgeval.scorers.prompt_scorer import PromptScorer
 from judgeval.scorers.judgeval_scorers.api_scorers import (
     ExecutionOrderScorer,
     JSONCorrectnessScorer,
@@ -17,6 +17,8 @@ from judgeval.scorers.judgeval_scorers.api_scorers import (
     GroundednessScorer,
     DerailmentScorer,
     ToolOrderScorer,
+    ClassifierScorer,
+    ToolDependencyScorer,
 )
 from judgeval.scorers.judgeval_scorers.classifiers import (
     Text2SQLScorer,
@@ -43,4 +45,5 @@ __all__ = [
     "GroundednessScorer",
     "DerailmentScorer",
     "ToolOrderScorer",
+    "ToolDependencyScorer",
 ]
judgeval/scorers/judgeval_scorer.py
CHANGED
@@ -39,6 +39,8 @@ class JudgevalScorer:
     evaluation_cost: Optional[float] = None # The cost of running the scorer
     verbose_logs: Optional[str] = None # The verbose logs of the scorer
     additional_metadata: Optional[Dict] = None # Additional metadata for the scorer
+    error: Optional[str] = None
+    success: Optional[bool] = None

     def __init__(
         self,
@@ -145,3 +147,9 @@ class JudgevalScorer:
             "additional_metadata": self.additional_metadata,
         }
         return f"JudgevalScorer({attributes})"
+
+    def to_dict(self):
+        return {
+            "score_type": str(self.score_type), # Convert enum to string for serialization
+            "threshold": self.threshold
+        }
judgeval/scorers/judgeval_scorers/api_scorers/__init__.py
CHANGED
@@ -13,6 +13,8 @@ from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import
 from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import GroundednessScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.derailment_scorer import DerailmentScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.tool_order import ToolOrderScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.classifier_scorer import ClassifierScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.tool_dependency import ToolDependencyScorer
 __all__ = [
     "ExecutionOrderScorer",
     "JSONCorrectnessScorer",
@@ -29,4 +31,6 @@ __all__ = [
     "GroundednessScorer",
     "DerailmentScorer",
     "ToolOrderScorer",
+    "ClassifierScorer",
+    "ToolDependencyScorer",
 ]