judgeval 0.0.38__py3-none-any.whl → 0.0.40__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/clients.py +6 -4
- judgeval/common/tracer.py +361 -236
- judgeval/constants.py +3 -0
- judgeval/data/__init__.py +2 -1
- judgeval/data/example.py +14 -13
- judgeval/data/tool.py +47 -0
- judgeval/data/trace.py +28 -39
- judgeval/data/trace_run.py +2 -1
- judgeval/evaluation_run.py +4 -7
- judgeval/judgment_client.py +27 -6
- judgeval/run_evaluation.py +395 -37
- judgeval/scorers/__init__.py +4 -1
- judgeval/scorers/judgeval_scorer.py +8 -0
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +4 -0
- judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +124 -0
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +20 -0
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +1 -1
- judgeval/scorers/prompt_scorer.py +5 -164
- judgeval/scorers/score.py +15 -15
- judgeval-0.0.40.dist-info/METADATA +1441 -0
- {judgeval-0.0.38.dist-info → judgeval-0.0.40.dist-info}/RECORD +23 -20
- judgeval-0.0.38.dist-info/METADATA +0 -247
- {judgeval-0.0.38.dist-info → judgeval-0.0.40.dist-info}/WHEEL +0 -0
- {judgeval-0.0.38.dist-info → judgeval-0.0.40.dist-info}/licenses/LICENSE.md +0 -0
judgeval/constants.py
CHANGED
@@ -28,6 +28,8 @@ class APIScorer(str, Enum):
     GROUNDEDNESS = "groundedness"
     DERAILMENT = "derailment"
     TOOL_ORDER = "tool_order"
+    CLASSIFIER = "classifier"
+    TOOL_DEPENDENCY = "tool_dependency"
     @classmethod
     def _missing_(cls, value):
         # Handle case-insensitive lookup
@@ -59,6 +61,7 @@ JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
 JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
 JUDGMENT_TRACES_ADD_ANNOTATION_API_URL = f"{ROOT_API}/traces/add_annotation/"
 JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
+JUDGMENT_GET_EVAL_STATUS_API_URL = f"{ROOT_API}/get_evaluation_status/"
 # RabbitMQ
 RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
 RABBITMQ_PORT = os.getenv("RABBITMQ_PORT", 5672)
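The two new APIScorer members and the evaluation-status endpoint are plain additions, so they can be exercised directly. A minimal sketch, assuming only the names shown in this diff; the case-insensitive lookup relies on the `_missing_` hook noted above:

    from judgeval.constants import APIScorer, JUDGMENT_GET_EVAL_STATUS_API_URL

    # Members added in 0.0.40
    print(APIScorer.CLASSIFIER.value)        # "classifier"
    print(APIScorer.TOOL_DEPENDENCY.value)   # "tool_dependency"

    # _missing_ handles case-insensitive lookup, so an upper-case value
    # should still resolve to the same member
    print(APIScorer("TOOL_DEPENDENCY"))

    # New endpoint for polling evaluation status (derived from ROOT_API)
    print(JUDGMENT_GET_EVAL_STATUS_API_URL)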
judgeval/data/__init__.py
CHANGED
@@ -2,7 +2,7 @@ from judgeval.data.example import Example, ExampleParams
 from judgeval.data.custom_example import CustomExample
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
-from judgeval.data.trace import Trace, TraceSpan
+from judgeval.data.trace import Trace, TraceSpan, TraceUsage


 __all__ = [
@@ -15,4 +15,5 @@ __all__ = [
     "generate_scoring_result",
     "Trace",
     "TraceSpan",
+    "TraceUsage"
 ]
judgeval/data/example.py
CHANGED
@@ -8,6 +8,7 @@ from uuid import uuid4
 from pydantic import BaseModel, Field, field_validator
 from enum import Enum
 from datetime import datetime
+from judgeval.data.tool import Tool
 import time


@@ -31,19 +32,19 @@ class Example(BaseModel):
     retrieval_context: Optional[List[str]] = None
     additional_metadata: Optional[Dict[str, Any]] = None
     tools_called: Optional[List[str]] = None
-    expected_tools: Optional[List[…
+    expected_tools: Optional[List[Tool]] = None
     name: Optional[str] = None
     example_id: str = Field(default_factory=lambda: str(uuid4()))
     example_index: Optional[int] = None
-
+    created_at: Optional[str] = None
     trace_id: Optional[str] = None

     def __init__(self, **data):
         if 'example_id' not in data:
             data['example_id'] = str(uuid4())
         # Set timestamp if not provided
-        if '…
-            data['…
+        if 'created_at' not in data:
+            data['created_at'] = datetime.now().isoformat()
         super().__init__(**data)

     @field_validator('input', mode='before')
@@ -82,17 +83,17 @@ class Example(BaseModel):
                 raise ValueError(f"All items in expected_output must be strings but got {v}")
         return v

-    @field_validator('expected_tools'…
+    @field_validator('expected_tools')
     @classmethod
     def validate_expected_tools(cls, v):
         if v is not None:
             if not isinstance(v, list):
-                raise ValueError(f"Expected tools must be a list of …
+                raise ValueError(f"Expected tools must be a list of Tools or None but got {v} of type {type(v)}")

-            # Check that each item in the list is a …
+            # Check that each item in the list is a Tool
             for i, item in enumerate(v):
-                if not isinstance(item, …
-                    raise ValueError(f"Expected tools must be a list of …
+                if not isinstance(item, Tool):
+                    raise ValueError(f"Expected tools must be a list of Tools, but item at index {i} is {item} of type {type(item)}")

         return v

@@ -122,9 +123,9 @@ class Example(BaseModel):
             raise ValueError(f"Example index must be an integer or None but got {v} of type {type(v)}")
         return v

-    @field_validator('…
+    @field_validator('created_at', mode='before')
     @classmethod
-    def …
+    def validate_created_at(cls, v):
         if v is not None and not isinstance(v, str):
             raise ValueError(f"Timestamp must be a string or None but got {v} of type {type(v)}")
         return v
@@ -149,7 +150,7 @@ class Example(BaseModel):
             "name": self.name,
             "example_id": self.example_id,
             "example_index": self.example_index,
-            "…
+            "created_at": self.created_at,
         }

     def __str__(self):
@@ -165,5 +166,5 @@ class Example(BaseModel):
             f"name={self.name}, "
             f"example_id={self.example_id}, "
             f"example_index={self.example_index}, "
-            f"…
+            f"created_at={self.created_at}, "
         )
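Example now types expected_tools as List[Tool] and stores its creation time in created_at, filled with an ISO timestamp by __init__ when not supplied. A minimal sketch limited to fields visible in this diff; other constructor fields are assumed optional:

    from judgeval.data.example import Example
    from judgeval.data.tool import Tool

    example = Example(
        input="What is the weather in SF?",
        tools_called=["get_weather"],
        expected_tools=[Tool(tool_name="get_weather", parameters={"city": "SF"})],
    )
    print(example.created_at)   # ISO-8601 string set by __init__ when omitted

    # Plain strings are no longer accepted for expected_tools:
    # Example(expected_tools=["get_weather"])  -> ValueError from validate_expected_tools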
judgeval/data/tool.py
ADDED
@@ -0,0 +1,47 @@
+from pydantic import BaseModel, field_validator
+from typing import Dict, Any, Optional, List
+import warnings
+
+class Tool(BaseModel):
+    tool_name: str
+    parameters: Optional[Dict[str, Any]] = None
+    agent_name: Optional[str] = None
+    result_dependencies: Optional[List[Dict[str, Any]]] = None
+    action_dependencies: Optional[List[Dict[str, Any]]] = None
+    require_all: Optional[bool] = None
+
+    @field_validator('tool_name')
+    def validate_tool_name(cls, v):
+        if not v:
+            warnings.warn("Tool name is empty or None", UserWarning)
+        return v
+
+    @field_validator('parameters')
+    def validate_parameters(cls, v):
+        if v is not None and not isinstance(v, dict):
+            warnings.warn(f"Parameters should be a dictionary, got {type(v)}", UserWarning)
+        return v
+
+    @field_validator('agent_name')
+    def validate_agent_name(cls, v):
+        if v is not None and not isinstance(v, str):
+            warnings.warn(f"Agent name should be a string, got {type(v)}", UserWarning)
+        return v
+
+    @field_validator('result_dependencies')
+    def validate_result_dependencies(cls, v):
+        if v is not None and not isinstance(v, list):
+            warnings.warn(f"Result dependencies should be a list, got {type(v)}", UserWarning)
+        return v
+
+    @field_validator('action_dependencies')
+    def validate_action_dependencies(cls, v):
+        if v is not None and not isinstance(v, list):
+            warnings.warn(f"Action dependencies should be a list, got {type(v)}", UserWarning)
+        return v
+
+    @field_validator('require_all')
+    def validate_require_all(cls, v):
+        if v is not None and not isinstance(v, bool):
+            warnings.warn(f"Require all should be a boolean, got {type(v)}", UserWarning)
+        return v
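Tool is a light wrapper: its field validators emit UserWarning rather than raising for the cases they check (for example an empty tool_name), while Pydantic's own type validation still applies to the declared annotations. A short usage sketch:

    import warnings
    from judgeval.data.tool import Tool

    tool = Tool(
        tool_name="get_weather",
        parameters={"city": "SF"},
        agent_name="weather_agent",
        require_all=True,
    )
    print(tool.model_dump())

    # An empty name is allowed but warns (see validate_tool_name)
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        Tool(tool_name="")
    print(caught[0].message)  # Tool name is empty or None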
judgeval/data/trace.py
CHANGED
@@ -1,39 +1,56 @@
 from pydantic import BaseModel
 from typing import Optional, Dict, Any, List
 from judgeval.evaluation_run import EvaluationRun
+from judgeval.data.tool import Tool
 import json
 from datetime import datetime, timezone

+class TraceUsage(BaseModel):
+    prompt_tokens: Optional[int] = None
+    completion_tokens: Optional[int] = None
+    total_tokens: Optional[int] = None
+    prompt_tokens_cost_usd: Optional[float] = None
+    completion_tokens_cost_usd: Optional[float] = None
+    total_cost_usd: Optional[float] = None
+    model_name: Optional[str] = None
+
 class TraceSpan(BaseModel):
     span_id: str
     trace_id: str
-    function: …
+    function: str
     depth: int
     created_at: Optional[Any] = None
     parent_span_id: Optional[str] = None
     span_type: Optional[str] = "span"
     inputs: Optional[Dict[str, Any]] = None
+    error: Optional[Dict[str, Any]] = None
     output: Optional[Any] = None
+    usage: Optional[TraceUsage] = None
     duration: Optional[float] = None
     annotation: Optional[List[Dict[str, Any]]] = None
     evaluation_runs: Optional[List[EvaluationRun]] = []
-    expected_tools: Optional[List[…
+    expected_tools: Optional[List[Tool]] = None
     additional_metadata: Optional[Dict[str, Any]] = None
+    has_evaluation: Optional[bool] = False
+    agent_name: Optional[str] = None

     def model_dump(self, **kwargs):
         return {
             "span_id": self.span_id,
             "trace_id": self.trace_id,
             "depth": self.depth,
-            # "created_at": datetime.fromtimestamp(self.created_at).isoformat(),
             "created_at": datetime.fromtimestamp(self.created_at, tz=timezone.utc).isoformat(),
-            "inputs": self.…
-            "output": self.…
+            "inputs": self._serialize_value(self.inputs),
+            "output": self._serialize_value(self.output),
+            "error": self._serialize_value(self.error),
             "evaluation_runs": [run.model_dump() for run in self.evaluation_runs] if self.evaluation_runs else [],
             "parent_span_id": self.parent_span_id,
             "function": self.function,
             "duration": self.duration,
-            "span_type": self.span_type
+            "span_type": self.span_type,
+            "usage": self.usage.model_dump() if self.usage else None,
+            "has_evaluation": self.has_evaluation,
+            "agent_name": self.agent_name
         }

     def print_span(self):
@@ -41,30 +58,6 @@ class TraceSpan(BaseModel):
         indent = " " * self.depth
         parent_info = f" (parent_id: {self.parent_span_id})" if self.parent_span_id else ""
         print(f"{indent}→ {self.function} (id: {self.span_id}){parent_info}")
-
-    def _serialize_inputs(self) -> dict:
-        """Helper method to serialize input data safely."""
-        if self.inputs is None:
-            return {}
-
-        serialized_inputs = {}
-        for key, value in self.inputs.items():
-            if isinstance(value, BaseModel):
-                serialized_inputs[key] = value.model_dump()
-            elif isinstance(value, (list, tuple)):
-                # Handle lists/tuples of arguments
-                serialized_inputs[key] = [
-                    item.model_dump() if isinstance(item, BaseModel)
-                    else None if not self._is_json_serializable(item)
-                    else item
-                    for item in value
-                ]
-            else:
-                if self._is_json_serializable(value):
-                    serialized_inputs[key] = value
-                else:
-                    serialized_inputs[key] = self.safe_stringify(value, self.function)
-        return serialized_inputs

     def _is_json_serializable(self, obj: Any) -> bool:
         """Helper method to check if an object is JSON serializable."""
@@ -87,15 +80,11 @@ class TraceSpan(BaseModel):
             return repr(output)
         except (TypeError, OverflowError, ValueError):
             pass
-
-        warnings.warn(
-            f"Output for function {function_name} is not JSON serializable and could not be converted to string. Setting to None."
-        )
         return None

-    def …
-        """Helper method to serialize …
-        if …
+    def _serialize_value(self, value: Any) -> Any:
+        """Helper method to deep serialize a value safely supporting Pydantic Models / regular PyObjects."""
+        if value is None:
             return None

         def serialize_value(value):
@@ -116,8 +105,8 @@ class TraceSpan(BaseModel):
             # Fallback to safe stringification
             return self.safe_stringify(value, self.function)

-        # Start serialization with the top-level …
-        return serialize_value(…
+        # Start serialization with the top-level value
+        return serialize_value(value)

 class Trace(BaseModel):
     trace_id: str
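TraceSpan now records an optional error payload, an agent_name, a has_evaluation flag, and per-span token/cost accounting through the new TraceUsage model; model_dump() routes inputs, output, and error through the shared _serialize_value helper and nests the usage dump. A sketch using only the fields shown above (created_at is passed as epoch seconds, which model_dump converts to a UTC ISO string):

    from judgeval.data.trace import TraceSpan, TraceUsage

    span = TraceSpan(
        span_id="span-1",
        trace_id="trace-1",
        function="call_llm",
        depth=0,
        created_at=1714000000.0,  # epoch seconds
        usage=TraceUsage(
            prompt_tokens=120,
            completion_tokens=45,
            total_tokens=165,
            total_cost_usd=0.0012,
            model_name="gpt-4.1",
        ),
    )
    dumped = span.model_dump()
    print(dumped["created_at"])             # UTC ISO-8601 string
    print(dumped["usage"]["total_tokens"])  # 165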
judgeval/data/trace_run.py
CHANGED
@@ -1,4 +1,3 @@
-
 from pydantic import BaseModel
 from typing import List, Optional, Dict, Any, Union, Callable
 from judgeval.data import Trace
@@ -22,6 +21,7 @@ class TraceRun(BaseModel):
         judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
         rules (Optional[List[Rule]]): Rules to evaluate against scoring results
         append (Optional[bool]): Whether to append to existing evaluation results
+        tools (Optional[List[Dict[str, Any]]]): List of tools to use for evaluation
     """

     # The user will specify whether they want log_results when they call run_eval
@@ -40,6 +40,7 @@ class TraceRun(BaseModel):
     judgment_api_key: Optional[str] = ""
     override: Optional[bool] = False
     rules: Optional[List[Rule]] = None
+    tools: Optional[List[Dict[str, Any]]] = None

     class Config:
         arbitrary_types_allowed = True
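TraceRun only constrains tools to List[Dict[str, Any]]; the exact dict schema the evaluation service expects is not shown in this diff. One plausible (unverified) way to build that payload is from Tool.model_dump(), which keeps it aligned with the expected_tools used on spans and examples:

    from judgeval.data.tool import Tool

    # Hypothetical payload shape; the diff only requires a list of dicts
    tools = [
        Tool(tool_name="search_flights", parameters={"dest": "SFO"}).model_dump(),
        Tool(tool_name="book_flight", require_all=True).model_dump(),
    ]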
judgeval/evaluation_run.py
CHANGED
@@ -1,5 +1,5 @@
 from typing import List, Optional, Dict, Any, Union
-from pydantic import BaseModel, field_validator
+from pydantic import BaseModel, field_validator, Field

 from judgeval.data import Example, CustomExample
 from judgeval.scorers import JudgevalScorer, APIJudgmentScorer
@@ -27,12 +27,12 @@ class EvaluationRun(BaseModel):
     # The user will specify whether they want log_results when they call run_eval
     log_results: bool = False # NOTE: log_results has to be set first because it is used to validate project_name and eval_name
     organization_id: Optional[str] = None
-    project_name: Optional[str] = None
-    eval_name: Optional[str] = None
+    project_name: Optional[str] = Field(default=None, validate_default=True)
+    eval_name: Optional[str] = Field(default=None, validate_default=True)
     examples: Union[List[Example], List[CustomExample]]
     scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
     model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1"
-    aggregator: Optional[str] = None
+    aggregator: Optional[str] = Field(default=None, validate_default=True)
     metadata: Optional[Dict[str, Any]] = None
     trace_span_id: Optional[str] = None
     # API Key will be "" until user calls client.run_eval(), then API Key will be set
@@ -96,9 +96,6 @@ class EvaluationRun(BaseModel):
     def validate_scorers(cls, v):
         if not v:
             raise ValueError("Scorers cannot be empty.")
-        for s in v:
-            if not isinstance(s, APIJudgmentScorer) and not isinstance(s, JudgevalScorer):
-                raise ValueError(f"Invalid type for Scorer: {type(s)}")
         return v

     @field_validator('model')
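The switch from plain defaults to Field(default=None, validate_default=True) makes Pydantic run the existing field validators even when project_name, eval_name, or aggregator are left unset, which matters because those validators read earlier fields such as log_results (see the NOTE above). The judgeval validators themselves are not shown in this diff; the snippet below is a generic illustration of the validate_default behaviour:

    from typing import Optional
    from pydantic import BaseModel, Field, field_validator

    class Demo(BaseModel):
        log_results: bool = False
        # Without validate_default=True the validator below is skipped when
        # project_name is omitted; with it, the default None is validated too.
        project_name: Optional[str] = Field(default=None, validate_default=True)

        @field_validator("project_name")
        @classmethod
        def check_project_name(cls, v, info):
            if info.data.get("log_results") and v is None:
                raise ValueError("project_name is required when log_results=True")
            return v

    Demo(log_results=False)   # passes
    # Demo(log_results=True)  # raises, even though project_name was never supplied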
judgeval/judgment_client.py
CHANGED
@@ -5,6 +5,7 @@ import os
 from uuid import uuid4
 from typing import Optional, List, Dict, Any, Union, Callable
 import requests
+import asyncio

 from judgeval.constants import ROOT_API
 from judgeval.data.datasets import EvalDataset, EvalDatasetClient
@@ -121,7 +122,8 @@ class JudgmentClient(metaclass=SingletonMeta):
         ignore_errors: bool = True,
         rules: Optional[List[Rule]] = None,
         function: Optional[Callable] = None,
-        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None
+        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
+        tools: Optional[List[Dict[str, Any]]] = None
     ) -> List[ScoringResult]:
         try:

@@ -151,6 +153,7 @@ class JudgmentClient(metaclass=SingletonMeta):
                 append=append,
                 judgment_api_key=self.judgment_api_key,
                 organization_id=self.organization_id,
+                tools=tools
             )
             return run_trace_eval(trace_run, override, ignore_errors, function, tracer, examples)
         except ValueError as e:
@@ -173,7 +176,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         ignore_errors: bool = True,
         async_execution: bool = False,
         rules: Optional[List[Rule]] = None
-    ) -> List[ScoringResult]:
+    ) -> Union[List[ScoringResult], asyncio.Task]:
         """
         Executes an evaluation of `Example`s using one or more `Scorer`s

@@ -494,7 +497,9 @@ class JudgmentClient(metaclass=SingletonMeta):
         override: bool = False,
         rules: Optional[List[Rule]] = None,
         function: Optional[Callable] = None,
-        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None
+        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        async_execution: bool = False
     ) -> None:
         """
         Asserts a test by running the evaluation and checking the results for success
@@ -512,6 +517,14 @@ class JudgmentClient(metaclass=SingletonMeta):
             override (bool): Whether to override an existing evaluation run with the same name
             rules (Optional[List[Rule]]): Rules to evaluate against scoring results
         """
+
+        # Check for enable_param_checking and tools
+        for scorer in scorers:
+            if hasattr(scorer, "kwargs") and scorer.kwargs is not None:
+                if scorer.kwargs.get("enable_param_checking") is True:
+                    if not tools:
+                        raise ValueError(f"You must provide the 'tools' argument to assert_test when using a scorer with enable_param_checking=True. If you do not want to do param checking, explicitly set enable_param_checking=False for the {scorer.__name__} scorer.")
+
         # Validate that exactly one of examples or test_file is provided
         if (examples is None and test_file is None) or (examples is not None and test_file is not None):
             raise ValueError("Exactly one of 'examples' or 'test_file' must be provided, but not both")
@@ -529,7 +542,8 @@ class JudgmentClient(metaclass=SingletonMeta):
                 rules=rules,
                 function=function,
                 tracer=tracer,
-                test_file=test_file
+                test_file=test_file,
+                tools=tools
             )
         else:
             results = self.run_evaluation(
@@ -542,7 +556,14 @@ class JudgmentClient(metaclass=SingletonMeta):
                 project_name=project_name,
                 eval_run_name=eval_run_name,
                 override=override,
-                rules=rules
+                rules=rules,
+                async_execution=async_execution
             )

-
+        if async_execution:
+            # 'results' is an asyncio.Task here, awaiting it gives List[ScoringResult]
+            actual_results = asyncio.run(results)
+            assert_test(actual_results) # Call the synchronous imported function
+        else:
+            # 'results' is already List[ScoringResult] here (synchronous path)
+            assert_test(results) # Call the synchronous imported function