judgeval 0.2.0__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/api/api.py +38 -7
- judgeval/common/api/constants.py +9 -1
- judgeval/common/storage/s3_storage.py +2 -3
- judgeval/common/tracer/core.py +66 -32
- judgeval/common/tracer/otel_span_processor.py +4 -50
- judgeval/common/tracer/span_transformer.py +16 -10
- judgeval/common/utils.py +46 -38
- judgeval/constants.py +2 -0
- judgeval/data/example.py +9 -37
- judgeval/data/judgment_types.py +23 -45
- judgeval/data/result.py +8 -14
- judgeval/data/scripts/openapi_transform.py +5 -5
- judgeval/data/trace.py +3 -4
- judgeval/dataset.py +192 -0
- judgeval/evaluation_run.py +1 -0
- judgeval/judges/litellm_judge.py +2 -2
- judgeval/judges/mixture_of_judges.py +6 -6
- judgeval/judges/together_judge.py +6 -3
- judgeval/judgment_client.py +9 -71
- judgeval/run_evaluation.py +41 -9
- judgeval/scorers/score.py +11 -7
- judgeval/scorers/utils.py +3 -3
- judgeval/utils/file_utils.py +40 -25
- {judgeval-0.2.0.dist-info → judgeval-0.3.1.dist-info}/METADATA +10 -6
- {judgeval-0.2.0.dist-info → judgeval-0.3.1.dist-info}/RECORD +27 -29
- judgeval/data/datasets/__init__.py +0 -4
- judgeval/data/datasets/dataset.py +0 -341
- judgeval/data/datasets/eval_dataset_client.py +0 -214
- {judgeval-0.2.0.dist-info → judgeval-0.3.1.dist-info}/WHEEL +0 -0
- {judgeval-0.2.0.dist-info → judgeval-0.3.1.dist-info}/licenses/LICENSE.md +0 -0
judgeval/dataset.py
ADDED
@@ -0,0 +1,192 @@
+import datetime
+import orjson
+import os
+import yaml
+from dataclasses import dataclass
+from typing import List, Literal, Optional
+
+from judgeval.data import Example, Trace
+from judgeval.utils.file_utils import get_examples_from_yaml, get_examples_from_json
+from judgeval.common.api.api import JudgmentApiClient
+from judgeval.common.logger import judgeval_logger
+
+
+@dataclass
+class Dataset:
+    examples: List[Example]
+    traces: List[Trace]
+    name: str
+    project_name: str
+    judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or ""
+    organization_id: str = os.getenv("JUDGMENT_ORG_ID") or ""
+
+    @classmethod
+    def get(
+        cls,
+        name: str,
+        project_name: str,
+    ):
+        client = JudgmentApiClient(cls.judgment_api_key, cls.organization_id)
+        dataset = client.pull_dataset(name, project_name)
+        if not dataset:
+            judgeval_logger.error(f"Dataset {name} not found in project {project_name}")
+            raise ValueError(f"Dataset {name} not found in project {project_name}")
+        examples = dataset.get("examples", [])
+        for e in examples:
+            if isinstance(e, dict) and isinstance(e.get("data"), dict):
+                e.update(e.pop("data"))
+        return cls(
+            name=name,
+            project_name=project_name,
+            examples=[Example(**e) for e in examples],
+            traces=[Trace(**t) for t in dataset.get("traces", [])],
+        )
+
+    @classmethod
+    def create(
+        cls,
+        name: str,
+        project_name: str,
+        examples: Optional[List[Example]] = None,
+        traces: Optional[List[Trace]] = None,
+        overwrite: bool = False,
+    ):
+        if examples and traces:
+            raise ValueError("Only one of examples or traces must be provided")
+
+        if not examples:
+            examples = []
+
+        if not traces:
+            traces = []
+
+        client = JudgmentApiClient(cls.judgment_api_key, cls.organization_id)
+        client.push_dataset(
+            name,
+            project_name,
+            examples=[e.model_dump() for e in examples],
+            traces=[t.model_dump() for t in traces],
+            overwrite=overwrite,
+        )
+        return cls(
+            name=name,
+            project_name=project_name,
+            examples=examples,
+            traces=traces,
+        )
+
+    def add_from_json(self, file_path: str) -> None:
+        """
+        Adds examples from a JSON file.
+
+        The JSON file is expected to have the following format:
+        [
+            {
+                "key_01": "value_01",
+                "key_02": "value_02"
+            },
+            {
+                "key_11": "value_11",
+                "key_12": "value_12",
+                "key_13": "value_13"
+            },
+            ...
+        ]
+        """
+        examples = get_examples_from_json(file_path)
+        self.add_examples(examples)
+
+    def add_from_yaml(self, file_path: str) -> None:
+        """
+        Adds examples from a YAML file.
+
+        The YAML file is expected to have the following format:
+        - key_01: value_01
+          key_02: value_02
+        - key_11: value_11
+          key_12: value_12
+          key_13: value_13
+        ...
+        """
+
+        examples = get_examples_from_yaml(file_path)
+        self.add_examples(examples)
+
+    def add_examples(self, examples: List[Example]) -> None:
+        client = JudgmentApiClient(self.judgment_api_key, self.organization_id)
+        client.append_examples(
+            dataset_alias=self.name,
+            project_name=self.project_name,
+            examples=[e.model_dump() for e in examples],
+        )
+
+    def add_traces(self, traces: List[Trace]) -> None:
+        client = JudgmentApiClient(self.judgment_api_key, self.organization_id)
+        client.append_traces(
+            dataset_alias=self.name,
+            project_name=self.project_name,
+            traces=[t.model_dump() for t in traces],
+        )
+
+    def save_as(
+        self,
+        file_type: Literal["json", "yaml"],
+        dir_path: str,
+        save_name: str | None = None,
+    ) -> None:
+        """
+        Saves the dataset as a file. Save only the examples.
+
+        Args:
+            file_type (Literal["json", "csv"]): The file type to save the dataset as.
+            dir_path (str): The directory path to save the file to.
+            save_name (str, optional): The name of the file to save. Defaults to None.
+        """
+        if not os.path.exists(dir_path):
+            os.makedirs(dir_path)
+        file_name = (
+            datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+            if save_name is None
+            else save_name
+        )
+        complete_path = os.path.join(dir_path, f"{file_name}.{file_type}")
+        if file_type == "json":
+            with open(complete_path, "wb") as file:
+                file.write(
+                    orjson.dumps(
+                        {
+                            "examples": [e.to_dict() for e in self.examples],
+                        },
+                        option=orjson.OPT_INDENT_2,
+                    )
+                )
+        elif file_type == "yaml":
+            with open(complete_path, "w") as file:
+                yaml_data = {
+                    "examples": [e.to_dict() for e in self.examples],
+                }
+                yaml.dump(yaml_data, file, default_flow_style=False)
+        else:
+            ACCEPTABLE_FILE_TYPES = ["json", "yaml"]
+            raise TypeError(
+                f"Invalid file type: {file_type}. Please choose from {ACCEPTABLE_FILE_TYPES}"
+            )
+
+    def delete(self):
+        client = JudgmentApiClient(self.judgment_api_key, self.organization_id)
+        client.delete_dataset(self.name, self.project_name)
+
+    def __iter__(self):
+        return iter(self.examples)
+
+    def __len__(self):
+        return len(self.examples)
+
+    def __str__(self):
+        return (
+            f"{self.__class__.__name__}("
+            f"examples={self.examples}, "
+            f"traces={self.traces}, "
+            f"name={self.name}"
+            f")"
+        )
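Taken together, the new judgeval/dataset.py replaces the EvalDataset / EvalDatasetClient pair removed further down in this diff. A minimal usage sketch of the new class, assuming JUDGMENT_API_KEY and JUDGMENT_ORG_ID are set in the environment and that Example accepts arbitrary keyword fields (the field names below are illustrative, not part of the package):

from judgeval.dataset import Dataset
from judgeval.data import Example

# Push a small dataset to the Judgment platform, then pull it back later.
examples = [
    Example(input="What is 2 + 2?", expected_output="4"),
    Example(input="Capital of France?", expected_output="Paris"),
]
dataset = Dataset.create(
    name="qa_smoke_test",
    project_name="default_project",
    examples=examples,
)

dataset = Dataset.get(name="qa_smoke_test", project_name="default_project")
dataset.add_from_json("more_examples.json")  # or dataset.add_from_yaml("more_examples.yaml")
dataset.save_as("json", dir_path="./exports", save_name="qa_smoke_test")
print(len(dataset), "examples")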
judgeval/evaluation_run.py
CHANGED
@@ -36,6 +36,7 @@ class EvaluationRun(BaseModel):
         data["scorers"] = [
             scorer.model_dump() for scorer in self.scorers
         ]  # Pydantic has problems with properly calling model_dump() on the scorers, so we need to do it manually
+        data["examples"] = [example.model_dump() for example in self.examples]

         return data

judgeval/judges/litellm_judge.py
CHANGED
@@ -22,7 +22,7 @@ class LiteLLMJudge(JudgevalJudge):
     def generate(
         self,
         input: Union[str, List[Mapping[str, str]]],
-        schema: pydantic.BaseModel = None,
+        schema: Union[pydantic.BaseModel, None] = None,
     ) -> str:
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
@@ -42,7 +42,7 @@ class LiteLLMJudge(JudgevalJudge):
     async def a_generate(
         self,
         input: Union[str, List[Mapping[str, str]]],
-        schema: pydantic.BaseModel = None,
+        schema: Union[pydantic.BaseModel, None] = None,
     ) -> str:
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
judgeval/judges/mixture_of_judges.py
CHANGED
@@ -18,8 +18,8 @@ from judgeval.common.logger import judgeval_logger

 def build_dynamic_mixture_prompt(
     judge_responses: List[str],
-    custom_system_prompt: str
-    custom_conversation_history: List[dict]
+    custom_system_prompt: Union[str, None] = None,
+    custom_conversation_history: Union[List[dict], None] = None,
 ) -> List[dict]:
     """
     Dynamically builds a prompt to mix judge responses together for the Mixture of Judges model.
@@ -178,8 +178,8 @@ class MixtureOfJudges(JudgevalJudge):
     def generate(
         self,
         input: Union[str, List[dict]],
-        response_schema: pydantic.BaseModel = None,
-        aggregation_schema: pydantic.BaseModel = None,
+        response_schema: Union[pydantic.BaseModel, None] = None,
+        aggregation_schema: Union[pydantic.BaseModel, None] = None,
         **kwargs,
     ) -> str:
         """
@@ -230,8 +230,8 @@ class MixtureOfJudges(JudgevalJudge):
     async def a_generate(
         self,
         input: Union[str, List[dict]],
-        response_schema: pydantic.BaseModel = None,
-        aggregation_schema: pydantic.BaseModel = None,
+        response_schema: Union[pydantic.BaseModel, None] = None,
+        aggregation_schema: Union[pydantic.BaseModel, None] = None,
         **kwargs,
     ) -> str:
         """
judgeval/judges/together_judge.py
CHANGED
@@ -11,6 +11,7 @@ from judgeval.common.utils import (
     afetch_together_api_response,
 )
 from judgeval.common.logger import judgeval_logger
+from judgeval.constants import DEFAULT_TOGETHER_MODEL

 BASE_CONVERSATION = [
     {"role": "system", "content": "You are a helpful assistant."},
@@ -18,13 +19,15 @@ BASE_CONVERSATION = [


 class TogetherJudge(JudgevalJudge):
-    def __init__(self, model: str =
+    def __init__(self, model: str = DEFAULT_TOGETHER_MODEL, **kwargs):
         self.model = model
         self.kwargs = kwargs
         super().__init__(model_name=model)

     # TODO: Fix cost for generate and a_generate
-    def generate(
+    def generate(
+        self, input: Union[str, List[dict]], schema: Union[BaseModel, None] = None
+    ) -> str:
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
             return fetch_together_api_response(
@@ -40,7 +43,7 @@ class TogetherJudge(JudgevalJudge):
            raise TypeError("Input must be a string or a list of dictionaries.")

     async def a_generate(
-        self, input: Union[str, List[dict]], schema: BaseModel = None
+        self, input: Union[str, List[dict]], schema: Union[BaseModel, None] = None
     ) -> str:
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
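With DEFAULT_TOGETHER_MODEL now imported from judgeval.constants, the judge can be constructed without naming a model. A quick sketch, assuming the Together credentials that fetch_together_api_response relies on are already configured:

from judgeval.judges.together_judge import TogetherJudge

judge = TogetherJudge()  # falls back to DEFAULT_TOGETHER_MODEL from judgeval.constants
reply = judge.generate("Summarize the 0.3.1 dataset changes in one sentence.")
print(reply)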
judgeval/judgment_client.py
CHANGED
@@ -6,7 +6,6 @@ import os
 from uuid import uuid4
 from typing import Optional, List, Dict, Any, Union, Callable

-from judgeval.data.datasets import EvalDataset, EvalDatasetClient
 from judgeval.data import (
     ScoringResult,
     Example,
@@ -25,11 +24,11 @@ from judgeval.run_evaluation import (
 from judgeval.data.trace_run import TraceRun
 from judgeval.common.api import JudgmentApiClient
 from judgeval.common.exceptions import JudgmentAPIError
-from langchain_core.callbacks import BaseCallbackHandler
 from judgeval.common.tracer import Tracer
 from judgeval.common.utils import validate_api_key
 from pydantic import BaseModel
 from judgeval.common.logger import judgeval_logger
+from judgeval.integrations.langgraph import JudgevalCallbackHandler


 class EvalRunRequestBody(BaseModel):
@@ -71,7 +70,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         self.judgment_api_key = api_key
         self.organization_id = organization_id
         self.api_client = JudgmentApiClient(api_key, organization_id)
-        self.eval_dataset_client = EvalDatasetClient(api_key, organization_id)

         # Verify API key is valid
         result, response = validate_api_key(api_key)
@@ -86,7 +84,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         scorers: List[Union[APIScorerConfig, BaseScorer]],
         examples: Optional[List[Example]] = None,
         function: Optional[Callable] = None,
-        tracer: Optional[Union[Tracer,
+        tracer: Optional[Union[Tracer, JudgevalCallbackHandler]] = None,
         traces: Optional[List[Trace]] = None,
         tools: Optional[List[Dict[str, Any]]] = None,
         project_name: str = "default_project",
@@ -178,70 +176,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         except Exception as e:
             raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")

-    def create_dataset(self) -> EvalDataset:
-        return self.eval_dataset_client.create_dataset()
-
-    def push_dataset(
-        self,
-        alias: str,
-        dataset: EvalDataset,
-        project_name: str,
-        overwrite: Optional[bool] = False,
-    ) -> bool:
-        """
-        Uploads an `EvalDataset` to the Judgment platform for storage.
-
-        Args:
-            alias (str): The name to use for the dataset
-            dataset (EvalDataset): The dataset to upload to Judgment
-            overwrite (Optional[bool]): Whether to overwrite the dataset if it already exists
-
-        Returns:
-            bool: Whether the dataset was successfully uploaded
-        """
-        # Set judgment_api_key just in case it was not set
-        dataset.judgment_api_key = self.judgment_api_key
-        return self.eval_dataset_client.push(dataset, alias, project_name, overwrite)
-
-    def append_dataset(
-        self, alias: str, examples: List[Example], project_name: str
-    ) -> bool:
-        """
-        Appends an `EvalDataset` to the Judgment platform for storage.
-        """
-        return self.eval_dataset_client.append_examples(alias, examples, project_name)
-
-    def pull_dataset(self, alias: str, project_name: str) -> EvalDataset:
-        """
-        Retrieves a saved `EvalDataset` from the Judgment platform.
-
-        Args:
-            alias (str): The name of the dataset to retrieve
-
-        Returns:
-            EvalDataset: The retrieved dataset
-        """
-        return self.eval_dataset_client.pull(alias, project_name)
-
-    def delete_dataset(self, alias: str, project_name: str) -> bool:
-        """
-        Deletes a saved `EvalDataset` from the Judgment platform.
-        """
-        return self.eval_dataset_client.delete(alias, project_name)
-
-    def pull_project_dataset_stats(self, project_name: str) -> dict:
-        """
-        Retrieves all dataset stats from the Judgment platform for the project.
-
-        Args:
-            project_name (str): The name of the project to retrieve
-
-        Returns:
-            dict: The retrieved dataset stats
-        """
-        return self.eval_dataset_client.pull_project_dataset_stats(project_name)
-
-    # Maybe add option where you can pass in the EvaluationRun object and it will pull the eval results from the backend
     def pull_eval(
         self, project_name: str, eval_run_name: str
     ) -> List[Dict[str, Union[str, List[ScoringResult]]]]:
@@ -262,8 +196,12 @@ class JudgmentClient(metaclass=SingletonMeta):
         """
         Creates a project on the server.
         """
-
-
+        try:
+            self.api_client.create_project(project_name)
+            return True
+        except Exception as e:
+            judgeval_logger.error(f"Error creating project: {e}")
+            return False

     def delete_project(self, project_name: str) -> bool:
         """
@@ -314,7 +252,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         scorers: List[Union[APIScorerConfig, BaseScorer]],
         examples: Optional[List[Example]] = None,
         function: Optional[Callable] = None,
-        tracer: Optional[Union[Tracer,
+        tracer: Optional[Union[Tracer, JudgevalCallbackHandler]] = None,
         traces: Optional[List[Trace]] = None,
         tools: Optional[List[Dict[str, Any]]] = None,
         model: Optional[str] = "gpt-4.1",
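Callers of the removed JudgmentClient dataset helpers can switch to the Dataset class added in judgeval/dataset.py above. A rough migration sketch (the old calls are taken from the removed code; the new ones assume the Dataset API shown earlier in this diff, and the example fields are illustrative):

from judgeval.dataset import Dataset
from judgeval.data import Example

new_examples = [Example(input="q", expected_output="a")]  # illustrative fields

# 0.2.0: dataset operations went through JudgmentClient.
# client = JudgmentClient()
# dataset = client.pull_dataset(alias="qa_smoke_test", project_name="default_project")
# client.append_dataset(alias="qa_smoke_test", examples=new_examples, project_name="default_project")
# client.delete_dataset(alias="qa_smoke_test", project_name="default_project")

# 0.3.1: use Dataset directly.
dataset = Dataset.get(name="qa_smoke_test", project_name="default_project")
dataset.add_examples(new_examples)  # replaces JudgmentClient.append_dataset
dataset.delete()                    # replaces JudgmentClient.delete_dataset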
|
judgeval/run_evaluation.py
CHANGED
@@ -1,7 +1,7 @@
 import asyncio
 import concurrent.futures
 import time
-import
+import orjson
 import sys
 import threading
 from typing import List, Dict, Union, Optional, Callable, Tuple, Any
@@ -20,7 +20,7 @@ from judgeval.common.logger import judgeval_logger
 from judgeval.evaluation_run import EvaluationRun
 from judgeval.data.trace_run import TraceRun
 from judgeval.common.tracer import Tracer
-from
+from judgeval.integrations.langgraph import JudgevalCallbackHandler


 def safe_run_async(coro):
@@ -191,6 +191,24 @@ def check_eval_run_name_exists(
         raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")


+def check_example_keys(
+    keys: List[str],
+    eval_name: str,
+    project_name: str,
+    judgment_api_key: str,
+    organization_id: str,
+) -> None:
+    """
+    Checks if the current experiment (if one exists) has the same keys for example
+    """
+    api_client = JudgmentApiClient(judgment_api_key, organization_id)
+    try:
+        api_client.check_example_keys(keys, eval_name, project_name)
+    except Exception as e:
+        judgeval_logger.error(f"Failed to check if example keys match: {str(e)}")
+        raise JudgmentAPIError(f"Failed to check if example keys match: {str(e)}")
+
+
 def log_evaluation_results(
     scoring_results: List[ScoringResult],
     run: Union[EvaluationRun, TraceRun],
@@ -245,7 +263,9 @@ def check_examples(
                 f"[yellow]⚠️ WARNING:[/yellow] Example is missing required parameters for scorer [bold]{scorer.score_type.value}[/bold]"
             )
             rprint(f"Missing parameters: {', '.join(missing_params)}")
-            rprint(
+            rprint(
+                f"Example: {orjson.dumps(example.model_dump(), option=orjson.OPT_INDENT_2).decode('utf-8')}"
+            )
             rprint("-" * 40)
             prompt_user = True

@@ -262,7 +282,7 @@ def run_trace_eval(
     judgment_api_key: str,
     override: bool = False,
     function: Optional[Callable] = None,
-    tracer: Optional[Union[Tracer,
+    tracer: Optional[Union[Tracer, JudgevalCallbackHandler]] = None,
     examples: Optional[List[Example]] = None,
 ) -> List[ScoringResult]:
     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
@@ -394,7 +414,7 @@ def _poll_evaluation_until_complete(
     expected_scorer_data_count: int,
     poll_interval_seconds: float = 5,
     max_failures: int = 5,
-    max_poll_count: int =
+    max_poll_count: int = 60,  # This should be equivalent to 5 minutes
 ) -> Tuple[List[ScoringResult], str]:
     """
     Polls until the evaluation is complete and returns the results.
@@ -500,6 +520,14 @@ def run_eval(
     Returns:
         List[ScoringResult]: A list of ScoringResult objects
     """
+    # Check that every example has the same keys
+    keys = evaluation_run.examples[0].get_fields().keys()
+    for example in evaluation_run.examples:
+        current_keys = example.get_fields().keys()
+        if current_keys != keys:
+            raise ValueError(
+                f"All examples must have the same keys: {current_keys} != {keys}"
+            )

     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
     if not override and not evaluation_run.append:
@@ -520,9 +548,14 @@ def run_eval(
         False,
     )

-
-
-
+    # Ensure that current experiment (if one exists) has the same keys for example
+    check_example_keys(
+        keys=list(keys),
+        eval_name=evaluation_run.eval_name,
+        project_name=evaluation_run.project_name,
+        judgment_api_key=judgment_api_key,
+        organization_id=evaluation_run.organization_id,
+    )

     judgment_scorers: List[APIScorerConfig] = []
     local_scorers: List[BaseScorer] = []
@@ -601,7 +634,6 @@ def run_eval(
     send_results = [
         scoring_result.model_dump(warnings=False) for scoring_result in results
     ]
-
     url = log_evaluation_results(send_results, evaluation_run, judgment_api_key)
     rprint(
         f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
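The new key check in run_eval means every Example in a run must expose the same set of fields before any scoring starts. A small illustration, assuming Example accepts arbitrary keyword fields and reports them via get_fields() as the added code suggests:

from judgeval.data import Example

consistent = [
    Example(input="q1", expected_output="a1"),
    Example(input="q2", expected_output="a2"),
]  # same keys on every example: accepted

mismatched = [
    Example(input="q1", expected_output="a1"),
    Example(input="q2"),  # missing expected_output
]  # run_eval now raises ValueError("All examples must have the same keys: ...")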
judgeval/scorers/score.py
CHANGED
@@ -30,15 +30,19 @@ async def safe_a_score_example(
     Args:
         scorer (BaseScorer): The `BaseScorer` to use for scoring the example.
         example (Example): The `Example` to be scored.
-
-        ignore_errors (bool): Whether to ignore errors during the evaluation.
-            If set to false, any error will be raised and stop the evaluation.
-            If set to true, the error will be stored in the `error` attribute of the `BaseScorer` and the `success` attribute will be set to False.
-
-        skip_on_missing_params (bool): Whether to skip the test case if required parameters are missing.
     """
     try:
-
+        score = await scorer.a_score_example(example)
+        if score is None:
+            raise Exception("a_score_example need to return a score")
+        elif score < 0:
+            judgeval_logger.warning("score cannot be less than 0 , setting to 0")
+            score = 0
+        elif score > 1:
+            judgeval_logger.warning("score cannot be greater than 1 , setting to 1")
+            score = 1
+        else:
+            scorer.score = score
         scorer.success = scorer.success_check()
     except Exception as e:
         judgeval_logger.error(f"Error during scoring: {str(e)}")
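The rewritten safe_a_score_example now requires a_score_example to return a numeric score and clamps out-of-range values into [0, 1], logging a warning when it does. A standalone restatement of that rule (illustrative only, not the library API):

def clamp_score(score):
    # Mirrors the checks added to safe_a_score_example: None is an error,
    # values outside [0, 1] are clamped.
    if score is None:
        raise ValueError("a_score_example must return a score")
    return max(0, min(1, score))

assert clamp_score(1.7) == 1
assert clamp_score(-0.2) == 0
assert clamp_score(0.35) == 0.35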
judgeval/scorers/utils.py
CHANGED
@@ -4,7 +4,7 @@ Util functions for Scorer objects

 import asyncio
 import nest_asyncio
-import
+import orjson
 import re
 from typing import List, Optional

@@ -48,8 +48,8 @@ def parse_response_json(llm_response: str, scorer: Optional[BaseScorer] = None)
     )  # Remove trailing comma if present

     try:
-        return
-    except
+        return orjson.loads(json_str)
+    except orjson.JSONDecodeError:
         error_str = "Evaluation LLM outputted an invalid JSON. Please use a stronger evaluation model."
         if scorer is not None:
             scorer.error = error_str
judgeval/utils/file_utils.py
CHANGED
@@ -1,4 +1,5 @@
 import yaml
+import orjson
 from typing import List
 from judgeval.common.logger import judgeval_logger

@@ -9,37 +10,19 @@ def get_examples_from_yaml(file_path: str) -> List[Example] | None:
     """
     Adds examples from a YAML file.

-    The format of the YAML file is expected to be a dictionary with one key: "examples".
-    The value of the key is a list of dictionaries, where each dictionary represents an example.
-
     The YAML file is expected to have the following format:
-
-
-
-
-
-
-        - "context2"
-    retrieval_context:
-        - "retrieval1"
-    additional_metadata:
-        key: "value"
-    tools_called:
-        - "tool1"
-    expected_tools:
-        - {tool_name: "tool1", parameters: {"query": "test query 1"}}
-        - {tool_name: "tool2", parameters: {"query": "test query 2"}}
-    name: "test example"
-    example_id: null
-    timestamp: "20241230_160117"
-    trace_id: "123"
+    - key_01: value_01
+      key_02: value_02
+    - key_11: value_11
+      key_12: value_12
+      key_13: value_13
+    ...
     """
     try:
         with open(file_path, "r") as file:
             payload = yaml.safe_load(file)
         if payload is None:
             raise ValueError("The YAML file is empty.")
-        examples = payload.get("examples", [])
     except FileNotFoundError:
         judgeval_logger.error(f"YAML file not found: {file_path}")
         raise FileNotFoundError(f"The file {file_path} was not found.")
@@ -47,5 +30,37 @@ def get_examples_from_yaml(file_path: str) -> List[Example] | None:
         judgeval_logger.error(f"Invalid YAML file: {file_path}")
         raise ValueError(f"The file {file_path} is not a valid YAML file.")

-    new_examples = [Example(**e) for e in
+    new_examples = [Example(**e) for e in payload]
+    return new_examples
+
+
+def get_examples_from_json(file_path: str) -> List[Example] | None:
+    """
+    Adds examples from a JSON file.
+
+    The JSON file is expected to have the following format:
+    [
+        {
+            "key_01": "value_01",
+            "key_02": "value_02"
+        },
+        {
+            "key_11": "value_11",
+            "key_12": "value_12",
+            "key_13": "value_13"
+        },
+        ...
+    ]
+    """
+    try:
+        with open(file_path, "rb") as file:
+            payload = orjson.loads(file.read())
+    except FileNotFoundError:
+        judgeval_logger.error(f"JSON file not found: {file_path}")
+        raise FileNotFoundError(f"The file {file_path} was not found.")
+    except orjson.JSONDecodeError:
+        judgeval_logger.error(f"Invalid JSON file: {file_path}")
+        raise ValueError(f"The file {file_path} is not a valid JSON file.")
+
+    new_examples = [Example(**e) for e in payload]
     return new_examples
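The loaders now expect a flat list of records rather than the old "examples:"-keyed layout. A minimal sketch of matching input files and how they might be loaded, assuming field names like input and expected_output are acceptable Example fields (these helpers are also what Dataset.add_from_json / add_from_yaml call):

# examples.json
# [
#     {"input": "What is 2 + 2?", "expected_output": "4"},
#     {"input": "Capital of France?", "expected_output": "Paris"}
# ]
#
# examples.yaml
# - input: What is 2 + 2?
#   expected_output: "4"
# - input: Capital of France?
#   expected_output: Paris

from judgeval.utils.file_utils import get_examples_from_json, get_examples_from_yaml

json_examples = get_examples_from_json("examples.json")
yaml_examples = get_examples_from_yaml("examples.yaml")
print(len(json_examples), len(yaml_examples))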