judgeval 0.5.0__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/cli.py +65 -0
- judgeval/common/api/api.py +44 -38
- judgeval/common/api/constants.py +18 -5
- judgeval/common/api/json_encoder.py +8 -9
- judgeval/common/tracer/core.py +448 -256
- judgeval/common/tracer/otel_span_processor.py +1 -1
- judgeval/common/tracer/span_processor.py +1 -1
- judgeval/common/tracer/span_transformer.py +2 -1
- judgeval/common/tracer/trace_manager.py +6 -1
- judgeval/common/trainer/__init__.py +5 -0
- judgeval/common/trainer/config.py +125 -0
- judgeval/common/trainer/console.py +151 -0
- judgeval/common/trainer/trainable_model.py +238 -0
- judgeval/common/trainer/trainer.py +301 -0
- judgeval/data/evaluation_run.py +104 -0
- judgeval/data/judgment_types.py +37 -8
- judgeval/data/trace.py +1 -0
- judgeval/data/trace_run.py +0 -2
- judgeval/integrations/langgraph.py +2 -1
- judgeval/judgment_client.py +90 -135
- judgeval/local_eval_queue.py +3 -5
- judgeval/run_evaluation.py +43 -299
- judgeval/scorers/base_scorer.py +9 -10
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +17 -3
- {judgeval-0.5.0.dist-info → judgeval-0.7.0.dist-info}/METADATA +10 -47
- {judgeval-0.5.0.dist-info → judgeval-0.7.0.dist-info}/RECORD +29 -22
- judgeval-0.7.0.dist-info/entry_points.txt +2 -0
- judgeval/evaluation_run.py +0 -80
- {judgeval-0.5.0.dist-info → judgeval-0.7.0.dist-info}/WHEEL +0 -0
- {judgeval-0.5.0.dist-info → judgeval-0.7.0.dist-info}/licenses/LICENSE.md +0 -0
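The listing already tells a migration story before the per-file diffs below: judgeval/evaluation_run.py (+0 -80) is deleted while judgeval/data/evaluation_run.py (+104 -0) is added, a judgeval/common/trainer package and a CLI entry point appear, and judgment_client.py and run_evaluation.py shrink substantially. A minimal import-migration sketch for the relocated EvaluationRun (assuming only the module path changed, which the diffs below support):

    # judgeval 0.5.0
    # from judgeval.evaluation_run import EvaluationRun

    # judgeval 0.7.0: the module now lives under judgeval.data
    from judgeval.data.evaluation_run import EvaluationRun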
judgeval/judgment_client.py
CHANGED
@@ -4,35 +4,31 @@ Implements the JudgmentClient to interact with the Judgment API.
 
 from __future__ import annotations
 import os
+import importlib.util
+from pathlib import Path
 from uuid import uuid4
-from typing import Optional, List, Dict,
+from typing import Optional, List, Dict, Union
 
 from judgeval.data import (
     ScoringResult,
     Example,
-    Trace,
 )
 from judgeval.scorers import (
     APIScorerConfig,
     BaseScorer,
 )
-from judgeval.evaluation_run import EvaluationRun
+from judgeval.data.evaluation_run import EvaluationRun
 from judgeval.run_evaluation import (
     run_eval,
     assert_test,
-    run_trace_eval,
 )
-from judgeval.data.trace_run import TraceRun
 from judgeval.common.api import JudgmentApiClient
 from judgeval.common.exceptions import JudgmentAPIError
-from judgeval.common.tracer import Tracer
 from judgeval.common.utils import validate_api_key
 from pydantic import BaseModel
 from judgeval.common.logger import judgeval_logger
 
 
-if TYPE_CHECKING:
-    from judgeval.integrations.langgraph import JudgevalCallbackHandler
 from judgeval.constants import DEFAULT_GPT_MODEL
 
 
@@ -84,50 +80,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         else:
             judgeval_logger.info("Successfully initialized JudgmentClient!")
 
-    def run_trace_evaluation(
-        self,
-        scorers: List[Union[APIScorerConfig, BaseScorer]],
-        examples: Optional[List[Example]] = None,
-        function: Optional[Callable] = None,
-        tracer: Optional[Union[Tracer, JudgevalCallbackHandler]] = None,
-        traces: Optional[List[Trace]] = None,
-        tools: Optional[List[Dict[str, Any]]] = None,
-        project_name: str = "default_project",
-        eval_run_name: str = "default_eval_trace",
-        model: Optional[str] = DEFAULT_GPT_MODEL,
-        append: bool = False,
-        override: bool = False,
-    ) -> List[ScoringResult]:
-        try:
-            if examples and not function:
-                raise ValueError("Cannot pass in examples without a function")
-
-            if traces and function:
-                raise ValueError("Cannot pass in traces and function")
-
-            if examples and traces:
-                raise ValueError("Cannot pass in both examples and traces")
-
-            trace_run = TraceRun(
-                project_name=project_name,
-                eval_name=eval_run_name,
-                traces=traces,
-                scorers=scorers,
-                model=model,
-                append=append,
-                organization_id=self.organization_id,
-                tools=tools,
-            )
-            return run_trace_eval(
-                trace_run, self.judgment_api_key, override, function, tracer, examples
-            )
-        except ValueError as e:
-            raise ValueError(
-                f"Please check your TraceRun object, one or more fields are invalid: \n{str(e)}"
-            )
-        except Exception as e:
-            raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
-
     def run_evaluation(
         self,
         examples: List[Example],
@@ -135,8 +87,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         model: Optional[str] = DEFAULT_GPT_MODEL,
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
-
-        append: bool = False,
+        show_url: bool = True,
     ) -> List[ScoringResult]:
         """
         Executes an evaluation of `Example`s using one or more `Scorer`s
@@ -147,21 +98,13 @@ class JudgmentClient(metaclass=SingletonMeta):
             model (str): The model used as a judge when using LLM as a Judge
             project_name (str): The name of the project the evaluation results belong to
             eval_run_name (str): A name for this evaluation run
-            override (bool): Whether to override an existing evaluation run with the same name
-            append (bool): Whether to append to an existing evaluation run with the same name
 
         Returns:
             List[ScoringResult]: The results of the evaluation
         """
-        if override and append:
-            raise ValueError(
-                "Cannot set both override and append to True. Please choose one."
-            )
 
         try:
             eval = EvaluationRun(
-                append=append,
-                override=override,
                 project_name=project_name,
                 eval_name=eval_run_name,
                 examples=examples,
@@ -172,7 +115,7 @@ class JudgmentClient(metaclass=SingletonMeta):
             return run_eval(
                 eval,
                 self.judgment_api_key,
-
+                show_url=show_url,
             )
         except ValueError as e:
             raise ValueError(
@@ -181,22 +124,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         except Exception as e:
             raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
 
-    def pull_eval(
-        self, project_name: str, eval_run_name: str
-    ) -> List[Dict[str, Union[str, List[ScoringResult]]]]:
-        """Pull evaluation results from the server.
-
-        Args:
-            project_name (str): Name of the project
-            eval_run_name (str): Name of the evaluation run
-
-        Returns:
-            Dict[str, Union[str, List[ScoringResult]]]: Dictionary containing:
-                - id (str): The evaluation run ID
-                - results (List[ScoringResult]): List of scoring results
-        """
-        return self.api_client.fetch_evaluation_results(project_name, eval_run_name)
-
     def create_project(self, project_name: str) -> bool:
         """
         Creates a project on the server.
@@ -222,8 +149,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         model: Optional[str] = DEFAULT_GPT_MODEL,
         project_name: str = "default_test",
         eval_run_name: str = str(uuid4()),
-        override: bool = False,
-        append: bool = False,
     ) -> None:
         """
         Asserts a test by running the evaluation and checking the results for success
@@ -234,9 +159,6 @@ class JudgmentClient(metaclass=SingletonMeta):
             model (str): The model used as a judge when using LLM as a Judge
            project_name (str): The name of the project the evaluation results belong to
            eval_run_name (str): A name for this evaluation run
-            override (bool): Whether to override an existing evaluation run with the same name
-            append (bool): Whether to append to an existing evaluation run with the same name
-            async_execution (bool): Whether to run the evaluation asynchronously
         """
 
         results: List[ScoringResult]
@@ -247,66 +169,99 @@ class JudgmentClient(metaclass=SingletonMeta):
             model=model,
             project_name=project_name,
             eval_run_name=eval_run_name,
-            override=override,
-            append=append,
         )
         assert_test(results)
 
-    def
+    def _extract_scorer_name(self, scorer_file_path: str) -> str:
+        """Extract scorer name from the scorer file by importing it."""
+        try:
+            spec = importlib.util.spec_from_file_location(
+                "scorer_module", scorer_file_path
+            )
+            if spec is None or spec.loader is None:
+                raise ImportError(f"Could not load spec from {scorer_file_path}")
+
+            module = importlib.util.module_from_spec(spec)
+            spec.loader.exec_module(module)
+
+            for attr_name in dir(module):
+                attr = getattr(module, attr_name)
+                if (
+                    isinstance(attr, type)
+                    and any("Scorer" in str(base) for base in attr.__mro__)
+                    and attr.__module__ == "scorer_module"
+                ):
+                    try:
+                        # Instantiate the scorer and get its name
+                        scorer_instance = attr()
+                        if hasattr(scorer_instance, "name"):
+                            return scorer_instance.name
+                    except Exception:
+                        # Skip if instantiation fails
+                        continue
+
+            raise AttributeError("No scorer class found or could be instantiated")
+        except Exception as e:
+            judgeval_logger.warning(f"Could not extract scorer name: {e}")
+            return Path(scorer_file_path).stem
+
+    def upload_custom_scorer(
         self,
-
-
-
-
-        traces: Optional[List[Trace]] = None,
-        tools: Optional[List[Dict[str, Any]]] = None,
-        model: Optional[str] = DEFAULT_GPT_MODEL,
-        project_name: str = "default_test",
-        eval_run_name: str = str(uuid4()),
-        override: bool = False,
-        append: bool = False,
-        async_execution: bool = False,
-    ) -> None:
+        scorer_file_path: str,
+        requirements_file_path: Optional[str] = None,
+        unique_name: Optional[str] = None,
+    ) -> bool:
         """
-
+        Upload custom ExampleScorer from files to backend.
 
         Args:
-
-
-
-
-
-
-
-
-
-
-            async_execution (bool): Whether to run the evaluation asynchronously
+            scorer_file_path: Path to Python file containing CustomScorer class
+            requirements_file_path: Optional path to requirements.txt
+            unique_name: Optional unique identifier (auto-detected from scorer.name if not provided)
+
+        Returns:
+            bool: True if upload successful
+
+        Raises:
+            ValueError: If scorer file is invalid
+            FileNotFoundError: If scorer file doesn't exist
         """
+        import os
 
-
-
-        if hasattr(scorer, "kwargs") and scorer.kwargs is not None:
-            if scorer.kwargs.get("enable_param_checking") is True:
-                if not tools:
-                    raise ValueError(
-                        f"You must provide the 'tools' argument to assert_test when using a scorer with enable_param_checking=True. If you do not want to do param checking, explicitly set enable_param_checking=False for the {scorer.__name__} scorer."
-                    )
+        if not os.path.exists(scorer_file_path):
+            raise FileNotFoundError(f"Scorer file not found: {scorer_file_path}")
 
-
+        # Auto-detect scorer name if not provided
+        if unique_name is None:
+            unique_name = self._extract_scorer_name(scorer_file_path)
+            judgeval_logger.info(f"Auto-detected scorer name: '{unique_name}'")
 
-
-
-
-            scorers=scorers,
-            model=model,
-            project_name=project_name,
-            eval_run_name=eval_run_name,
-            override=override,
-            append=append,
-            function=function,
-            tracer=tracer,
-            tools=tools,
-        )
+        # Read scorer code
+        with open(scorer_file_path, "r") as f:
+            scorer_code = f.read()
 
-
+        # Read requirements (optional)
+        requirements_text = ""
+        if requirements_file_path and os.path.exists(requirements_file_path):
+            with open(requirements_file_path, "r") as f:
+                requirements_text = f.read()
+
+        try:
+            response = self.api_client.upload_custom_scorer(
+                scorer_name=unique_name,
+                scorer_code=scorer_code,
+                requirements_text=requirements_text,
+            )
+
+            if response.get("status") == "success":
+                judgeval_logger.info(
+                    f"Successfully uploaded custom scorer: {unique_name}"
+                )
+                return True
+            else:
+                judgeval_logger.error(f"Failed to upload custom scorer: {unique_name}")
+                return False
+
+        except Exception as e:
+            judgeval_logger.error(f"Error uploading custom scorer: {e}")
+            raise
judgeval/local_eval_queue.py
CHANGED
@@ -13,9 +13,8 @@ import time
 from judgeval.common.logger import judgeval_logger
 from judgeval.constants import MAX_CONCURRENT_EVALUATIONS
 from judgeval.data import ScoringResult
-from judgeval.evaluation_run import EvaluationRun
+from judgeval.data.evaluation_run import EvaluationRun
 from judgeval.utils.async_utils import safe_run_async
-from judgeval.scorers import BaseScorer
 from judgeval.scorers.score import a_execute_scoring
 
 
@@ -43,9 +42,8 @@ class LocalEvaluationQueue:
 
     def _process_run(self, evaluation_run: EvaluationRun) -> List[ScoringResult]:
         """Execute evaluation run locally and return results."""
-        local_scorers = [s for s in evaluation_run.scorers if isinstance(s, BaseScorer)]
 
-        if not
+        if not evaluation_run.custom_scorers:
             raise ValueError(
                 "LocalEvaluationQueue only supports runs with local scorers (BaseScorer). "
                 "Found only APIScorerConfig instances."
@@ -54,7 +52,7 @@ class LocalEvaluationQueue:
         return safe_run_async(
             a_execute_scoring(
                 evaluation_run.examples,
-
+                evaluation_run.custom_scorers,
                 model=evaluation_run.model,
                 throttle_value=0,
                 max_concurrent=self._max_concurrent // self._num_workers,