judgeval 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/cli.py +65 -0
- judgeval/common/api/api.py +44 -38
- judgeval/common/api/constants.py +18 -5
- judgeval/common/api/json_encoder.py +8 -9
- judgeval/common/tracer/core.py +278 -256
- judgeval/common/tracer/otel_span_processor.py +1 -1
- judgeval/common/tracer/span_processor.py +1 -1
- judgeval/common/tracer/span_transformer.py +2 -1
- judgeval/data/evaluation_run.py +104 -0
- judgeval/data/judgment_types.py +37 -8
- judgeval/data/trace.py +1 -0
- judgeval/data/trace_run.py +0 -2
- judgeval/integrations/langgraph.py +2 -1
- judgeval/judgment_client.py +102 -47
- judgeval/local_eval_queue.py +3 -5
- judgeval/run_evaluation.py +33 -192
- judgeval/scorers/base_scorer.py +9 -10
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +17 -3
- {judgeval-0.5.0.dist-info → judgeval-0.6.0.dist-info}/METADATA +3 -1
- {judgeval-0.5.0.dist-info → judgeval-0.6.0.dist-info}/RECORD +23 -21
- judgeval-0.6.0.dist-info/entry_points.txt +2 -0
- judgeval/evaluation_run.py +0 -80
- {judgeval-0.5.0.dist-info → judgeval-0.6.0.dist-info}/WHEEL +0 -0
- {judgeval-0.5.0.dist-info → judgeval-0.6.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/tracer/span_transformer.py
CHANGED
@@ -11,7 +11,7 @@ from pydantic import BaseModel
 
 from judgeval.common.api.json_encoder import json_encoder
 from judgeval.data import TraceSpan
-from judgeval.evaluation_run import EvaluationRun
+from judgeval.data.evaluation_run import EvaluationRun
 
 
 class SpanTransformer:
@@ -150,6 +150,7 @@ class SpanTransformer:
             "additional_metadata": judgment_data.get("additional_metadata"),
             "has_evaluation": judgment_data.get("has_evaluation", False),
             "agent_name": judgment_data.get("agent_name"),
+            "class_name": judgment_data.get("class_name"),
             "state_before": judgment_data.get("state_before"),
             "state_after": judgment_data.get("state_after"),
             "update_id": judgment_data.get("update_id", 1),
judgeval/data/evaluation_run.py
ADDED
@@ -0,0 +1,104 @@
+from typing import List, Optional, Union
+from pydantic import field_validator, model_validator, Field
+from datetime import datetime, timezone
+import uuid
+
+from judgeval.data import Example
+from judgeval.scorers import BaseScorer, APIScorerConfig
+from judgeval.constants import ACCEPTABLE_MODELS
+from judgeval.data.judgment_types import EvaluationRunJudgmentType
+
+
+class EvaluationRun(EvaluationRunJudgmentType):
+    """
+    Stores example and evaluation scorers together for running an eval task
+
+    Args:
+        project_name (str): The name of the project the evaluation results belong to
+        eval_name (str): A name for this evaluation run
+        examples (List[Example]): The examples to evaluate
+        scorers (List[Union[BaseScorer, APIScorerConfig]]): A list of scorers to use for evaluation
+        model (str): The model used as a judge when using LLM as a Judge
+        metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
+    """
+
+    id: Optional[str] = Field(default_factory=lambda: str(uuid.uuid4()))
+    created_at: Optional[str] = Field(
+        default_factory=lambda: datetime.now(timezone.utc).isoformat()
+    )
+    custom_scorers: Optional[List[BaseScorer]] = None
+    judgment_scorers: Optional[List[APIScorerConfig]] = None
+    organization_id: Optional[str] = None
+
+    def __init__(
+        self,
+        scorers: Optional[List[Union[BaseScorer, APIScorerConfig]]] = None,
+        **kwargs,
+    ):
+        """
+        Initialize EvaluationRun with automatic scorer classification.
+
+        Args:
+            scorers: List of scorers that will be automatically sorted into custom_scorers or judgment_scorers
+            **kwargs: Other initialization arguments
+        """
+        if scorers is not None:
+            # Automatically sort scorers into appropriate fields
+            custom_scorers = [s for s in scorers if isinstance(s, BaseScorer)]
+            judgment_scorers = [s for s in scorers if isinstance(s, APIScorerConfig)]
+
+            # Always set both fields as lists (even if empty) to satisfy validation
+            kwargs["custom_scorers"] = custom_scorers
+            kwargs["judgment_scorers"] = judgment_scorers
+
+        super().__init__(**kwargs)
+
+    def model_dump(self, **kwargs):
+        data = super().model_dump(**kwargs)
+        data["custom_scorers"] = [s.model_dump() for s in self.custom_scorers]
+        data["judgment_scorers"] = [s.model_dump() for s in self.judgment_scorers]
+        data["examples"] = [example.model_dump() for example in self.examples]
+
+        return data
+
+    @field_validator("examples")
+    def validate_examples(cls, v):
+        if not v:
+            raise ValueError("Examples cannot be empty.")
+        for item in v:
+            if not isinstance(item, Example):
+                raise ValueError(f"Item of type {type(item)} is not a Example")
+        return v
+
+    @model_validator(mode="after")
+    @classmethod
+    def validate_scorer_lists(cls, values):
+        custom_scorers = values.custom_scorers
+        judgment_scorers = values.judgment_scorers
+
+        # Check that both lists are not empty
+        if not custom_scorers and not judgment_scorers:
+            raise ValueError(
+                "At least one of custom_scorers or judgment_scorers must be provided."
+            )
+
+        # Check that only one list is filled
+        if custom_scorers and judgment_scorers:
+            raise ValueError(
+                "Only one of custom_scorers or judgment_scorers can be provided, not both."
+            )
+
+        return values
+
+    @field_validator("model")
+    def validate_model(cls, v, values):
+        if not v:
+            raise ValueError("Model cannot be empty.")
+
+        # Check if model is string or list of strings
+        if isinstance(v, str):
+            if v not in ACCEPTABLE_MODELS:
+                raise ValueError(
+                    f"Model name {v} not recognized. Please select a valid model name.)"
+                )
+        return v
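For orientation (not part of the diff): a minimal sketch of how the relocated EvaluationRun might be used in 0.6.0. The __init__ above sorts a mixed scorers list into custom_scorers (BaseScorer instances) and judgment_scorers (APIScorerConfig instances), and the validators then require exactly one of those lists to be non-empty. The example values and the FaithfulnessScorer/Example usage are assumptions carried over from earlier judgeval releases, not something this diff confirms.

# Hedged sketch: illustrative values; assumes FaithfulnessScorer and Example
# behave as in earlier judgeval releases.
from judgeval.data import Example
from judgeval.data.evaluation_run import EvaluationRun  # new import path in 0.6.0
from judgeval.scorers import FaithfulnessScorer

run = EvaluationRun(
    project_name="demo_project",
    eval_name="demo_run",
    examples=[Example(input="What if these shoes don't fit?",
                      actual_output="We offer a 30-day full refund.")],
    scorers=[FaithfulnessScorer(threshold=0.5)],  # routed into judgment_scorers
    model="gpt-4o",
)
assert run.judgment_scorers and not run.custom_scorers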
judgeval/data/judgment_types.py
CHANGED
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 # filename: openapi_new.json
-# timestamp: 2025-08-
+# timestamp: 2025-08-08T18:50:51+00:00
 
 from __future__ import annotations
 
@@ -51,6 +51,31 @@ class ScorerConfigJudgmentType(BaseModel):
     kwargs: Annotated[Optional[Dict[str, Any]], Field(title="Kwargs")] = None
 
 
+class BaseScorerJudgmentType(BaseModel):
+    score_type: Annotated[str, Field(title="Score Type")]
+    threshold: Annotated[Optional[float], Field(title="Threshold")] = 0.5
+    name: Annotated[Optional[str], Field(title="Name")] = None
+    class_name: Annotated[Optional[str], Field(title="Class Name")] = None
+    score: Annotated[Optional[float], Field(title="Score")] = None
+    score_breakdown: Annotated[
+        Optional[Dict[str, Any]], Field(title="Score Breakdown")
+    ] = None
+    reason: Annotated[Optional[str], Field(title="Reason")] = ""
+    using_native_model: Annotated[Optional[bool], Field(title="Using Native Model")] = (
+        None
+    )
+    success: Annotated[Optional[bool], Field(title="Success")] = None
+    model: Annotated[Optional[str], Field(title="Model")] = None
+    model_client: Annotated[Any, Field(title="Model Client")] = None
+    strict_mode: Annotated[Optional[bool], Field(title="Strict Mode")] = False
+    error: Annotated[Optional[str], Field(title="Error")] = None
+    additional_metadata: Annotated[
+        Optional[Dict[str, Any]], Field(title="Additional Metadata")
+    ] = None
+    user: Annotated[Optional[str], Field(title="User")] = None
+    server_hosted: Annotated[Optional[bool], Field(title="Server Hosted")] = False
+
+
 class TraceUsageJudgmentType(BaseModel):
     prompt_tokens: Annotated[Optional[int], Field(title="Prompt Tokens")] = None
     completion_tokens: Annotated[Optional[int], Field(title="Completion Tokens")] = None
@@ -90,16 +115,21 @@ class HTTPValidationErrorJudgmentType(BaseModel):
     ] = None
 
 
-class
+class EvaluationRunJudgmentType(BaseModel):
+    id: Annotated[Optional[str], Field(title="Id")] = None
     project_name: Annotated[Optional[str], Field(title="Project Name")] = None
     eval_name: Annotated[Optional[str], Field(title="Eval Name")] = None
     examples: Annotated[List[ExampleJudgmentType], Field(title="Examples")]
-
+    custom_scorers: Annotated[
+        Optional[List[BaseScorerJudgmentType]], Field(title="Custom Scorers")
+    ] = Field(default_factory=list)
+    judgment_scorers: Annotated[
+        Optional[List[ScorerConfigJudgmentType]], Field(title="Judgment Scorers")
+    ] = Field(default_factory=list)
     model: Annotated[str, Field(title="Model")]
-    append: Annotated[Optional[bool], Field(title="Append")] = False
-    override: Annotated[Optional[bool], Field(title="Override")] = False
     trace_span_id: Annotated[Optional[str], Field(title="Trace Span Id")] = None
     trace_id: Annotated[Optional[str], Field(title="Trace Id")] = None
+    created_at: Annotated[Optional[str], Field(title="Created At")] = None
 
 
 class TraceSpanJudgmentType(BaseModel):
@@ -123,6 +153,7 @@ class TraceSpanJudgmentType(BaseModel):
     ] = None
     has_evaluation: Annotated[Optional[bool], Field(title="Has Evaluation")] = False
    agent_name: Annotated[Optional[str], Field(title="Agent Name")] = None
+    class_name: Annotated[Optional[str], Field(title="Class Name")] = None
     state_before: Annotated[Optional[Dict[str, Any]], Field(title="State Before")] = (
         None
     )
@@ -172,8 +203,6 @@ class TraceRunJudgmentType(BaseModel):
     traces: Annotated[List[TraceJudgmentType], Field(title="Traces")]
     scorers: Annotated[List[ScorerConfigJudgmentType], Field(title="Scorers")]
     model: Annotated[str, Field(title="Model")]
-    append: Annotated[Optional[bool], Field(title="Append")] = False
-    override: Annotated[Optional[bool], Field(title="Override")] = False
     trace_span_id: Annotated[Optional[str], Field(title="Trace Span Id")] = None
     tools: Annotated[Optional[List[Dict[str, Any]]], Field(title="Tools")] = None
 
@@ -181,5 +210,5 @@
 class EvalResultsJudgmentType(BaseModel):
     results: Annotated[List[ScoringResultJudgmentType], Field(title="Results")]
     run: Annotated[
-        Union[TraceRunJudgmentType,
+        Union[TraceRunJudgmentType, EvaluationRunJudgmentType], Field(title="Run")
     ]
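As a quick illustration (not part of the diff), the newly generated BaseScorerJudgmentType only requires score_type, so a minimal instance serializes roughly as sketched below; the field values are illustrative.

from judgeval.data.judgment_types import BaseScorerJudgmentType

scorer = BaseScorerJudgmentType(score_type="my_custom_scorer", threshold=0.7)
# Only non-None fields survive; non-None defaults like reason="" and strict_mode=False remain.
print(scorer.model_dump(exclude_none=True))
# {'score_type': 'my_custom_scorer', 'threshold': 0.7, 'reason': '', 'strict_mode': False, 'server_hosted': False}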
judgeval/data/trace.py
CHANGED
@@ -32,6 +32,7 @@ class TraceSpan(TraceSpanJudgmentType):
             "usage": self.usage.model_dump() if self.usage else None,
             "has_evaluation": self.has_evaluation,
             "agent_name": self.agent_name,
+            "class_name": self.class_name,
             "state_before": self.state_before,
             "state_after": self.state_after,
             "additional_metadata": json_encoder(self.additional_metadata),
judgeval/data/trace_run.py
CHANGED
@@ -29,8 +29,6 @@ class TraceRun(BaseModel):
     scorers: List[Union[APIScorerConfig, BaseScorer]]
     model: Optional[str] = DEFAULT_GPT_MODEL
     trace_span_id: Optional[str] = None
-    append: Optional[bool] = False
-    override: Optional[bool] = False
     rules: Optional[List[Rule]] = None
     tools: Optional[List[Dict[str, Any]]] = None
 
judgeval/integrations/langgraph.py
CHANGED
@@ -133,7 +133,8 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         inputs: Optional[Dict[str, Any]] = None,
     ) -> None:
         """Start tracking a span, ensuring trace client exists"""
-
+        if name.startswith("__") and name.endswith("__"):
+            return
         start_time = time.time()
         span_id = str(uuid.uuid4())
         parent_span_id: Optional[str] = None
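The langgraph handler change above guards against starting spans for dunder-named events such as LangGraph's internal __start__ and __end__ nodes. A tiny illustration of the guard; the node names here are assumptions, not values from this diff.

# Names wrapped in double underscores are now skipped when a span would start.
for name in ["__start__", "__end__", "agent", "tool_call"]:
    skipped = name.startswith("__") and name.endswith("__")
    print(f"{name}: {'skipped' if skipped else 'traced'}")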
judgeval/judgment_client.py
CHANGED
@@ -4,6 +4,8 @@ Implements the JudgmentClient to interact with the Judgment API.
 
 from __future__ import annotations
 import os
+import importlib.util
+from pathlib import Path
 from uuid import uuid4
 from typing import Optional, List, Dict, Any, Union, Callable, TYPE_CHECKING
 
@@ -16,7 +18,7 @@ from judgeval.scorers import (
     APIScorerConfig,
     BaseScorer,
 )
-from judgeval.evaluation_run import EvaluationRun
+from judgeval.data.evaluation_run import EvaluationRun
 from judgeval.run_evaluation import (
     run_eval,
     assert_test,
@@ -95,8 +97,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_trace",
         model: Optional[str] = DEFAULT_GPT_MODEL,
-        append: bool = False,
-        override: bool = False,
     ) -> List[ScoringResult]:
         try:
             if examples and not function:
@@ -114,12 +114,11 @@
                 traces=traces,
                 scorers=scorers,
                 model=model,
-                append=append,
                 organization_id=self.organization_id,
                 tools=tools,
             )
             return run_trace_eval(
-                trace_run, self.judgment_api_key,
+                trace_run, self.judgment_api_key, function, tracer, examples
             )
         except ValueError as e:
             raise ValueError(
@@ -135,8 +134,6 @@
         model: Optional[str] = DEFAULT_GPT_MODEL,
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
-        override: bool = False,
-        append: bool = False,
     ) -> List[ScoringResult]:
         """
         Executes an evaluation of `Example`s using one or more `Scorer`s
@@ -147,21 +144,13 @@
             model (str): The model used as a judge when using LLM as a Judge
             project_name (str): The name of the project the evaluation results belong to
             eval_run_name (str): A name for this evaluation run
-            override (bool): Whether to override an existing evaluation run with the same name
-            append (bool): Whether to append to an existing evaluation run with the same name
 
         Returns:
             List[ScoringResult]: The results of the evaluation
         """
-        if override and append:
-            raise ValueError(
-                "Cannot set both override and append to True. Please choose one."
-            )
 
         try:
             eval = EvaluationRun(
-                append=append,
-                override=override,
                 project_name=project_name,
                 eval_name=eval_run_name,
                 examples=examples,
@@ -172,7 +161,6 @@
             return run_eval(
                 eval,
                 self.judgment_api_key,
-                override,
             )
         except ValueError as e:
             raise ValueError(
@@ -181,22 +169,6 @@
         except Exception as e:
             raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
 
-    def pull_eval(
-        self, project_name: str, eval_run_name: str
-    ) -> List[Dict[str, Union[str, List[ScoringResult]]]]:
-        """Pull evaluation results from the server.
-
-        Args:
-            project_name (str): Name of the project
-            eval_run_name (str): Name of the evaluation run
-
-        Returns:
-            Dict[str, Union[str, List[ScoringResult]]]: Dictionary containing:
-                - id (str): The evaluation run ID
-                - results (List[ScoringResult]): List of scoring results
-        """
-        return self.api_client.fetch_evaluation_results(project_name, eval_run_name)
-
     def create_project(self, project_name: str) -> bool:
         """
         Creates a project on the server.
@@ -222,8 +194,6 @@
         model: Optional[str] = DEFAULT_GPT_MODEL,
         project_name: str = "default_test",
         eval_run_name: str = str(uuid4()),
-        override: bool = False,
-        append: bool = False,
     ) -> None:
         """
         Asserts a test by running the evaluation and checking the results for success
@@ -234,9 +204,6 @@
             model (str): The model used as a judge when using LLM as a Judge
             project_name (str): The name of the project the evaluation results belong to
             eval_run_name (str): A name for this evaluation run
-            override (bool): Whether to override an existing evaluation run with the same name
-            append (bool): Whether to append to an existing evaluation run with the same name
-            async_execution (bool): Whether to run the evaluation asynchronously
         """
 
         results: List[ScoringResult]
@@ -247,8 +214,6 @@
             model=model,
             project_name=project_name,
            eval_run_name=eval_run_name,
-            override=override,
-            append=append,
         )
         assert_test(results)
 
@@ -263,9 +228,6 @@
         model: Optional[str] = DEFAULT_GPT_MODEL,
         project_name: str = "default_test",
         eval_run_name: str = str(uuid4()),
-        override: bool = False,
-        append: bool = False,
-        async_execution: bool = False,
     ) -> None:
         """
         Asserts a test by running the evaluation and checking the results for success
@@ -276,12 +238,9 @@
             model (str): The model used as a judge when using LLM as a Judge
             project_name (str): The name of the project the evaluation results belong to
             eval_run_name (str): A name for this evaluation run
-            override (bool): Whether to override an existing evaluation run with the same name
-            append (bool): Whether to append to an existing evaluation run with the same name
             function (Optional[Callable]): A function to use for evaluation
             tracer (Optional[Union[Tracer, BaseCallbackHandler]]): A tracer to use for evaluation
             tools (Optional[List[Dict[str, Any]]]): A list of tools to use for evaluation
-            async_execution (bool): Whether to run the evaluation asynchronously
         """
 
         # Check for enable_param_checking and tools
@@ -302,11 +261,107 @@
             model=model,
             project_name=project_name,
             eval_run_name=eval_run_name,
-            override=override,
-            append=append,
             function=function,
             tracer=tracer,
             tools=tools,
         )
 
         assert_test(results)
+
+    def _extract_scorer_name(self, scorer_file_path: str) -> str:
+        """Extract scorer name from the scorer file by importing it."""
+        try:
+            spec = importlib.util.spec_from_file_location(
+                "scorer_module", scorer_file_path
+            )
+            if spec is None or spec.loader is None:
+                raise ImportError(f"Could not load spec from {scorer_file_path}")
+
+            module = importlib.util.module_from_spec(spec)
+            spec.loader.exec_module(module)
+
+            for attr_name in dir(module):
+                attr = getattr(module, attr_name)
+                if (
+                    isinstance(attr, type)
+                    and any("Scorer" in str(base) for base in attr.__mro__)
+                    and attr.__module__ == "scorer_module"
+                ):
+                    try:
+                        # Instantiate the scorer and get its name
+                        scorer_instance = attr()
+                        if hasattr(scorer_instance, "name"):
+                            return scorer_instance.name
+                    except Exception:
+                        # Skip if instantiation fails
+                        continue
+
+            raise AttributeError("No scorer class found or could be instantiated")
+        except Exception as e:
+            judgeval_logger.warning(f"Could not extract scorer name: {e}")
+            return Path(scorer_file_path).stem
+
+    def save_custom_scorer(
+        self,
+        scorer_file_path: str,
+        requirements_file_path: Optional[str] = None,
+        unique_name: Optional[str] = None,
+    ) -> bool:
+        """
+        Upload custom ExampleScorer from files to backend.
+
+        Args:
+            scorer_file_path: Path to Python file containing CustomScorer class
+            requirements_file_path: Optional path to requirements.txt
+            unique_name: Optional unique identifier (auto-detected from scorer.name if not provided)
+
+        Returns:
+            bool: True if upload successful
+
+        Raises:
+            ValueError: If scorer file is invalid
+            FileNotFoundError: If scorer file doesn't exist
+        """
+        import os
+
+        if not os.path.exists(scorer_file_path):
+            raise FileNotFoundError(f"Scorer file not found: {scorer_file_path}")
+
+        # Auto-detect scorer name if not provided
+        if unique_name is None:
+            unique_name = self._extract_scorer_name(scorer_file_path)
+            judgeval_logger.info(f"Auto-detected scorer name: '{unique_name}'")
+
+        # Read scorer code
+        with open(scorer_file_path, "r") as f:
+            scorer_code = f.read()
+
+        # Read requirements (optional)
+        requirements_text = ""
+        if requirements_file_path and os.path.exists(requirements_file_path):
+            with open(requirements_file_path, "r") as f:
+                requirements_text = f.read()
+
+        # Upload to backend
+        judgeval_logger.info(
+            f"Uploading custom scorer: {unique_name}, this can take a couple of minutes..."
+        )
+        try:
+            response = self.api_client.upload_custom_scorer(
+                scorer_name=unique_name,
+                scorer_code=scorer_code,
+                requirements_text=requirements_text,
+            )
+
+            if response.get("status") == "success":
+                judgeval_logger.info(
+                    f"Successfully uploaded custom scorer: {unique_name}"
+                )
+                return True
+            else:
+                judgeval_logger.error(f"Failed to upload custom scorer: {unique_name}")
+                return False
+
+        except Exception as e:
+            judgeval_logger.error(f"Error uploading custom scorer: {e}")
+            raise
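A hedged usage sketch (not from the diff) of the two client-facing changes in this file: the new save_custom_scorer upload helper, and run_evaluation after the append/override parameters were removed. The file paths, example values, and the FaithfulnessScorer/Example usage are hypothetical placeholders carried over from earlier judgeval releases.

from judgeval import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import FaithfulnessScorer

client = JudgmentClient()

# Upload a locally defined custom scorer; "my_scorer.py" and "requirements.txt"
# are hypothetical paths. When unique_name is omitted, it is auto-detected from
# the scorer's `name` attribute.
client.save_custom_scorer(
    scorer_file_path="my_scorer.py",
    requirements_file_path="requirements.txt",
)

# run_evaluation in 0.6.0 no longer takes append/override.
results = client.run_evaluation(
    examples=[Example(input="What if these shoes don't fit?",
                      actual_output="We offer a 30-day full refund.")],
    scorers=[FaithfulnessScorer(threshold=0.5)],
    model="gpt-4o",
    project_name="demo_project",
    eval_run_name="demo_run",
)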
judgeval/local_eval_queue.py
CHANGED
@@ -13,9 +13,8 @@ import time
 from judgeval.common.logger import judgeval_logger
 from judgeval.constants import MAX_CONCURRENT_EVALUATIONS
 from judgeval.data import ScoringResult
-from judgeval.evaluation_run import EvaluationRun
+from judgeval.data.evaluation_run import EvaluationRun
 from judgeval.utils.async_utils import safe_run_async
-from judgeval.scorers import BaseScorer
 from judgeval.scorers.score import a_execute_scoring
 
 
@@ -43,9 +42,8 @@ class LocalEvaluationQueue:
 
     def _process_run(self, evaluation_run: EvaluationRun) -> List[ScoringResult]:
         """Execute evaluation run locally and return results."""
-        local_scorers = [s for s in evaluation_run.scorers if isinstance(s, BaseScorer)]
 
-        if not
+        if not evaluation_run.custom_scorers:
             raise ValueError(
                 "LocalEvaluationQueue only supports runs with local scorers (BaseScorer). "
                 "Found only APIScorerConfig instances."
@@ -54,7 +52,7 @@ class LocalEvaluationQueue:
         return safe_run_async(
             a_execute_scoring(
                 evaluation_run.examples,
-
+                evaluation_run.custom_scorers,
                 model=evaluation_run.model,
                 throttle_value=0,
                 max_concurrent=self._max_concurrent // self._num_workers,