judgeval 0.5.0__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/cli.py +65 -0
- judgeval/common/api/api.py +44 -38
- judgeval/common/api/constants.py +18 -5
- judgeval/common/api/json_encoder.py +8 -9
- judgeval/common/tracer/core.py +448 -256
- judgeval/common/tracer/otel_span_processor.py +1 -1
- judgeval/common/tracer/span_processor.py +1 -1
- judgeval/common/tracer/span_transformer.py +2 -1
- judgeval/common/tracer/trace_manager.py +6 -1
- judgeval/common/trainer/__init__.py +5 -0
- judgeval/common/trainer/config.py +125 -0
- judgeval/common/trainer/console.py +151 -0
- judgeval/common/trainer/trainable_model.py +238 -0
- judgeval/common/trainer/trainer.py +301 -0
- judgeval/data/evaluation_run.py +104 -0
- judgeval/data/judgment_types.py +37 -8
- judgeval/data/trace.py +1 -0
- judgeval/data/trace_run.py +0 -2
- judgeval/integrations/langgraph.py +2 -1
- judgeval/judgment_client.py +90 -135
- judgeval/local_eval_queue.py +3 -5
- judgeval/run_evaluation.py +43 -299
- judgeval/scorers/base_scorer.py +9 -10
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +17 -3
- {judgeval-0.5.0.dist-info → judgeval-0.7.0.dist-info}/METADATA +10 -47
- {judgeval-0.5.0.dist-info → judgeval-0.7.0.dist-info}/RECORD +29 -22
- judgeval-0.7.0.dist-info/entry_points.txt +2 -0
- judgeval/evaluation_run.py +0 -80
- {judgeval-0.5.0.dist-info → judgeval-0.7.0.dist-info}/WHEEL +0 -0
- {judgeval-0.5.0.dist-info → judgeval-0.7.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/cli.py
ADDED
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+
+import typer
+from pathlib import Path
+from dotenv import load_dotenv
+from judgeval.common.logger import judgeval_logger
+from judgeval.judgment_client import JudgmentClient
+
+load_dotenv()
+
+app = typer.Typer(
+    no_args_is_help=True,
+    rich_markup_mode=None,
+    rich_help_panel=None,
+    pretty_exceptions_enable=False,
+    pretty_exceptions_show_locals=False,
+    pretty_exceptions_short=False,
+)
+
+
+@app.command("upload_scorer")
+def upload_scorer(
+    scorer_file_path: str,
+    requirements_file_path: str,
+    unique_name: str = typer.Option(
+        None, help="Custom name for the scorer (auto-detected if not provided)"
+    ),
+):
+    # Validate file paths
+    if not Path(scorer_file_path).exists():
+        judgeval_logger.error(f"Scorer file not found: {scorer_file_path}")
+        raise typer.Exit(1)
+
+    if not Path(requirements_file_path).exists():
+        judgeval_logger.error(f"Requirements file not found: {requirements_file_path}")
+        raise typer.Exit(1)
+
+    try:
+        client = JudgmentClient()
+
+        result = client.upload_custom_scorer(
+            scorer_file_path=scorer_file_path,
+            requirements_file_path=requirements_file_path,
+            unique_name=unique_name,
+        )
+
+        if not result:
+            judgeval_logger.error("Failed to upload custom scorer")
+            raise typer.Exit(1)
+
+        raise typer.Exit(0)
+    except Exception:
+        raise
+
+
+@app.command()
+def version():
+    """Show version info"""
+    judgeval_logger.info("JudgEval CLI v0.0.0")
+
+
+if __name__ == "__main__":
+    app()
+
+# judgeval upload_scorer /Users/alanzhang/repo/JudgmentLabs/judgeval/src/demo/profile_match_scorer.py /Users/alanzhang/repo/JudgmentLabs/judgeval/src/demo/requirements.txt
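A minimal, hypothetical smoke test for the new CLI (none of this is in the wheel): it drives the Typer app in-process with typer.testing.CliRunner. The file names and the --unique-name value are invented, and a real invocation also needs valid Judgment credentials in the environment, since upload_scorer constructs a JudgmentClient.

from pathlib import Path
from typer.testing import CliRunner

from judgeval.cli import app

runner = CliRunner()


def smoke_test(tmp_dir: Path) -> None:
    scorer = tmp_dir / "my_scorer.py"    # placeholder scorer module
    reqs = tmp_dir / "requirements.txt"  # placeholder requirements file
    scorer.write_text("# scorer implementation goes here\n")
    reqs.write_text("pydantic\n")

    # Equivalent to: judgeval upload_scorer <scorer> <reqs> --unique-name my-scorer
    result = runner.invoke(
        app, ["upload_scorer", str(scorer), str(reqs), "--unique-name", "my-scorer"]
    )
    print(result.exit_code, result.output)

The new judgeval-0.7.0.dist-info/entry_points.txt (+2 lines) presumably registers a judgeval console script pointing at this app, which is what makes the judgeval command in the trailing comment available after install.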
judgeval/common/api/api.py
CHANGED
@@ -20,13 +20,11 @@ from judgeval.common.api.constants import (
     JUDGMENT_EVAL_DELETE_API_URL,
     JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
     JUDGMENT_GET_EVAL_STATUS_API_URL,
-    JUDGMENT_CHECK_EXPERIMENT_TYPE_API_URL,
-    JUDGMENT_EVAL_RUN_NAME_EXISTS_API_URL,
     JUDGMENT_SCORER_SAVE_API_URL,
     JUDGMENT_SCORER_FETCH_API_URL,
     JUDGMENT_SCORER_EXISTS_API_URL,
+    JUDGMENT_CUSTOM_SCORER_UPLOAD_API_URL,
     JUDGMENT_DATASETS_APPEND_TRACES_API_URL,
-    JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL,
 )
 from judgeval.common.api.constants import (
     TraceFetchPayload,
@@ -45,12 +43,11 @@ from judgeval.common.api.constants import (
     DeleteEvalRunRequestBody,
     EvalLogPayload,
     EvalStatusPayload,
-    CheckExperimentTypePayload,
-    EvalRunNameExistsPayload,
     ScorerSavePayload,
     ScorerFetchPayload,
     ScorerExistsPayload,
-    CheckExampleKeysPayload,
+    CustomScorerUploadPayload,
+    CustomScorerTemplateResponse,
 )
 from judgeval.utils.requests import requests
 from judgeval.common.api.json_encoder import json_encoder
@@ -97,14 +94,20 @@ class JudgmentApiClient:
         method: Literal["POST", "PATCH", "GET", "DELETE"],
         url: str,
         payload: Any,
+        timeout: Optional[Union[float, tuple]] = None,
     ) -> Any:
+        # Prepare request kwargs with optional timeout
+        request_kwargs = self._request_kwargs()
+        if timeout is not None:
+            request_kwargs["timeout"] = timeout
+
         if method == "GET":
             r = requests.request(
                 method,
                 url,
                 params=payload,
                 headers=self._headers(),
-                **self._request_kwargs(),
+                **request_kwargs,
             )
         else:
             r = requests.request(
@@ -112,7 +115,7 @@ class JudgmentApiClient:
                 url,
                 json=json_encoder(payload),
                 headers=self._headers(),
-                **self._request_kwargs(),
+                **request_kwargs,
             )

         try:
@@ -186,10 +189,10 @@ class JudgmentApiClient:
         payload: EvalLogPayload = {"results": results, "run": run}
         return self._do_request("POST", JUDGMENT_EVAL_LOG_API_URL, payload)

-    def fetch_evaluation_results(self, …
+    def fetch_evaluation_results(self, experiment_run_id: str, project_name: str):
         payload: EvalRunRequestBody = {
             "project_name": project_name,
-            "…
+            "experiment_run_id": experiment_run_id,
         }
         return self._do_request("POST", JUDGMENT_EVAL_FETCH_API_URL, payload)

@@ -204,43 +207,21 @@ class JudgmentApiClient:
     def add_to_evaluation_queue(self, payload: Dict[str, Any]):
         return self._do_request("POST", JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL, payload)

-    def get_evaluation_status(self, …
+    def get_evaluation_status(self, experiment_run_id: str, project_name: str):
         payload: EvalStatusPayload = {
-            "…
+            "experiment_run_id": experiment_run_id,
             "project_name": project_name,
             "judgment_api_key": self.api_key,
         }
         return self._do_request("GET", JUDGMENT_GET_EVAL_STATUS_API_URL, payload)

-    def check_experiment_type(self, …
-        payload: CheckExperimentTypePayload = {
-            …
-            "project_name": project_name,
-            "judgment_api_key": self.api_key,
-            "is_trace": is_trace,
-        }
-        return self._do_request("POST", JUDGMENT_CHECK_EXPERIMENT_TYPE_API_URL, payload)
-
-    def check_eval_run_name_exists(self, eval_name: str, project_name: str):
-        payload: EvalRunNameExistsPayload = {
-            "eval_name": eval_name,
-            "project_name": project_name,
-            "judgment_api_key": self.api_key,
-        }
-        return self._do_request("POST", JUDGMENT_EVAL_RUN_NAME_EXISTS_API_URL, payload)
-
-    def check_example_keys(self, keys: List[str], eval_name: str, project_name: str):
-        payload: CheckExampleKeysPayload = {
-            "keys": keys,
-            "eval_name": eval_name,
-            "project_name": project_name,
-        }
-        return self._do_request("POST", JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL, payload)
-
-    def save_scorer(self, name: str, prompt: str, options: Optional[dict] = None):
+    def save_scorer(
+        self, name: str, prompt: str, threshold: float, options: Optional[dict] = None
+    ):
         payload: ScorerSavePayload = {
             "name": name,
             "prompt": prompt,
+            "threshold": threshold,
             "options": options,
         }
         try:
@@ -292,6 +273,31 @@ class JudgmentApiClient:
                 request=e.request,
             )

+    def upload_custom_scorer(
+        self,
+        scorer_name: str,
+        scorer_code: str,
+        requirements_text: str,
+    ) -> CustomScorerTemplateResponse:
+        """Upload custom scorer to backend"""
+        payload: CustomScorerUploadPayload = {
+            "scorer_name": scorer_name,
+            "scorer_code": scorer_code,
+            "requirements_text": requirements_text,
+        }
+
+        try:
+            # Use longer timeout for custom scorer upload (5 minutes)
+            response = self._do_request(
+                "POST",
+                JUDGMENT_CUSTOM_SCORER_UPLOAD_API_URL,
+                payload,
+                timeout=(10, 300),
+            )
+            return response
+        except JudgmentAPIException as e:
+            raise e
+
     def push_dataset(
         self,
         dataset_alias: str,
judgeval/common/api/constants.py
CHANGED
@@ -49,9 +49,9 @@ JUDGMENT_EVAL_DELETE_API_URL = (
 JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
 JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
 JUDGMENT_GET_EVAL_STATUS_API_URL = f"{ROOT_API}/get_evaluation_status/"
-JUDGMENT_CHECK_EXPERIMENT_TYPE_API_URL = …
-JUDGMENT_EVAL_RUN_NAME_EXISTS_API_URL = …
-JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL = …
+
+# Custom Scorers API
+JUDGMENT_CUSTOM_SCORER_UPLOAD_API_URL = f"{ROOT_API}/upload_scorer/"


 # Evaluation API Payloads
@@ -73,9 +73,9 @@ class EvalLogPayload(TypedDict):


 class EvalStatusPayload(TypedDict):
-    …
-    project_name: str
+    experiment_run_id: str
     judgment_api_key: str
+    project_name: str


 class CheckExperimentTypePayload(TypedDict):
@@ -162,6 +162,7 @@ JUDGMENT_SCORER_EXISTS_API_URL = f"{ROOT_API}/scorer_exists/"
 class ScorerSavePayload(TypedDict):
     name: str
     prompt: str
+    threshold: float
     options: Optional[dict]


@@ -171,3 +172,15 @@ class ScorerFetchPayload(TypedDict):

 class ScorerExistsPayload(TypedDict):
     name: str
+
+
+class CustomScorerUploadPayload(TypedDict):
+    scorer_name: str
+    scorer_code: str
+    requirements_text: str
+
+
+class CustomScorerTemplateResponse(TypedDict):
+    scorer_name: str
+    status: str
+    message: str
judgeval/common/api/json_encoder.py
CHANGED
@@ -84,7 +84,7 @@ def json_encoder(
     )

     # Sequences
-    if isinstance(obj, (list, set, frozenset, …
+    if isinstance(obj, (list, set, frozenset, tuple, deque)):
         return _dump_sequence(
             obj=obj,
         )
@@ -169,16 +169,15 @@ def _dump_other(
     obj: Any,
 ) -> Any:
     """
-    Dump an object to a …
+    Dump an object to a representation without iterating it.
+
+    Avoids calling dict(obj) which can consume iterators/generators or
+    invoke user-defined iteration protocols.
     """
     try:
-        data = dict(obj)
-    except Exception:
         return repr(obj)
-    …
-    …
-        data,
-    )
+    except Exception:
+        return str(obj)


 def iso_format(o: Union[datetime.date, datetime.time]) -> str:
@@ -218,7 +217,7 @@ ENCODERS_BY_TYPE: Dict[Type[Any], Callable[[Any], Any]] = {
     Enum: lambda o: o.value,
     frozenset: list,
     deque: list,
-    GeneratorType: …
+    GeneratorType: repr,
     Path: str,
     Pattern: lambda o: o.pattern,
     SecretBytes: str,