judgeval 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +139 -12
- judgeval/api/__init__.py +501 -0
- judgeval/api/api_types.py +344 -0
- judgeval/cli.py +2 -4
- judgeval/constants.py +10 -26
- judgeval/data/evaluation_run.py +49 -26
- judgeval/data/example.py +2 -2
- judgeval/data/judgment_types.py +266 -82
- judgeval/data/result.py +4 -5
- judgeval/data/scorer_data.py +4 -2
- judgeval/data/tool.py +2 -2
- judgeval/data/trace.py +7 -50
- judgeval/data/trace_run.py +7 -4
- judgeval/{dataset.py → dataset/__init__.py} +43 -28
- judgeval/env.py +67 -0
- judgeval/{run_evaluation.py → evaluation/__init__.py} +29 -95
- judgeval/exceptions.py +27 -0
- judgeval/integrations/langgraph/__init__.py +788 -0
- judgeval/judges/__init__.py +2 -2
- judgeval/judges/litellm_judge.py +75 -15
- judgeval/judges/together_judge.py +86 -18
- judgeval/judges/utils.py +7 -21
- judgeval/{common/logger.py → logger.py} +8 -6
- judgeval/scorers/__init__.py +0 -4
- judgeval/scorers/agent_scorer.py +3 -7
- judgeval/scorers/api_scorer.py +8 -13
- judgeval/scorers/base_scorer.py +52 -32
- judgeval/scorers/example_scorer.py +1 -3
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -14
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +45 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +3 -3
- judgeval/scorers/score.py +21 -31
- judgeval/scorers/trace_api_scorer.py +5 -0
- judgeval/scorers/utils.py +1 -103
- judgeval/tracer/__init__.py +1075 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +37 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +43 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +67 -0
- judgeval/tracer/llm/__init__.py +1233 -0
- judgeval/{common/tracer → tracer/llm}/providers.py +5 -10
- judgeval/{local_eval_queue.py → tracer/local_eval_queue.py} +15 -10
- judgeval/tracer/managers.py +188 -0
- judgeval/tracer/processors/__init__.py +181 -0
- judgeval/tracer/utils.py +20 -0
- judgeval/trainer/__init__.py +5 -0
- judgeval/{common/trainer → trainer}/config.py +12 -9
- judgeval/{common/trainer → trainer}/console.py +2 -9
- judgeval/{common/trainer → trainer}/trainable_model.py +12 -7
- judgeval/{common/trainer → trainer}/trainer.py +119 -17
- judgeval/utils/async_utils.py +2 -3
- judgeval/utils/decorators.py +24 -0
- judgeval/utils/file_utils.py +37 -4
- judgeval/utils/guards.py +32 -0
- judgeval/utils/meta.py +14 -0
- judgeval/{common/api/json_encoder.py → utils/serialize.py} +7 -1
- judgeval/utils/testing.py +88 -0
- judgeval/utils/url.py +10 -0
- judgeval/{version_check.py → utils/version_check.py} +3 -3
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- {judgeval-0.8.0.dist-info → judgeval-0.9.0.dist-info}/METADATA +12 -14
- judgeval-0.9.0.dist-info/RECORD +80 -0
- judgeval/clients.py +0 -35
- judgeval/common/__init__.py +0 -13
- judgeval/common/api/__init__.py +0 -3
- judgeval/common/api/api.py +0 -375
- judgeval/common/api/constants.py +0 -186
- judgeval/common/exceptions.py +0 -27
- judgeval/common/storage/__init__.py +0 -6
- judgeval/common/storage/s3_storage.py +0 -97
- judgeval/common/tracer/__init__.py +0 -31
- judgeval/common/tracer/constants.py +0 -22
- judgeval/common/tracer/core.py +0 -2427
- judgeval/common/tracer/otel_exporter.py +0 -108
- judgeval/common/tracer/otel_span_processor.py +0 -188
- judgeval/common/tracer/span_processor.py +0 -37
- judgeval/common/tracer/span_transformer.py +0 -207
- judgeval/common/tracer/trace_manager.py +0 -101
- judgeval/common/trainer/__init__.py +0 -5
- judgeval/common/utils.py +0 -948
- judgeval/integrations/langgraph.py +0 -844
- judgeval/judges/mixture_of_judges.py +0 -287
- judgeval/judgment_client.py +0 -267
- judgeval/rules.py +0 -521
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
- judgeval/utils/alerts.py +0 -93
- judgeval/utils/requests.py +0 -50
- judgeval-0.8.0.dist-info/RECORD +0 -82
- {judgeval-0.8.0.dist-info → judgeval-0.9.0.dist-info}/WHEEL +0 -0
- {judgeval-0.8.0.dist-info → judgeval-0.9.0.dist-info}/entry_points.txt +0 -0
- {judgeval-0.8.0.dist-info → judgeval-0.9.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/{dataset.py → dataset/__init__.py} RENAMED

@@ -7,8 +7,9 @@ from typing import List, Literal, Optional

 from judgeval.data import Example, Trace
 from judgeval.utils.file_utils import get_examples_from_yaml, get_examples_from_json
-from judgeval.
-from judgeval.
+from judgeval.api import JudgmentSyncClient
+from judgeval.logger import judgeval_logger
+from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID


 @dataclass
@@ -17,8 +18,8 @@ class Dataset:
     traces: List[Trace]
     name: str
     project_name: str
-    judgment_api_key: str =
-    organization_id: str =
+    judgment_api_key: str = JUDGMENT_API_KEY or ""
+    organization_id: str = JUDGMENT_ORG_ID or ""

     @classmethod
     def get(
@@ -26,10 +27,14 @@ class Dataset:
         name: str,
         project_name: str,
     ):
-        client =
-        dataset = client.
+        client = JudgmentSyncClient(cls.judgment_api_key, cls.organization_id)
+        dataset = client.datasets_pull_for_judgeval(
+            {
+                "dataset_alias": name,
+                "project_name": project_name,
+            },
+        )
         if not dataset:
-            judgeval_logger.error(f"Dataset {name} not found in project {project_name}")
             raise ValueError(f"Dataset {name} not found in project {project_name}")
         examples = dataset.get("examples", [])
         for e in examples:
@@ -61,14 +66,17 @@ class Dataset:
         if not traces:
             traces = []

-        client =
-        client.
-
-
-
-
-
+        client = JudgmentSyncClient(cls.judgment_api_key, cls.organization_id)
+        client.datasets_push(
+            {
+                "dataset_alias": name,
+                "project_name": project_name,
+                "examples": [e.model_dump() for e in examples],  # type: ignore
+                "traces": [t.model_dump() for t in traces],  # type: ignore
+                "overwrite": overwrite,
+            }
         )
+
         judgeval_logger.info(f"Succesfull created dataset {name}!")
         return cls(
             name=name,
@@ -115,19 +123,30 @@ class Dataset:
         self.add_examples(examples)

     def add_examples(self, examples: List[Example]) -> None:
-        client =
-        client.
-
-
-
+        client = JudgmentSyncClient(self.judgment_api_key, self.organization_id)
+        client.datasets_insert_examples(
+            {
+                "dataset_alias": self.name,
+                "project_name": self.project_name,
+                "examples": [
+                    {
+                        "name": e.name,
+                        "created_at": e.created_at,
+                        "example_id": e.example_id,
+                    }
+                    for e in examples
+                ],
+            }
         )

     def add_traces(self, traces: List[Trace]) -> None:
-        client =
-        client.
-
-
-
+        client = JudgmentSyncClient(self.judgment_api_key, self.organization_id)
+        client.traces_add_to_dataset(
+            {
+                "dataset_alias": self.name,
+                "project_name": self.project_name,
+                "traces": [t.model_dump() for t in traces],  # type: ignore
+            }
        )

     def save_as(
@@ -174,10 +193,6 @@ class Dataset:
             f"Invalid file type: {file_type}. Please choose from {ACCEPTABLE_FILE_TYPES}"
         )

-    def delete(self):
-        client = JudgmentApiClient(self.judgment_api_key, self.organization_id)
-        client.delete_dataset(self.name, self.project_name)
-
     def __iter__(self):
         return iter(self.examples)

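The Dataset rework above replaces the old per-method API client with direct JudgmentSyncClient calls that take payload dictionaries. A minimal sketch of how the new payload-style calls might look from user code; the dataset and project names are hypothetical, and it assumes JUDGMENT_API_KEY / JUDGMENT_ORG_ID are set as in judgeval/env.py:

# Sketch only: pulling and pushing a dataset through the payload-style
# JudgmentSyncClient methods shown in the diff above.
import os

from judgeval.api import JudgmentSyncClient

client = JudgmentSyncClient(
    os.environ["JUDGMENT_API_KEY"],  # same credentials Dataset defaults to
    os.environ["JUDGMENT_ORG_ID"],
)

# Pull: per Dataset.get above, returns a dict with an "examples" key,
# or a falsy value when the dataset does not exist.
dataset = client.datasets_pull_for_judgeval(
    {"dataset_alias": "qa-regressions", "project_name": "my-project"}
)
if not dataset:
    raise ValueError("Dataset not found")

# Push: payload keys mirror what Dataset's creation path sends.
client.datasets_push(
    {
        "dataset_alias": "qa-regressions",
        "project_name": "my-project",
        "examples": dataset.get("examples", []),
        "traces": [],
        "overwrite": False,
    }
)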
judgeval/env.py ADDED

@@ -0,0 +1,67 @@
+from __future__ import annotations
+from dotenv import load_dotenv
+
+load_dotenv()
+
+import os
+from typing import overload
+
+
+@overload
+def optional_env_var(var_name: str) -> str | None: ...
+
+
+@overload
+def optional_env_var(var_name: str, default: str) -> str: ...
+
+
+def optional_env_var(var_name: str, default: str | None = None) -> str | None:
+    return os.getenv(var_name, default)
+
+
+JUDGMENT_API_KEY = optional_env_var("JUDGMENT_API_KEY")
+JUDGMENT_ORG_ID = optional_env_var("JUDGMENT_ORG_ID")
+JUDGMENT_API_URL = optional_env_var("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
+
+JUDGMENT_DEFAULT_GPT_MODEL = optional_env_var("JUDGMENT_DEFAULT_GPT_MODEL", "gpt-4.1")
+JUDGMENT_DEFAULT_TOGETHER_MODEL = optional_env_var(
+    "JUDGMENT_DEFAULT_TOGETHER_MODEL", "meta-llama/Meta-Llama-3-8B-Instruct-Lite"
+)
+JUDGMENT_MAX_CONCURRENT_EVALUATIONS = int(
+    optional_env_var("JUDGMENT_MAX_CONCURRENT_EVALUATIONS", "10")
+)
+
+JUDGMENT_S3_ACCESS_KEY_ID = optional_env_var("JUDGMENT_S3_ACCESS_KEY_ID")
+JUDGMENT_S3_SECRET_ACCESS_KEY = optional_env_var("JUDGMENT_S3_SECRET_ACCESS_KEY")
+JUDGMENT_S3_REGION_NAME = optional_env_var("JUDGMENT_S3_REGION_NAME")
+JUDGMENT_S3_BUCKET_NAME = optional_env_var("JUDGMENT_S3_BUCKET_NAME")
+JUDGMENT_S3_PREFIX = optional_env_var("JUDGMENT_S3_PREFIX", "spans/")
+JUDGMENT_S3_ENDPOINT_URL = optional_env_var("JUDGMENT_S3_ENDPOINT_URL")
+JUDGMENT_S3_SIGNATURE_VERSION = optional_env_var("JUDGMENT_S3_SIGNATURE_VERSION", "s3")
+JUDGMENT_S3_ADDRESSING_STYLE = optional_env_var("JUDGMENT_S3_ADDRESSING_STYLE", "auto")
+
+
+JUDGMENT_NO_COLOR = optional_env_var("JUDGMENT_NO_COLOR")
+
+
+TOGETHERAI_API_KEY = optional_env_var("TOGETHERAI_API_KEY")
+TOGETHER_API_KEY = optional_env_var("TOGETHER_API_KEY")
+
+__all__ = (
+    "JUDGMENT_API_KEY",
+    "JUDGMENT_ORG_ID",
+    "JUDGMENT_API_URL",
+    "JUDGMENT_DEFAULT_GPT_MODEL",
+    "JUDGMENT_DEFAULT_TOGETHER_MODEL",
+    "JUDGMENT_MAX_CONCURRENT_EVALUATIONS",
+    "JUDGMENT_S3_ACCESS_KEY_ID",
+    "JUDGMENT_S3_SECRET_ACCESS_KEY",
+    "JUDGMENT_S3_REGION_NAME",
+    "JUDGMENT_S3_BUCKET_NAME",
+    "JUDGMENT_S3_PREFIX",
+    "JUDGMENT_S3_ENDPOINT_URL",
+    "JUDGMENT_S3_ADDRESSING_STYLE",
+    "JUDGMENT_NO_COLOR",
+    "TOGETHERAI_API_KEY",
+    "TOGETHER_API_KEY",
+)
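The new env module centralizes configuration reads behind one typed helper. A standalone sketch of the same optional_env_var pattern (not importing judgeval), showing why the @overload pair lets type checkers infer str when a default is supplied and str | None otherwise:

# Sketch of the optional_env_var overload pattern from judgeval/env.py.
from __future__ import annotations

import os
from typing import overload


@overload
def optional_env_var(var_name: str) -> str | None: ...
@overload
def optional_env_var(var_name: str, default: str) -> str: ...
def optional_env_var(var_name: str, default: str | None = None) -> str | None:
    # Single runtime implementation; the @overload stubs exist only for typing.
    return os.getenv(var_name, default)


API_URL = optional_env_var("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")  # str
API_KEY = optional_env_var("JUDGMENT_API_KEY")  # str | None, may be unset
MAX_CONCURRENCY = int(optional_env_var("JUDGMENT_MAX_CONCURRENT_EVALUATIONS", "10"))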
judgeval/{run_evaluation.py → evaluation/__init__.py} RENAMED

@@ -6,19 +6,18 @@ import time
 import orjson
 import sys
 import threading
-from typing import List, Dict, Union, Tuple,
+from typing import List, Dict, Union, Tuple, TYPE_CHECKING
 from rich import print as rprint

 from judgeval.data import ScorerData, ScoringResult, Example
 from judgeval.scorers import BaseScorer, APIScorerConfig
 from judgeval.scorers.score import a_execute_scoring
-from judgeval.
-from judgeval.
-
+from judgeval.api import JudgmentSyncClient
+from judgeval.env import (
+    JUDGMENT_MAX_CONCURRENT_EVALUATIONS,
 )
-from judgeval.
-from judgeval.
-from judgeval.common.logger import judgeval_logger
+from judgeval.exceptions import JudgmentAPIError, JudgmentRuntimeError
+from judgeval.logger import judgeval_logger


 if TYPE_CHECKING:
@@ -48,72 +47,6 @@ def safe_run_async(coro):
     return asyncio.run(coro)


-def send_to_rabbitmq(evaluation_run: EvaluationRun) -> Dict[str, Any]:
-    """
-    Sends an evaluation run to the RabbitMQ evaluation queue.
-    """
-    if not evaluation_run.judgment_api_key or not evaluation_run.organization_id:
-        raise ValueError("API key and organization ID are required")
-    if not evaluation_run.eval_name or not evaluation_run.project_name:
-        raise ValueError("Eval name and project name are required")
-    api_client = JudgmentApiClient(
-        evaluation_run.judgment_api_key, evaluation_run.organization_id
-    )
-    return api_client.add_to_evaluation_queue(
-        evaluation_run.eval_name, evaluation_run.project_name
-    )
-
-
-def execute_api_eval(evaluation_run: EvaluationRun) -> Dict:
-    """
-    Executes an evaluation of a list of `Example`s using one or more `JudgmentScorer`s via the Judgment API.
-
-    Args:
-        evaluation_run (EvaluationRun): The evaluation run object containing the examples, scorers, and metadata
-
-    Returns:
-        List[Dict]: The results of the evaluation. Each result is a dictionary containing the fields of a `ScoringResult`
-        object.
-    """
-
-    try:
-        # submit API request to execute evals
-        if not evaluation_run.judgment_api_key or not evaluation_run.organization_id:
-            raise ValueError("API key and organization ID are required")
-        api_client = JudgmentApiClient(
-            evaluation_run.judgment_api_key, evaluation_run.organization_id
-        )
-        return api_client.run_evaluation(evaluation_run.model_dump())
-    except Exception as e:
-        judgeval_logger.error(f"Error: {e}")
-
-        details = "No details provided"
-        if isinstance(e, JudgmentAPIException):
-            details = e.response_json.get("detail", "No details provided")
-
-        raise JudgmentAPIError(
-            "An error occurred while executing the Judgment API request: " + details
-        )
-
-
-def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResult]:
-    """
-    Checks if any `ScoringResult` objects are missing `scorers_data`.
-
-    If any are missing, logs an error and returns the results.
-    """
-    for i, result in enumerate(results):
-        if not result.scorers_data:
-            judgeval_logger.error(
-                f"Scorer data is missing for example {i}. "
-                "This is usually caused when the example does not contain "
-                "the fields required by the scorer. "
-                "Check that your example contains the fields required by the scorers. "
-                "TODO add docs link here for reference."
-            )
-    return results
-
-
 def log_evaluation_results(
     scoring_results: List[ScoringResult],
     run: EvaluationRun,
@@ -135,17 +68,19 @@ def log_evaluation_results(
         if not judgment_api_key or not run.organization_id:
             raise ValueError("API key and organization ID are required")

-        api_client =
-        response = api_client.
-
-
+        api_client = JudgmentSyncClient(judgment_api_key, run.organization_id)
+        response = api_client.log_eval_results(
+            {
+                "results": scoring_results,  # type: ignore
+                "run": run.model_dump(warnings=False),  # type: ignore
+            }
         )
         url = response.get("ui_results_url")
         return url

     except Exception as e:
         judgeval_logger.error(f"Failed to save evaluation results to DB: {str(e)}")
-        raise
+        raise JudgmentRuntimeError(
             f"Request failed while saving evaluation results to DB: {str(e)}"
         )

@@ -209,7 +144,7 @@ def _poll_evaluation_until_complete(
     """
     poll_count = 0
    exception_count = 0
-    api_client =
+    api_client = JudgmentSyncClient(judgment_api_key, organization_id)
     while poll_count < max_poll_count:
         poll_count += 1
         try:
@@ -222,8 +157,11 @@ def _poll_evaluation_until_complete(
                 time.sleep(poll_interval_seconds)
                 continue

-            results_response = api_client.
-
+            results_response = api_client.fetch_experiment_run(
+                {
+                    "experiment_run_id": experiment_run_id,
+                    "project_name": project_name,
+                }
             )
             url = results_response.get("ui_results_url")

@@ -264,13 +202,13 @@ def _poll_evaluation_until_complete(

             judgeval_logger.error(f"Error checking evaluation status: {str(e)}")
             if exception_count > max_failures:
-                raise
+                raise JudgmentRuntimeError(
                     f"Error checking evaluation status after {poll_count} attempts: {str(e)}"
                 )

             time.sleep(poll_interval_seconds)

-    raise
+    raise JudgmentRuntimeError(
         f"Error checking evaluation status after {poll_count} attempts"
     )

@@ -286,15 +224,12 @@ def progress_logger(stop_event, msg="Working...", interval=5):
 def run_eval(
     evaluation_run: EvaluationRun,
     judgment_api_key: str,
-    show_url: bool = True,
 ) -> List[ScoringResult]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s

     Args:
         evaluation_run (EvaluationRun): Stores example and evaluation together for running
-        judgment_api_key (str): API key for authentication
-        show_url (bool): Whether to display the evaluation results URL. Defaults to True.

     Returns:
         List[ScoringResult]: A list of ScoringResult objects
@@ -339,11 +274,11 @@ def run_eval(
     )
     t.start()
     try:
-        api_client =
+        api_client = JudgmentSyncClient(
             judgment_api_key, evaluation_run.organization_id
         )
-        response = api_client.
-            evaluation_run.model_dump(warnings=False)
+        response = api_client.add_to_run_eval_queue_examples(
+            evaluation_run.model_dump(warnings=False)  # type: ignore
         )

         if not response.get("success", False):
@@ -351,7 +286,7 @@ def run_eval(
             judgeval_logger.error(
                 f"Error adding evaluation to queue: {error_message}"
             )
-            raise
+            raise JudgmentRuntimeError(error_message)

         num_scorers = (
             len(evaluation_run.judgment_scorers)
@@ -375,7 +310,7 @@ def run_eval(
                 evaluation_run.custom_scorers,
                 model=evaluation_run.model,
                 throttle_value=0,
-                max_concurrent=
+                max_concurrent=JUDGMENT_MAX_CONCURRENT_EVALUATIONS,
             )
         )

@@ -383,10 +318,9 @@ def run_eval(
             scoring_result.model_dump(warnings=False) for scoring_result in results
         ]
         url = log_evaluation_results(send_results, evaluation_run, judgment_api_key)
-
-
-
-        )
+        rprint(
+            f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
+        )
         return results

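The evaluation module now queues a run via add_to_run_eval_queue_examples and then polls fetch_experiment_run with bounded retries, raising JudgmentRuntimeError instead of a bare raise. A standalone sketch of that bounded polling shape; fetch_once and the defaults are placeholders, only the retry/raise structure mirrors the diff:

# Sketch of the bounded polling pattern used by _poll_evaluation_until_complete.
import time
from typing import Callable, Optional


class JudgmentRuntimeError(RuntimeError): ...


def poll_until_complete(
    fetch_once: Callable[[], Optional[str]],
    max_poll_count: int = 60,
    max_failures: int = 5,
    poll_interval_seconds: float = 5.0,
) -> str:
    poll_count = 0
    exception_count = 0
    while poll_count < max_poll_count:
        poll_count += 1
        try:
            url = fetch_once()  # e.g. wraps api_client.fetch_experiment_run(...)
            if url is not None:
                return url
        except Exception as e:
            exception_count += 1
            if exception_count > max_failures:
                raise JudgmentRuntimeError(
                    f"Error checking evaluation status after {poll_count} attempts: {e}"
                )
        time.sleep(poll_interval_seconds)
    raise JudgmentRuntimeError(
        f"Error checking evaluation status after {poll_count} attempts"
    )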
judgeval/exceptions.py ADDED

@@ -0,0 +1,27 @@
+from __future__ import annotations
+
+from httpx import HTTPError, Response
+
+
+class JudgmentAPIError(HTTPError):
+    status_code: int
+    detail: str
+    response: Response
+
+    def __init__(self, status_code: int, detail: str, response: Response):
+        self.status_code = status_code
+        self.detail = detail
+        self.response = response
+        super().__init__(f"{status_code}: {detail}")
+
+
+class JudgmentTestError(Exception): ...
+
+
+class JudgmentRuntimeError(RuntimeError): ...
+
+
+class InvalidJudgeModelError(Exception): ...
+
+
+__all__ = ("JudgmentAPIError", "JudgmentRuntimeError", "InvalidJudgeModelError")