judgeval 0.0.11__py3-none-any.whl → 0.22.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of judgeval might be problematic.
- judgeval/__init__.py +177 -12
- judgeval/api/__init__.py +519 -0
- judgeval/api/api_types.py +407 -0
- judgeval/cli.py +79 -0
- judgeval/constants.py +76 -47
- judgeval/data/__init__.py +3 -3
- judgeval/data/evaluation_run.py +125 -0
- judgeval/data/example.py +15 -56
- judgeval/data/judgment_types.py +450 -0
- judgeval/data/result.py +29 -73
- judgeval/data/scorer_data.py +29 -62
- judgeval/data/scripts/fix_default_factory.py +23 -0
- judgeval/data/scripts/openapi_transform.py +123 -0
- judgeval/data/trace.py +121 -0
- judgeval/dataset/__init__.py +264 -0
- judgeval/env.py +52 -0
- judgeval/evaluation/__init__.py +344 -0
- judgeval/exceptions.py +27 -0
- judgeval/integrations/langgraph/__init__.py +13 -0
- judgeval/integrations/openlit/__init__.py +50 -0
- judgeval/judges/__init__.py +2 -3
- judgeval/judges/base_judge.py +2 -3
- judgeval/judges/litellm_judge.py +100 -20
- judgeval/judges/together_judge.py +101 -20
- judgeval/judges/utils.py +20 -24
- judgeval/logger.py +62 -0
- judgeval/prompt/__init__.py +330 -0
- judgeval/scorers/__init__.py +18 -25
- judgeval/scorers/agent_scorer.py +17 -0
- judgeval/scorers/api_scorer.py +45 -41
- judgeval/scorers/base_scorer.py +83 -38
- judgeval/scorers/example_scorer.py +17 -0
- judgeval/scorers/exceptions.py +1 -0
- judgeval/scorers/judgeval_scorers/__init__.py +0 -148
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +19 -17
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +13 -19
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +12 -19
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +13 -19
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +15 -0
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +327 -0
- judgeval/scorers/score.py +77 -306
- judgeval/scorers/utils.py +4 -199
- judgeval/tracer/__init__.py +1122 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +40 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +59 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +63 -0
- judgeval/tracer/llm/__init__.py +7 -0
- judgeval/tracer/llm/config.py +78 -0
- judgeval/tracer/llm/constants.py +9 -0
- judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
- judgeval/tracer/llm/llm_anthropic/config.py +6 -0
- judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
- judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
- judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
- judgeval/tracer/llm/llm_google/__init__.py +3 -0
- judgeval/tracer/llm/llm_google/config.py +6 -0
- judgeval/tracer/llm/llm_google/generate_content.py +127 -0
- judgeval/tracer/llm/llm_google/wrapper.py +30 -0
- judgeval/tracer/llm/llm_openai/__init__.py +3 -0
- judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
- judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
- judgeval/tracer/llm/llm_openai/config.py +6 -0
- judgeval/tracer/llm/llm_openai/responses.py +506 -0
- judgeval/tracer/llm/llm_openai/utils.py +42 -0
- judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
- judgeval/tracer/llm/llm_together/__init__.py +3 -0
- judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
- judgeval/tracer/llm/llm_together/config.py +6 -0
- judgeval/tracer/llm/llm_together/wrapper.py +52 -0
- judgeval/tracer/llm/providers.py +19 -0
- judgeval/tracer/managers.py +167 -0
- judgeval/tracer/processors/__init__.py +220 -0
- judgeval/tracer/utils.py +19 -0
- judgeval/trainer/__init__.py +14 -0
- judgeval/trainer/base_trainer.py +122 -0
- judgeval/trainer/config.py +128 -0
- judgeval/trainer/console.py +144 -0
- judgeval/trainer/fireworks_trainer.py +396 -0
- judgeval/trainer/trainable_model.py +243 -0
- judgeval/trainer/trainer.py +70 -0
- judgeval/utils/async_utils.py +39 -0
- judgeval/utils/decorators/__init__.py +0 -0
- judgeval/utils/decorators/dont_throw.py +37 -0
- judgeval/utils/decorators/use_once.py +13 -0
- judgeval/utils/file_utils.py +97 -0
- judgeval/utils/guards.py +36 -0
- judgeval/utils/meta.py +27 -0
- judgeval/utils/project.py +15 -0
- judgeval/utils/serialize.py +253 -0
- judgeval/utils/testing.py +70 -0
- judgeval/utils/url.py +10 -0
- judgeval/utils/version_check.py +28 -0
- judgeval/utils/wrappers/README.md +3 -0
- judgeval/utils/wrappers/__init__.py +15 -0
- judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
- judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
- judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
- judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
- judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
- judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
- judgeval/utils/wrappers/py.typed +0 -0
- judgeval/utils/wrappers/utils.py +35 -0
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- judgeval-0.22.2.dist-info/METADATA +265 -0
- judgeval-0.22.2.dist-info/RECORD +112 -0
- judgeval-0.22.2.dist-info/entry_points.txt +2 -0
- judgeval/clients.py +0 -39
- judgeval/common/__init__.py +0 -8
- judgeval/common/exceptions.py +0 -28
- judgeval/common/logger.py +0 -189
- judgeval/common/tracer.py +0 -798
- judgeval/common/utils.py +0 -763
- judgeval/data/api_example.py +0 -111
- judgeval/data/datasets/__init__.py +0 -5
- judgeval/data/datasets/dataset.py +0 -286
- judgeval/data/datasets/eval_dataset_client.py +0 -193
- judgeval/data/datasets/ground_truth.py +0 -54
- judgeval/data/datasets/utils.py +0 -74
- judgeval/evaluation_run.py +0 -132
- judgeval/judges/mixture_of_judges.py +0 -248
- judgeval/judgment_client.py +0 -354
- judgeval/run_evaluation.py +0 -439
- judgeval/scorers/judgeval_scorer.py +0 -140
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -22
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -32
- judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
- judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -54
- judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -24
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -277
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -325
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -263
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -550
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -157
- judgeval/scorers/prompt_scorer.py +0 -439
- judgeval-0.0.11.dist-info/METADATA +0 -36
- judgeval-0.0.11.dist-info/RECORD +0 -84
- {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
- {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
judgeval/judgment_client.py
DELETED
@@ -1,354 +0,0 @@
-"""
-Implements the JudgmentClient to interact with the Judgment API.
-"""
-import os
-from typing import Optional, List, Dict, Any, Union
-import requests
-
-from judgeval.constants import ROOT_API
-from judgeval.data.datasets import EvalDataset, EvalDatasetClient
-from judgeval.data import (
-    ScoringResult,
-    Example
-)
-from judgeval.scorers import (
-    APIJudgmentScorer,
-    JudgevalScorer,
-    ClassifierScorer,
-    ScorerWrapper
-)
-from judgeval.evaluation_run import EvaluationRun
-from judgeval.run_evaluation import (
-    run_eval,
-    assert_test
-)
-from judgeval.judges import JudgevalJudge
-from judgeval.constants import JUDGMENT_EVAL_FETCH_API_URL, JUDGMENT_EVAL_DELETE_API_URL, JUDGMENT_EVAL_DELETE_PROJECT_API_URL
-from judgeval.common.exceptions import JudgmentAPIError
-from pydantic import BaseModel
-
-class EvalRunRequestBody(BaseModel):
-    eval_name: str
-    project_name: str
-    judgment_api_key: str
-
-
-class JudgmentClient:
-    def __init__(self, judgment_api_key: str = os.getenv("JUDGMENT_API_KEY")):
-        self.judgment_api_key = judgment_api_key
-        self.eval_dataset_client = EvalDatasetClient(judgment_api_key)
-
-        # Verify API key is valid
-        result, response = self._validate_api_key()
-        if not result:
-            # May be bad to output their invalid API key...
-            raise JudgmentAPIError(f"Issue with passed in Judgment API key: {response}")
-        else:
-            print(f"Successfully initialized JudgmentClient, welcome back {response.get('detail', {}).get('user_name', 'user')}!")
-
-    def run_evaluation(
-        self,
-        examples: List[Example],
-        scorers: List[Union[ScorerWrapper, JudgevalScorer]],
-        model: Union[str, List[str], JudgevalJudge],
-        aggregator: Optional[str] = None,
-        metadata: Optional[Dict[str, Any]] = None,
-        log_results: bool = True,
-        project_name: str = "default_project",
-        eval_run_name: str = "default_eval_run",
-        override: bool = False,
-        use_judgment: bool = True
-    ) -> List[ScoringResult]:
-        """
-        Executes an evaluation of `Example`s using one or more `Scorer`s
-        """
-        try:
-            # Load appropriate implementations for all scorers
-            loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = [
-                scorer.load_implementation(use_judgment=use_judgment) if isinstance(scorer, ScorerWrapper) else scorer
-                for scorer in scorers
-            ]
-
-            eval = EvaluationRun(
-                log_results=log_results,
-                project_name=project_name,
-                eval_name=eval_run_name,
-                examples=examples,
-                scorers=loaded_scorers,
-                model=model,
-                aggregator=aggregator,
-                metadata=metadata,
-                judgment_api_key=self.judgment_api_key
-            )
-            return run_eval(eval, override)
-        except ValueError as e:
-            raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
-
-    def evaluate_dataset(
-        self,
-        dataset: EvalDataset,
-        scorers: List[Union[ScorerWrapper, JudgevalScorer]],
-        model: Union[str, List[str], JudgevalJudge],
-        aggregator: Optional[str] = None,
-        metadata: Optional[Dict[str, Any]] = None,
-        project_name: str = "",
-        eval_run_name: str = "",
-        log_results: bool = False,
-        use_judgment: bool = True
-    ) -> List[ScoringResult]:
-        """
-        Executes an evaluation of a `EvalDataset` using one or more `Scorer`s
-        """
-        try:
-            # Load appropriate implementations for all scorers
-            loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = [
-                scorer.load_implementation(use_judgment=use_judgment) if isinstance(scorer, ScorerWrapper) else scorer
-                for scorer in scorers
-            ]
-
-            evaluation_run = EvaluationRun(
-                log_results=log_results,
-                project_name=project_name,
-                eval_name=eval_run_name,
-                examples=dataset.examples,
-                scorers=loaded_scorers,
-                model=model,
-                aggregator=aggregator,
-                metadata=metadata,
-                judgment_api_key=self.judgment_api_key
-            )
-            return run_eval(evaluation_run)
-        except ValueError as e:
-            raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
-
-    def create_dataset(self) -> EvalDataset:
-        return self.eval_dataset_client.create_dataset()
-
-    def push_dataset(self, alias: str, dataset: EvalDataset, overwrite: Optional[bool] = False) -> bool:
-        """
-        Uploads an `EvalDataset` to the Judgment platform for storage.
-
-        Args:
-            alias (str): The name to use for the dataset
-            dataset (EvalDataset): The dataset to upload to Judgment
-            overwrite (Optional[bool]): Whether to overwrite the dataset if it already exists
-
-        Returns:
-            bool: Whether the dataset was successfully uploaded
-        """
-        # Set judgment_api_key just in case it was not set
-        dataset.judgment_api_key = self.judgment_api_key
-        return self.eval_dataset_client.push(dataset, alias, overwrite)
-
-    def pull_dataset(self, alias: str) -> EvalDataset:
-        """
-        Retrieves a saved `EvalDataset` from the Judgment platform.
-
-        Args:
-            alias (str): The name of the dataset to retrieve
-
-        Returns:
-            EvalDataset: The retrieved dataset
-        """
-        return self.eval_dataset_client.pull(alias)
-
-    def pull_all_user_dataset_stats(self) -> dict:
-        """
-        Retrieves all dataset stats from the Judgment platform for the user.
-
-        Args:
-            alias (str): The name of the dataset to retrieve
-
-        Returns:
-            EvalDataset: The retrieved dataset
-        """
-        return self.eval_dataset_client.pull_all_user_dataset_stats()
-
-
-    # Maybe add option where you can pass in the EvaluationRun object and it will pull the eval results from the backend
-    def pull_eval(self, project_name: str, eval_run_name: str) -> List[Dict[str, Union[str, List[ScoringResult]]]]:
-        """Pull evaluation results from the server.
-
-        Args:
-            project_name (str): Name of the project
-            eval_run_name (str): Name of the evaluation run
-
-        Returns:
-            Dict[str, Union[str, List[ScoringResult]]]: Dictionary containing:
-                - id (str): The evaluation run ID
-                - results (List[ScoringResult]): List of scoring results
-        """
-        eval_run_request_body = EvalRunRequestBody(project_name=project_name,
-                                                   eval_name=eval_run_name,
-                                                   judgment_api_key=self.judgment_api_key)
-        eval_run = requests.post(JUDGMENT_EVAL_FETCH_API_URL,
-                                 json=eval_run_request_body.model_dump())
-        if eval_run.status_code != requests.codes.ok:
-            raise ValueError(f"Error fetching eval results: {eval_run.json()}")
-
-        eval_run_result = [{}]
-        for result in eval_run.json():
-            result_id = result.get("id", "")
-            result_data = result.get("result", dict())
-            filtered_result = {k: v for k, v in result_data.items() if k in ScoringResult.__annotations__}
-            eval_run_result[0]["id"] = result_id
-            eval_run_result[0]["results"] = [ScoringResult(**filtered_result)]
-        return eval_run_result
-
-    def delete_eval(self, project_name: str, eval_run_name: str) -> bool:
-        """
-        Deletes an evaluation from the server by project and run name.
-
-        Args:
-            project_name (str): Name of the project
-            eval_run_name (str): Name of the evaluation run
-
-        Returns:
-            bool: Whether the evaluation was successfully deleted
-        """
-        eval_run_request_body = EvalRunRequestBody(project_name=project_name,
-                                                   eval_name=eval_run_name,
-                                                   judgment_api_key=self.judgment_api_key)
-        response = requests.delete(JUDGMENT_EVAL_DELETE_API_URL,
-                                   json=eval_run_request_body.model_dump(),
-                                   headers={
-                                       "Content-Type": "application/json",
-                                   })
-        if response.status_code != requests.codes.ok:
-            raise ValueError(f"Error deleting eval results: {response.json()}")
-        return response.json()
-
-    def delete_project_evals(self, project_name: str) -> bool:
-        """
-        Deletes all evaluations from the server for a given project.
-
-        Args:
-            project_name (str): Name of the project
-
-        Returns:
-            bool: Whether the evaluations were successfully deleted
-        """
-        response = requests.delete(JUDGMENT_EVAL_DELETE_PROJECT_API_URL,
-                                   json={
-                                       "project_name": project_name,
-                                       "judgment_api_key": self.judgment_api_key
-                                   },
-                                   headers={
-                                       "Content-Type": "application/json",
-                                   })
-        if response.status_code != requests.codes.ok:
-            raise ValueError(f"Error deleting eval results: {response.json()}")
-        return response.json()
-
-    def _validate_api_key(self):
-        """
-        Validates that the user api key is valid
-        """
-        response = requests.post(
-            f"{ROOT_API}/validate_api_key/",
-            json={"api_key": self.judgment_api_key}
-        )
-        if response.status_code == 200:
-            return True, response.json()
-        else:
-            return False, response.json().get("detail", "Error validating API key")
-
-    def fetch_classifier_scorer(self, slug: str) -> ClassifierScorer:
-        """
-        Fetches a classifier scorer configuration from the Judgment API.
-
-        Args:
-            slug (str): Slug identifier of the custom scorer to fetch
-
-        Returns:
-            ClassifierScorer: The configured classifier scorer object
-
-        Raises:
-            JudgmentAPIError: If the scorer cannot be fetched or doesn't exist
-        """
-        request_body = {
-            "slug": slug,
-            "judgment_api_key": self.judgment_api_key
-        }
-
-        response = requests.post(
-            f"{ROOT_API}/fetch_scorer/",
-            json=request_body
-        )
-
-        if response.status_code == 500:
-            raise JudgmentAPIError(f"The server is temporarily unavailable. Please try your request again in a few moments. Error details: {response.json().get('detail', '')}")
-        elif response.status_code != 200:
-            raise JudgmentAPIError(f"Failed to fetch classifier scorer '{slug}': {response.json().get('detail', '')}")
-
-        scorer_config = response.json()
-
-        try:
-            return ClassifierScorer(**scorer_config)
-        except Exception as e:
-            raise JudgmentAPIError(f"Failed to create classifier scorer '{slug}' with config {scorer_config}: {str(e)}")
-
-    def push_classifier_scorer(self, scorer: ClassifierScorer, slug: str = None) -> str:
-        """
-        Pushes a classifier scorer configuration to the Judgment API.
-
-        Args:
-            slug (str): Slug identifier for the scorer. If it exists, the scorer will be updated.
-            scorer (ClassifierScorer): The classifier scorer to save
-
-        Returns:
-            str: The slug identifier of the saved scorer
-
-        Raises:
-            JudgmentAPIError: If there's an error saving the scorer
-        """
-        request_body = {
-            "name": scorer.name,
-            "conversation": scorer.conversation,
-            "options": scorer.options,
-            "judgment_api_key": self.judgment_api_key,
-            "slug": slug
-        }
-
-        response = requests.post(
-            f"{ROOT_API}/save_scorer/",
-            json=request_body
-        )
-
-        if response.status_code == 500:
-            raise JudgmentAPIError(f"The server is temporarily unavailable. \
-                Please try your request again in a few moments. \
-                Error details: {response.json().get('detail', '')}")
-        elif response.status_code != 200:
-            raise JudgmentAPIError(f"Failed to save classifier scorer: {response.json().get('detail', '')}")
-
-        return response.json()["slug"]
-
-    def assert_test(
-        self,
-        examples: List[Example],
-        scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
-        model: Union[str, List[str], JudgevalJudge],
-        aggregator: Optional[str] = None,
-        metadata: Optional[Dict[str, Any]] = None,
-        log_results: bool = True,
-        project_name: str = "default_project",
-        eval_run_name: str = "default_eval_run",
-        override: bool = False,
-    ) -> None:
-        """
-        Asserts a test by running the evaluation and checking the results for success
-        """
-        results = self.run_evaluation(
-            examples=examples,
-            scorers=scorers,
-            model=model,
-            aggregator=aggregator,
-            metadata=metadata,
-            log_results=log_results,
-            project_name=project_name,
-            eval_run_name=eval_run_name,
-            override=override
-        )
-
-        assert_test(results)