judgeval 0.0.11__py3-none-any.whl → 0.22.2__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.



Files changed (171)
  1. judgeval/__init__.py +177 -12
  2. judgeval/api/__init__.py +519 -0
  3. judgeval/api/api_types.py +407 -0
  4. judgeval/cli.py +79 -0
  5. judgeval/constants.py +76 -47
  6. judgeval/data/__init__.py +3 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +15 -56
  9. judgeval/data/judgment_types.py +450 -0
  10. judgeval/data/result.py +29 -73
  11. judgeval/data/scorer_data.py +29 -62
  12. judgeval/data/scripts/fix_default_factory.py +23 -0
  13. judgeval/data/scripts/openapi_transform.py +123 -0
  14. judgeval/data/trace.py +121 -0
  15. judgeval/dataset/__init__.py +264 -0
  16. judgeval/env.py +52 -0
  17. judgeval/evaluation/__init__.py +344 -0
  18. judgeval/exceptions.py +27 -0
  19. judgeval/integrations/langgraph/__init__.py +13 -0
  20. judgeval/integrations/openlit/__init__.py +50 -0
  21. judgeval/judges/__init__.py +2 -3
  22. judgeval/judges/base_judge.py +2 -3
  23. judgeval/judges/litellm_judge.py +100 -20
  24. judgeval/judges/together_judge.py +101 -20
  25. judgeval/judges/utils.py +20 -24
  26. judgeval/logger.py +62 -0
  27. judgeval/prompt/__init__.py +330 -0
  28. judgeval/scorers/__init__.py +18 -25
  29. judgeval/scorers/agent_scorer.py +17 -0
  30. judgeval/scorers/api_scorer.py +45 -41
  31. judgeval/scorers/base_scorer.py +83 -38
  32. judgeval/scorers/example_scorer.py +17 -0
  33. judgeval/scorers/exceptions.py +1 -0
  34. judgeval/scorers/judgeval_scorers/__init__.py +0 -148
  35. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +19 -17
  36. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +13 -19
  37. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +12 -19
  38. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +13 -19
  39. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +15 -0
  40. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +327 -0
  41. judgeval/scorers/score.py +77 -306
  42. judgeval/scorers/utils.py +4 -199
  43. judgeval/tracer/__init__.py +1122 -2
  44. judgeval/tracer/constants.py +1 -0
  45. judgeval/tracer/exporters/__init__.py +40 -0
  46. judgeval/tracer/exporters/s3.py +119 -0
  47. judgeval/tracer/exporters/store.py +59 -0
  48. judgeval/tracer/exporters/utils.py +32 -0
  49. judgeval/tracer/keys.py +63 -0
  50. judgeval/tracer/llm/__init__.py +7 -0
  51. judgeval/tracer/llm/config.py +78 -0
  52. judgeval/tracer/llm/constants.py +9 -0
  53. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  54. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  55. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  56. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  57. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  58. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  59. judgeval/tracer/llm/llm_google/config.py +6 -0
  60. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  61. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  62. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  63. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  64. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  65. judgeval/tracer/llm/llm_openai/config.py +6 -0
  66. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  67. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  68. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  69. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  70. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  71. judgeval/tracer/llm/llm_together/config.py +6 -0
  72. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  73. judgeval/tracer/llm/providers.py +19 -0
  74. judgeval/tracer/managers.py +167 -0
  75. judgeval/tracer/processors/__init__.py +220 -0
  76. judgeval/tracer/utils.py +19 -0
  77. judgeval/trainer/__init__.py +14 -0
  78. judgeval/trainer/base_trainer.py +122 -0
  79. judgeval/trainer/config.py +128 -0
  80. judgeval/trainer/console.py +144 -0
  81. judgeval/trainer/fireworks_trainer.py +396 -0
  82. judgeval/trainer/trainable_model.py +243 -0
  83. judgeval/trainer/trainer.py +70 -0
  84. judgeval/utils/async_utils.py +39 -0
  85. judgeval/utils/decorators/__init__.py +0 -0
  86. judgeval/utils/decorators/dont_throw.py +37 -0
  87. judgeval/utils/decorators/use_once.py +13 -0
  88. judgeval/utils/file_utils.py +97 -0
  89. judgeval/utils/guards.py +36 -0
  90. judgeval/utils/meta.py +27 -0
  91. judgeval/utils/project.py +15 -0
  92. judgeval/utils/serialize.py +253 -0
  93. judgeval/utils/testing.py +70 -0
  94. judgeval/utils/url.py +10 -0
  95. judgeval/utils/version_check.py +28 -0
  96. judgeval/utils/wrappers/README.md +3 -0
  97. judgeval/utils/wrappers/__init__.py +15 -0
  98. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  99. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  100. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  101. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  102. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  103. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  104. judgeval/utils/wrappers/py.typed +0 -0
  105. judgeval/utils/wrappers/utils.py +35 -0
  106. judgeval/version.py +5 -0
  107. judgeval/warnings.py +4 -0
  108. judgeval-0.22.2.dist-info/METADATA +265 -0
  109. judgeval-0.22.2.dist-info/RECORD +112 -0
  110. judgeval-0.22.2.dist-info/entry_points.txt +2 -0
  111. judgeval/clients.py +0 -39
  112. judgeval/common/__init__.py +0 -8
  113. judgeval/common/exceptions.py +0 -28
  114. judgeval/common/logger.py +0 -189
  115. judgeval/common/tracer.py +0 -798
  116. judgeval/common/utils.py +0 -763
  117. judgeval/data/api_example.py +0 -111
  118. judgeval/data/datasets/__init__.py +0 -5
  119. judgeval/data/datasets/dataset.py +0 -286
  120. judgeval/data/datasets/eval_dataset_client.py +0 -193
  121. judgeval/data/datasets/ground_truth.py +0 -54
  122. judgeval/data/datasets/utils.py +0 -74
  123. judgeval/evaluation_run.py +0 -132
  124. judgeval/judges/mixture_of_judges.py +0 -248
  125. judgeval/judgment_client.py +0 -354
  126. judgeval/run_evaluation.py +0 -439
  127. judgeval/scorers/judgeval_scorer.py +0 -140
  128. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
  129. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
  130. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -22
  131. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -19
  132. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -32
  133. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
  134. judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
  135. judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
  136. judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
  137. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -54
  138. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -24
  139. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
  140. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -277
  141. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
  142. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
  143. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
  144. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
  145. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
  146. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
  147. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
  148. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
  149. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
  150. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
  151. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
  152. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
  153. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
  154. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
  155. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -325
  156. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
  157. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
  158. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -263
  159. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
  160. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
  161. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
  162. judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
  163. judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
  164. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -550
  165. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
  166. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -157
  167. judgeval/scorers/prompt_scorer.py +0 -439
  168. judgeval-0.0.11.dist-info/METADATA +0 -36
  169. judgeval-0.0.11.dist-info/RECORD +0 -84
  170. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
  171. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
--- a/judgeval/judgment_client.py
+++ /dev/null
@@ -1,354 +0,0 @@
-"""
-Implements the JudgmentClient to interact with the Judgment API.
-"""
-import os
-from typing import Optional, List, Dict, Any, Union
-import requests
-
-from judgeval.constants import ROOT_API
-from judgeval.data.datasets import EvalDataset, EvalDatasetClient
-from judgeval.data import (
-    ScoringResult,
-    Example
-)
-from judgeval.scorers import (
-    APIJudgmentScorer,
-    JudgevalScorer,
-    ClassifierScorer,
-    ScorerWrapper
-)
-from judgeval.evaluation_run import EvaluationRun
-from judgeval.run_evaluation import (
-    run_eval,
-    assert_test
-)
-from judgeval.judges import JudgevalJudge
-from judgeval.constants import JUDGMENT_EVAL_FETCH_API_URL, JUDGMENT_EVAL_DELETE_API_URL, JUDGMENT_EVAL_DELETE_PROJECT_API_URL
-from judgeval.common.exceptions import JudgmentAPIError
-from pydantic import BaseModel
-
-class EvalRunRequestBody(BaseModel):
-    eval_name: str
-    project_name: str
-    judgment_api_key: str
-
-
-class JudgmentClient:
-    def __init__(self, judgment_api_key: str = os.getenv("JUDGMENT_API_KEY")):
-        self.judgment_api_key = judgment_api_key
-        self.eval_dataset_client = EvalDatasetClient(judgment_api_key)
-
-        # Verify API key is valid
-        result, response = self._validate_api_key()
-        if not result:
-            # May be bad to output their invalid API key...
-            raise JudgmentAPIError(f"Issue with passed in Judgment API key: {response}")
-        else:
-            print(f"Successfully initialized JudgmentClient, welcome back {response.get('detail', {}).get('user_name', 'user')}!")
-
-    def run_evaluation(
-        self,
-        examples: List[Example],
-        scorers: List[Union[ScorerWrapper, JudgevalScorer]],
-        model: Union[str, List[str], JudgevalJudge],
-        aggregator: Optional[str] = None,
-        metadata: Optional[Dict[str, Any]] = None,
-        log_results: bool = True,
-        project_name: str = "default_project",
-        eval_run_name: str = "default_eval_run",
-        override: bool = False,
-        use_judgment: bool = True
-    ) -> List[ScoringResult]:
-        """
-        Executes an evaluation of `Example`s using one or more `Scorer`s
-        """
-        try:
-            # Load appropriate implementations for all scorers
-            loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = [
-                scorer.load_implementation(use_judgment=use_judgment) if isinstance(scorer, ScorerWrapper) else scorer
-                for scorer in scorers
-            ]
-
-            eval = EvaluationRun(
-                log_results=log_results,
-                project_name=project_name,
-                eval_name=eval_run_name,
-                examples=examples,
-                scorers=loaded_scorers,
-                model=model,
-                aggregator=aggregator,
-                metadata=metadata,
-                judgment_api_key=self.judgment_api_key
-            )
-            return run_eval(eval, override)
-        except ValueError as e:
-            raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
-
-    def evaluate_dataset(
-        self,
-        dataset: EvalDataset,
-        scorers: List[Union[ScorerWrapper, JudgevalScorer]],
-        model: Union[str, List[str], JudgevalJudge],
-        aggregator: Optional[str] = None,
-        metadata: Optional[Dict[str, Any]] = None,
-        project_name: str = "",
-        eval_run_name: str = "",
-        log_results: bool = False,
-        use_judgment: bool = True
-    ) -> List[ScoringResult]:
-        """
-        Executes an evaluation of a `EvalDataset` using one or more `Scorer`s
-        """
-        try:
-            # Load appropriate implementations for all scorers
-            loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = [
-                scorer.load_implementation(use_judgment=use_judgment) if isinstance(scorer, ScorerWrapper) else scorer
-                for scorer in scorers
-            ]
-
-            evaluation_run = EvaluationRun(
-                log_results=log_results,
-                project_name=project_name,
-                eval_name=eval_run_name,
-                examples=dataset.examples,
-                scorers=loaded_scorers,
-                model=model,
-                aggregator=aggregator,
-                metadata=metadata,
-                judgment_api_key=self.judgment_api_key
-            )
-            return run_eval(evaluation_run)
-        except ValueError as e:
-            raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
-
-    def create_dataset(self) -> EvalDataset:
-        return self.eval_dataset_client.create_dataset()
-
-    def push_dataset(self, alias: str, dataset: EvalDataset, overwrite: Optional[bool] = False) -> bool:
-        """
-        Uploads an `EvalDataset` to the Judgment platform for storage.
-
-        Args:
-            alias (str): The name to use for the dataset
-            dataset (EvalDataset): The dataset to upload to Judgment
-            overwrite (Optional[bool]): Whether to overwrite the dataset if it already exists
-
-        Returns:
-            bool: Whether the dataset was successfully uploaded
-        """
-        # Set judgment_api_key just in case it was not set
-        dataset.judgment_api_key = self.judgment_api_key
-        return self.eval_dataset_client.push(dataset, alias, overwrite)
-
-    def pull_dataset(self, alias: str) -> EvalDataset:
-        """
-        Retrieves a saved `EvalDataset` from the Judgment platform.
-
-        Args:
-            alias (str): The name of the dataset to retrieve
-
-        Returns:
-            EvalDataset: The retrieved dataset
-        """
-        return self.eval_dataset_client.pull(alias)
-
-    def pull_all_user_dataset_stats(self) -> dict:
-        """
-        Retrieves all dataset stats from the Judgment platform for the user.
-
-        Args:
-            alias (str): The name of the dataset to retrieve
-
-        Returns:
-            EvalDataset: The retrieved dataset
-        """
-        return self.eval_dataset_client.pull_all_user_dataset_stats()
-
-
-    # Maybe add option where you can pass in the EvaluationRun object and it will pull the eval results from the backend
-    def pull_eval(self, project_name: str, eval_run_name: str) -> List[Dict[str, Union[str, List[ScoringResult]]]]:
-        """Pull evaluation results from the server.
-
-        Args:
-            project_name (str): Name of the project
-            eval_run_name (str): Name of the evaluation run
-
-        Returns:
-            Dict[str, Union[str, List[ScoringResult]]]: Dictionary containing:
-                - id (str): The evaluation run ID
-                - results (List[ScoringResult]): List of scoring results
-        """
-        eval_run_request_body = EvalRunRequestBody(project_name=project_name,
-                                                   eval_name=eval_run_name,
-                                                   judgment_api_key=self.judgment_api_key)
-        eval_run = requests.post(JUDGMENT_EVAL_FETCH_API_URL,
-                                 json=eval_run_request_body.model_dump())
-        if eval_run.status_code != requests.codes.ok:
-            raise ValueError(f"Error fetching eval results: {eval_run.json()}")
-
-        eval_run_result = [{}]
-        for result in eval_run.json():
-            result_id = result.get("id", "")
-            result_data = result.get("result", dict())
-            filtered_result = {k: v for k, v in result_data.items() if k in ScoringResult.__annotations__}
-            eval_run_result[0]["id"] = result_id
-            eval_run_result[0]["results"] = [ScoringResult(**filtered_result)]
-        return eval_run_result
-
-    def delete_eval(self, project_name: str, eval_run_name: str) -> bool:
-        """
-        Deletes an evaluation from the server by project and run name.
-
-        Args:
-            project_name (str): Name of the project
-            eval_run_name (str): Name of the evaluation run
-
-        Returns:
-            bool: Whether the evaluation was successfully deleted
-        """
-        eval_run_request_body = EvalRunRequestBody(project_name=project_name,
-                                                   eval_name=eval_run_name,
-                                                   judgment_api_key=self.judgment_api_key)
-        response = requests.delete(JUDGMENT_EVAL_DELETE_API_URL,
-                                   json=eval_run_request_body.model_dump(),
-                                   headers={
-                                       "Content-Type": "application/json",
-                                   })
-        if response.status_code != requests.codes.ok:
-            raise ValueError(f"Error deleting eval results: {response.json()}")
-        return response.json()
-
-    def delete_project_evals(self, project_name: str) -> bool:
-        """
-        Deletes all evaluations from the server for a given project.
-
-        Args:
-            project_name (str): Name of the project
-
-        Returns:
-            bool: Whether the evaluations were successfully deleted
-        """
-        response = requests.delete(JUDGMENT_EVAL_DELETE_PROJECT_API_URL,
-                                   json={
-                                       "project_name": project_name,
-                                       "judgment_api_key": self.judgment_api_key
-                                   },
-                                   headers={
-                                       "Content-Type": "application/json",
-                                   })
-        if response.status_code != requests.codes.ok:
-            raise ValueError(f"Error deleting eval results: {response.json()}")
-        return response.json()
-
-    def _validate_api_key(self):
-        """
-        Validates that the user api key is valid
-        """
-        response = requests.post(
-            f"{ROOT_API}/validate_api_key/",
-            json={"api_key": self.judgment_api_key}
-        )
-        if response.status_code == 200:
-            return True, response.json()
-        else:
-            return False, response.json().get("detail", "Error validating API key")
-
-    def fetch_classifier_scorer(self, slug: str) -> ClassifierScorer:
-        """
-        Fetches a classifier scorer configuration from the Judgment API.
-
-        Args:
-            slug (str): Slug identifier of the custom scorer to fetch
-
-        Returns:
-            ClassifierScorer: The configured classifier scorer object
-
-        Raises:
-            JudgmentAPIError: If the scorer cannot be fetched or doesn't exist
-        """
-        request_body = {
-            "slug": slug,
-            "judgment_api_key": self.judgment_api_key
-        }
-
-        response = requests.post(
-            f"{ROOT_API}/fetch_scorer/",
-            json=request_body
-        )
-
-        if response.status_code == 500:
-            raise JudgmentAPIError(f"The server is temporarily unavailable. Please try your request again in a few moments. Error details: {response.json().get('detail', '')}")
-        elif response.status_code != 200:
-            raise JudgmentAPIError(f"Failed to fetch classifier scorer '{slug}': {response.json().get('detail', '')}")
-
-        scorer_config = response.json()
-
-        try:
-            return ClassifierScorer(**scorer_config)
-        except Exception as e:
-            raise JudgmentAPIError(f"Failed to create classifier scorer '{slug}' with config {scorer_config}: {str(e)}")
-
-    def push_classifier_scorer(self, scorer: ClassifierScorer, slug: str = None) -> str:
-        """
-        Pushes a classifier scorer configuration to the Judgment API.
-
-        Args:
-            slug (str): Slug identifier for the scorer. If it exists, the scorer will be updated.
-            scorer (ClassifierScorer): The classifier scorer to save
-
-        Returns:
-            str: The slug identifier of the saved scorer
-
-        Raises:
-            JudgmentAPIError: If there's an error saving the scorer
-        """
-        request_body = {
-            "name": scorer.name,
-            "conversation": scorer.conversation,
-            "options": scorer.options,
-            "judgment_api_key": self.judgment_api_key,
-            "slug": slug
-        }
-
-        response = requests.post(
-            f"{ROOT_API}/save_scorer/",
-            json=request_body
-        )
-
-        if response.status_code == 500:
-            raise JudgmentAPIError(f"The server is temporarily unavailable. \
-                Please try your request again in a few moments. \
-                Error details: {response.json().get('detail', '')}")
-        elif response.status_code != 200:
-            raise JudgmentAPIError(f"Failed to save classifier scorer: {response.json().get('detail', '')}")
-
-        return response.json()["slug"]
-
-    def assert_test(
-        self,
-        examples: List[Example],
-        scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
-        model: Union[str, List[str], JudgevalJudge],
-        aggregator: Optional[str] = None,
-        metadata: Optional[Dict[str, Any]] = None,
-        log_results: bool = True,
-        project_name: str = "default_project",
-        eval_run_name: str = "default_eval_run",
-        override: bool = False,
-    ) -> None:
-        """
-        Asserts a test by running the evaluation and checking the results for success
-        """
-        results = self.run_evaluation(
-            examples=examples,
-            scorers=scorers,
-            model=model,
-            aggregator=aggregator,
-            metadata=metadata,
-            log_results=log_results,
-            project_name=project_name,
-            eval_run_name=eval_run_name,
-            override=override
-        )
-
-        assert_test(results)
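
For orientation, here is a minimal sketch of how the removed JudgmentClient was typically driven, inferred only from the signatures in the deleted file above. The Example fields, the FaithfulnessScorer import, and its threshold parameter are assumptions based on the 0.0.11 file listing, not verified API; treat this as illustration rather than documentation.

# Hypothetical usage of the deleted 0.0.11 client, reconstructed from the
# signatures shown above. Example fields and the scorer class are assumptions.
import os

from judgeval.judgment_client import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import FaithfulnessScorer  # name assumed from api_scorers/faithfulness.py

# __init__ validates the key against {ROOT_API}/validate_api_key/ and raises
# JudgmentAPIError if the key is rejected.
client = JudgmentClient(judgment_api_key=os.getenv("JUDGMENT_API_KEY"))

results = client.run_evaluation(
    examples=[Example(input="What is RAG?", actual_output="...")],  # fields assumed
    scorers=[FaithfulnessScorer(threshold=0.7)],  # any ScorerWrapper / JudgevalScorer
    model="gpt-4",  # a model name, a list of names, or a JudgevalJudge
    project_name="default_project",
    eval_run_name="default_eval_run",
)

# Logged runs could then be fetched or deleted by project/run name:
fetched = client.pull_eval("default_project", "default_eval_run")
client.delete_eval("default_project", "default_eval_run")

In 0.22.2 this module is gone entirely; judging from the file listing above, its responsibilities appear to be split across the new judgeval/api, judgeval/evaluation, and judgeval/dataset packages.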