judgeval 0.0.26__py3-none-any.whl → 0.0.27__py3-none-any.whl

This diff shows the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
judgeval/constants.py CHANGED
@@ -41,14 +41,16 @@ ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
  JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
  JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
  JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull/"
+ JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
  JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
- JUDGMENT_DATASETS_PULL_ALL_API_URL = f"{ROOT_API}/datasets/get_all_stats/"
- JUDGMENT_DATASETS_EDIT_API_URL = f"{ROOT_API}/datasets/edit/"
+ JUDGMENT_DATASETS_PROJECT_STATS_API_URL = f"{ROOT_API}/datasets/fetch_stats_by_project/"
+ JUDGMENT_DATASETS_INSERT_API_URL = f"{ROOT_API}/datasets/insert_examples/"
  JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
  JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_eval_results/"
  JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_names/"
  JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
  JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/delete/"
+ JUDGMENT_PROJECT_CREATE_API_URL = f"{ROOT_API}/projects/add/"
  JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
  JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
  JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
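
The endpoint constants above are all built from ROOT_API, which is read from the JUDGMENT_API_URL environment variable with a hosted default (visible in the hunk header). A minimal sketch of how one of the new constants resolves:

import os

# Same pattern as judgeval/constants.py: environment override with a hosted default.
ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"

print(JUDGMENT_DATASETS_DELETE_API_URL)
# https://api.judgmentlabs.ai/datasets/delete/ unless JUDGMENT_API_URL overrides the base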
judgeval/data/__init__.py CHANGED
@@ -1,13 +1,10 @@
  from judgeval.data.example import Example, ExampleParams
- from judgeval.data.api_example import ProcessExample, create_process_example
  from judgeval.data.scorer_data import ScorerData, create_scorer_data
  from judgeval.data.result import ScoringResult, generate_scoring_result

  __all__ = [
  "Example",
  "ExampleParams",
- "ProcessExample",
- "create_process_example",
  "ScorerData",
  "create_scorer_data",
  "ScoringResult",
@@ -7,8 +7,9 @@ from judgeval.common.logger import debug, error, warning, info
  from judgeval.constants import (
  JUDGMENT_DATASETS_PUSH_API_URL,
  JUDGMENT_DATASETS_PULL_API_URL,
- JUDGMENT_DATASETS_PULL_ALL_API_URL,
- JUDGMENT_DATASETS_EDIT_API_URL,
+ JUDGMENT_DATASETS_PROJECT_STATS_API_URL,
+ JUDGMENT_DATASETS_DELETE_API_URL,
+ JUDGMENT_DATASETS_INSERT_API_URL,
  JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
  )
  from judgeval.data import Example
@@ -25,7 +26,7 @@ class EvalDatasetClient:
  def create_dataset(self) -> EvalDataset:
  return EvalDataset(judgment_api_key=self.judgment_api_key)

- def push(self, dataset: EvalDataset, alias: str, overwrite: Optional[bool] = False) -> bool:
+ def push(self, dataset: EvalDataset, alias: str, project_name: str, overwrite: Optional[bool] = False) -> bool:
  debug(f"Pushing dataset with alias '{alias}' (overwrite={overwrite})")
  if overwrite:
  warning(f"Overwrite enabled for alias '{alias}'")
@@ -53,7 +54,8 @@ class EvalDatasetClient:
  total=100,
  )
  content = {
- "alias": alias,
+ "dataset_alias": alias,
+ "project_name": project_name,
  "examples": [e.to_dict() for e in dataset.examples],
  "overwrite": overwrite,
  }
@@ -88,7 +90,7 @@ class EvalDatasetClient:
  )
  return True

- def pull(self, alias: str) -> EvalDataset:
+ def pull(self, alias: str, project_name: str) -> EvalDataset:
  debug(f"Pulling dataset with alias '{alias}'")
  """
  Pulls the dataset from Judgment platform
@@ -96,7 +98,7 @@ class EvalDatasetClient:
  Mock request:
  {
  "alias": alias,
- "user_id": user_id
+ "project_name": project_name
  }
  ==>
  {
@@ -118,7 +120,8 @@ class EvalDatasetClient:
  total=100,
  )
  request_body = {
- "alias": alias,
+ "dataset_alias": alias,
+ "project_name": project_name
  }

  try:
@@ -139,24 +142,58 @@ class EvalDatasetClient:

  info(f"Successfully pulled dataset with alias '{alias}'")
  payload = response.json()
+
  dataset.examples = [Example(**e) for e in payload.get("examples", [])]
- dataset._alias = payload.get("_alias")
- dataset._id = payload.get("_id")
+ dataset._alias = payload.get("alias")
+ dataset._id = payload.get("id")
  progress.update(
  task_id,
  description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
  )

  return dataset
+
+ def delete(self, alias: str, project_name: str) -> bool:
+ with Progress(
+ SpinnerColumn(style="rgb(106,0,255)"),
+ TextColumn("[progress.description]{task.description}"),
+ transient=False,
+ ) as progress:
+ task_id = progress.add_task(
+ f"Deleting [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] from Judgment...",
+ total=100,
+ )
+ request_body = {
+ "dataset_alias": alias,
+ "project_name": project_name
+ }

- def pull_all_user_dataset_stats(self) -> dict:
- debug(f"Pulling user datasets stats for user_id: {self.judgment_api_key}'")
+ try:
+ response = requests.post(
+ JUDGMENT_DATASETS_DELETE_API_URL,
+ json=request_body,
+ headers={
+ "Content-Type": "application/json",
+ "Authorization": f"Bearer {self.judgment_api_key}",
+ "X-Organization-Id": self.organization_id
+ },
+ verify=True
+ )
+ response.raise_for_status()
+ except requests.exceptions.RequestException as e:
+ error(f"Error deleting dataset: {str(e)}")
+ raise
+
+ return True
+
+ def pull_project_dataset_stats(self, project_name: str) -> dict:
+ debug(f"Pulling project datasets stats for project_name: {project_name}'")
  """
- Pulls the user datasets stats from Judgment platform
+ Pulls the project datasets stats from Judgment platform

  Mock request:
  {
- "user_id": user_id
+ "project_name": project_name
  }
  ==>
  {
@@ -177,11 +214,12 @@ class EvalDatasetClient:
  total=100,
  )
  request_body = {
+ "project_name": project_name
  }

  try:
  response = requests.post(
- JUDGMENT_DATASETS_PULL_ALL_API_URL,
+ JUDGMENT_DATASETS_PROJECT_STATS_API_URL,
  json=request_body,
  headers={
  "Content-Type": "application/json",
@@ -205,7 +243,7 @@ class EvalDatasetClient:

  return payload

- def edit_dataset(self, alias: str, examples: List[Example]) -> bool:
+ def insert_dataset(self, alias: str, examples: List[Example], project_name: str) -> bool:
  """
  Edits the dataset on Judgment platform by adding new examples

@@ -213,7 +251,7 @@ class EvalDatasetClient:
  {
  "alias": alias,
  "examples": [...],
- "judgment_api_key": self.judgment_api_key
+ "project_name": project_name
  }
  """
  with Progress(
@@ -227,13 +265,14 @@ class EvalDatasetClient:
  )

  content = {
- "alias": alias,
+ "dataset_alias": alias,
  "examples": [e.to_dict() for e in examples],
+ "project_name": project_name
  }

  try:
  response = requests.post(
- JUDGMENT_DATASETS_EDIT_API_URL,
+ JUDGMENT_DATASETS_INSERT_API_URL,
  json=content,
  headers={
  "Content-Type": "application/json",
@@ -250,7 +289,7 @@ class EvalDatasetClient:
  info(f"Successfully edited dataset '{alias}'")
  return True

- def export_jsonl(self, alias: str) -> requests.Response:
+ def export_jsonl(self, alias: str, project_name: str) -> requests.Response:
  """Export dataset in JSONL format from Judgment platform"""
  debug(f"Exporting dataset with alias '{alias}' as JSONL")
  with Progress(
@@ -265,7 +304,7 @@ class EvalDatasetClient:
  try:
  response = requests.post(
  JUDGMENT_DATASETS_EXPORT_JSONL_API_URL,
- json={"alias": alias},
+ json={"dataset_alias": alias, "project_name": project_name},
  headers={
  "Content-Type": "application/json",
  "Authorization": f"Bearer {self.judgment_api_key}",
judgeval/data/result.py CHANGED
@@ -1,10 +1,11 @@
  from dataclasses import dataclass
  from typing import List, Union, Optional, Dict, Any, Union
+ from judgeval.common.logger import debug, error
+ from pydantic import BaseModel
+ from judgeval.data import ScorerData, Example

- from judgeval.data import ScorerData, ProcessExample

- @dataclass
- class ScoringResult:
+ class ScoringResult(BaseModel):
  """
  A ScoringResult contains the output of one or more scorers applied to a single example.
  Ie: One input, one actual_output, one expected_output, etc..., and 1+ scorer (Faithfulness, Hallucination, Summarization, etc...)
@@ -13,69 +14,44 @@ class ScoringResult:
  success (bool): Whether the evaluation was successful.
  This means that all scorers applied to this example returned a success.
  scorer_data (List[ScorerData]): The scorers data for the evaluated example
- input (Optional[str]): The input to the example
- actual_output (Optional[str]): The actual output of the example
- expected_output (Optional[str]): The expected output of the example
- context (Optional[List[str]]): The context of the example
- retrieval_context (Optional[List[str]]): The retrieval context of the example
- additional_metadata (Optional[Dict[str, Any]]): The additional metadata of the example
- tools_called (Optional[List[str]]): The tools called by the example
- expected_tools (Optional[List[str]]): The expected tools of the example
- trace_id (Optional[str]): The trace id of the example
+ data_object (Optional[Example]): The original example object that was used to create the ScoringResult, can be Example, CustomExample (future), WorkflowRun (future)

  """
  # Fields for scoring outputs
  success: bool # used for unit testing
  scorers_data: Union[List[ScorerData], None]
+ name: Optional[str] = None

- # Inputs from the original example
- input: Optional[str] = None
- actual_output: Optional[Union[str, List[str]]] = None
- expected_output: Optional[Union[str, List[str]]] = None
- context: Optional[List[str]] = None
- retrieval_context: Optional[List[str]] = None
- additional_metadata: Optional[Dict[str, Any]] = None
- tools_called: Optional[List[str]] = None
- expected_tools: Optional[List[str]] = None
+ # The original example object that was used to create the ScoringResult
+ data_object: Optional[Example] = None #can be Example, CustomExample (future), WorkflowRun (future)
  trace_id: Optional[str] = None

- example_id: Optional[str] = None
- eval_run_name: Optional[str] = None
+ # Additional fields for internal use
+ run_duration: Optional[float] = None
+ evaluation_cost: Optional[float] = None

  def to_dict(self) -> dict:
  """Convert the ScoringResult instance to a dictionary, properly serializing scorer_data."""
  return {
  "success": self.success,
  "scorers_data": [scorer_data.to_dict() for scorer_data in self.scorers_data] if self.scorers_data else None,
- "input": self.input,
- "actual_output": self.actual_output,
- "expected_output": self.expected_output,
- "context": self.context,
- "retrieval_context": self.retrieval_context,
- "additional_metadata": self.additional_metadata,
- "tools_called": self.tools_called,
- "expected_tools": self.expected_tools,
- "trace_id": self.trace_id,
- "example_id": self.example_id
+ "data_object": self.data_object.to_dict() if self.data_object else None,
  }
-
+
  def __str__(self) -> str:
  return f"ScoringResult(\
  success={self.success}, \
  scorer_data={self.scorers_data}, \
- input={self.input}, \
- actual_output={self.actual_output}, \
- expected_output={self.expected_output}, \
- context={self.context}, \
- retrieval_context={self.retrieval_context}, \
- additional_metadata={self.additional_metadata}, \
- tools_called={self.tools_called}, \
- expected_tools={self.expected_tools}, \
- trace_id={self.trace_id})"
+ data_object={self.data_object}, \
+ run_duration={self.run_duration}, \
+ evaluation_cost={self.evaluation_cost})"


  def generate_scoring_result(
- process_example: ProcessExample,
+ example: Example,
+ success: bool,
+ scorers_data: List[ScorerData],
+ run_duration: float,
  ) -> ScoringResult:
  """
  Creates a final ScoringResult object for an evaluation run based on the results from a completed LLMApiTestCase.
@@ -83,16 +59,18 @@ def generate_scoring_result(
  When an LLMTestCase is executed, it turns into an LLMApiTestCase and the progress of the evaluation run is tracked.
  At the end of the evaluation run, we create a TestResult object out of the completed LLMApiTestCase.
  """
- return ScoringResult(
- success=process_example.success,
- scorers_data=process_example.scorers_data,
- input=process_example.input,
- actual_output=process_example.actual_output,
- expected_output=process_example.expected_output,
- context=process_example.context,
- retrieval_context=process_example.retrieval_context,
- additional_metadata=process_example.additional_metadata,
- tools_called=process_example.tools_called,
- expected_tools=process_example.expected_tools,
- trace_id=process_example.trace_id
+ if example.name is not None:
+ name = example.name
+ else:
+ name = "Test Case Placeholder"
+ debug(f"No name provided for example, using default name: {name}")
+ debug(f"Creating ScoringResult for: {name}")
+ scoring_result = ScoringResult(
+ name=name,
+ data_object=example,
+ success=success,
+ scorers_data=scorers_data,
+ run_duration=run_duration,
+ evaluation_cost=None,
  )
+ return scoring_result
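
ScoringResult is now a Pydantic model that keeps the original Example nested under data_object instead of flattening its fields, and generate_scoring_result takes the example, success flag, scorer data, and run duration directly. A small sketch of the new shape, assuming the imports resolve through judgeval.data as in the __init__ changes above (field values illustrative):

from judgeval.data import Example, generate_scoring_result

example = Example(input="What is 2 + 2?", actual_output="4", name="arithmetic-check")

# scorers_data would normally come from create_scorer_data() after scorers run;
# an empty list keeps the sketch self-contained.
result = generate_scoring_result(example, success=True, scorers_data=[], run_duration=0.42)

# Example fields now live on the nested data_object rather than on the result itself.
print(result.name, result.success, result.data_object.input)
print(result.to_dict()["data_object"])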
@@ -27,7 +27,8 @@ from judgeval.constants import (
  JUDGMENT_EVAL_FETCH_API_URL,
  JUDGMENT_EVAL_DELETE_API_URL,
  JUDGMENT_EVAL_DELETE_PROJECT_API_URL,
- JUDGMENT_PROJECT_DELETE_API_URL
+ JUDGMENT_PROJECT_DELETE_API_URL,
+ JUDGMENT_PROJECT_CREATE_API_URL
  )
  from judgeval.common.exceptions import JudgmentAPIError
  from pydantic import BaseModel
@@ -43,8 +44,16 @@ class DeleteEvalRunRequestBody(BaseModel):
  project_name: str
  judgment_api_key: str

+ class SingletonMeta(type):
+ _instances = {}

- class JudgmentClient:
+ def __call__(cls, *args, **kwargs):
+ if cls not in cls._instances:
+ instance = super().__call__(*args, **kwargs)
+ cls._instances[cls] = instance
+ return cls._instances[cls]
+
+ class JudgmentClient(metaclass=SingletonMeta):
  def __init__(self, judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"), organization_id: str = os.getenv("JUDGMENT_ORG_ID")):
  self.judgment_api_key = judgment_api_key
  self.organization_id = organization_id
@@ -56,8 +65,8 @@ class JudgmentClient:
  # May be bad to output their invalid API key...
  raise JudgmentAPIError(f"Issue with passed in Judgment API key: {response}")
  else:
- print(f"Successfully initialized JudgmentClient, welcome back {response.get('detail', {}).get('user_name', 'user')}!")
-
+ print(f"Successfully initialized JudgmentClient!")
+
  def a_run_evaluation(
  self,
  examples: List[Example],
@@ -267,7 +276,7 @@ class JudgmentClient:
  def create_dataset(self) -> EvalDataset:
  return self.eval_dataset_client.create_dataset()

- def push_dataset(self, alias: str, dataset: EvalDataset, overwrite: Optional[bool] = False) -> bool:
+ def push_dataset(self, alias: str, dataset: EvalDataset, project_name: str, overwrite: Optional[bool] = False) -> bool:
  """
  Uploads an `EvalDataset` to the Judgment platform for storage.

@@ -281,9 +290,9 @@ class JudgmentClient:
  """
  # Set judgment_api_key just in case it was not set
  dataset.judgment_api_key = self.judgment_api_key
- return self.eval_dataset_client.push(dataset, alias, overwrite)
+ return self.eval_dataset_client.push(dataset, alias, project_name, overwrite)

- def pull_dataset(self, alias: str) -> EvalDataset:
+ def pull_dataset(self, alias: str, project_name: str) -> EvalDataset:
  """
  Retrieves a saved `EvalDataset` from the Judgment platform.

@@ -293,25 +302,31 @@ class JudgmentClient:
  Returns:
  EvalDataset: The retrieved dataset
  """
- return self.eval_dataset_client.pull(alias)
+ return self.eval_dataset_client.pull(alias, project_name)
+
+ def delete_dataset(self, alias: str, project_name: str) -> bool:
+ """
+ Deletes a saved `EvalDataset` from the Judgment platform.
+ """
+ return self.eval_dataset_client.delete(alias, project_name)

- def pull_all_user_dataset_stats(self) -> dict:
+ def pull_project_dataset_stats(self, project_name: str) -> dict:
  """
- Retrieves all dataset stats from the Judgment platform for the user.
+ Retrieves all dataset stats from the Judgment platform for the project.

  Args:
- alias (str): The name of the dataset to retrieve
+ project_name (str): The name of the project to retrieve

  Returns:
- EvalDataset: The retrieved dataset
+ dict: The retrieved dataset stats
  """
- return self.eval_dataset_client.pull_all_user_dataset_stats()
+ return self.eval_dataset_client.pull_project_dataset_stats(project_name)

- def edit_dataset(self, alias: str, examples: List[Example]) -> bool:
+ def insert_dataset(self, alias: str, examples: List[Example], project_name: str) -> bool:
  """
  Edits the dataset on Judgment platform by adding new examples
  """
- return self.eval_dataset_client.edit_dataset(alias, examples)
+ return self.eval_dataset_client.insert_dataset(alias, examples, project_name)

  # Maybe add option where you can pass in the EvaluationRun object and it will pull the eval results from the backend
  def pull_eval(self, project_name: str, eval_run_name: str) -> List[Dict[str, Union[str, List[ScoringResult]]]]:
@@ -402,6 +417,23 @@ class JudgmentClient:
  raise ValueError(f"Error deleting eval results: {response.json()}")
  return response.json()

+ def create_project(self, project_name: str) -> bool:
+ """
+ Creates a project on the server.
+ """
+ response = requests.post(JUDGMENT_PROJECT_CREATE_API_URL,
+ json={
+ "project_name": project_name,
+ },
+ headers={
+ "Content-Type": "application/json",
+ "Authorization": f"Bearer {self.judgment_api_key}",
+ "X-Organization-Id": self.organization_id
+ })
+ if response.status_code != requests.codes.ok:
+ raise ValueError(f"Error creating project: {response.json()}")
+ return response.json()
+
  def delete_project(self, project_name: str) -> bool:
  """
  Deletes a project from the server. Which also deletes all evaluations and traces associated with the project.
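
On the client side, JudgmentClient is now a singleton and its dataset helpers mirror the project-scoped API, with create_project added alongside delete_project. A sketch of the 0.0.27 workflow, assuming JudgmentClient is importable from the package root and JUDGMENT_API_KEY / JUDGMENT_ORG_ID are set in the environment:

from judgeval import JudgmentClient  # import path assumed
from judgeval.data import Example

client = JudgmentClient()          # reads JUDGMENT_API_KEY / JUDGMENT_ORG_ID by default
assert client is JudgmentClient()  # SingletonMeta returns the same instance

client.create_project("demo-project")

dataset = client.create_dataset()
dataset.examples = [Example(input="What is 2 + 2?", actual_output="4")]

# Dataset operations are now keyed by (alias, project_name).
client.push_dataset("smoke-tests", dataset, "demo-project", overwrite=False)
pulled = client.pull_dataset("smoke-tests", "demo-project")
stats = client.pull_project_dataset_stats("demo-project")
client.insert_dataset("smoke-tests", pulled.examples, "demo-project")
client.delete_dataset("smoke-tests", "demo-project")

client.delete_project("demo-project")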
@@ -117,21 +117,23 @@ def merge_results(api_results: List[ScoringResult], local_results: List[ScoringR

  # Each ScoringResult in api and local have all the same fields besides `scorers_data`
  for api_result, local_result in zip(api_results, local_results):
- if api_result.input != local_result.input:
+ if not (api_result.data_object and local_result.data_object):
+ raise ValueError("Data object is None in one of the results.")
+ if api_result.data_object.input != local_result.data_object.input:
  raise ValueError("The API and local results are not aligned.")
- if api_result.actual_output != local_result.actual_output:
+ if api_result.data_object.actual_output != local_result.data_object.actual_output:
  raise ValueError("The API and local results are not aligned.")
- if api_result.expected_output != local_result.expected_output:
+ if api_result.data_object.expected_output != local_result.data_object.expected_output:
  raise ValueError("The API and local results are not aligned.")
- if api_result.context != local_result.context:
+ if api_result.data_object.context != local_result.data_object.context:
  raise ValueError("The API and local results are not aligned.")
- if api_result.retrieval_context != local_result.retrieval_context:
+ if api_result.data_object.retrieval_context != local_result.data_object.retrieval_context:
  raise ValueError("The API and local results are not aligned.")
- if api_result.additional_metadata != local_result.additional_metadata:
+ if api_result.data_object.additional_metadata != local_result.data_object.additional_metadata:
  raise ValueError("The API and local results are not aligned.")
- if api_result.tools_called != local_result.tools_called:
+ if api_result.data_object.tools_called != local_result.data_object.tools_called:
  raise ValueError("The API and local results are not aligned.")
- if api_result.expected_tools != local_result.expected_tools:
+ if api_result.data_object.expected_tools != local_result.data_object.expected_tools:
  raise ValueError("The API and local results are not aligned.")

@@ -422,23 +424,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error

  # Convert the response data to `ScoringResult` objects
  debug("Processing API results")
- for idx, result in enumerate(response_data["results"]):
- with example_logging_context(evaluation_run.examples[idx].timestamp, evaluation_run.examples[idx].example_id):
- for scorer in judgment_scorers:
- debug(f"Processing API result for example {idx} and scorer {scorer.score_type}")
- # filter for key-value pairs that are used to initialize ScoringResult
- # there may be some stuff in here that doesn't belong in ScoringResult
- # TODO: come back and refactor this to have ScoringResult take in **kwargs
- filtered_result = {k: v for k, v in result.items() if k in ScoringResult.__annotations__}
-
- # Convert scorers_data dicts to ScorerData objects
- if "scorers_data" in filtered_result and filtered_result["scorers_data"]:
- filtered_result["scorers_data"] = [
- ScorerData(**scorer_dict)
- for scorer_dict in filtered_result["scorers_data"]
- ]
-
- api_results.append(ScoringResult(**filtered_result))
+ api_results = [ScoringResult(**result) for result in response_data["results"]]
  # Run local evals
  if local_scorers: # List[JudgevalScorer]
  # We should be removing local scorers soon
@@ -477,7 +463,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
  # judgment_api_key=evaluation_run.judgment_api_key,
  # organization_id=evaluation_run.organization_id
  # )
-
+ # print(merged_results)
  if evaluation_run.log_results:
  pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, merged_results, evaluation_run)
  rprint(pretty_str)
@@ -504,15 +490,14 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:

  # Create a test case context with all relevant fields
  test_case = {
- 'input': result.input,
- 'actual_output': result.actual_output,
- 'expected_output': result.expected_output,
- 'context': result.context,
- 'retrieval_context': result.retrieval_context,
- 'additional_metadata': result.additional_metadata,
- 'tools_called': result.tools_called,
- 'expected_tools': result.expected_tools,
- 'eval_run_name': result.eval_run_name,
+ 'input': result.data_object.input,
+ 'actual_output': result.data_object.actual_output,
+ 'expected_output': result.data_object.expected_output,
+ 'context': result.data_object.context,
+ 'retrieval_context': result.data_object.retrieval_context,
+ 'additional_metadata': result.data_object.additional_metadata,
+ 'tools_called': result.data_object.tools_called,
+ 'expected_tools': result.data_object.expected_tools,
  'failed_scorers': []
  }
  if result.scorers_data:
@@ -533,7 +518,6 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
  error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
  error_msg += f"Tools Called: {fail_case['tools_called']}\n"
  error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"
- error_msg += f"Eval Run Name: {fail_case['eval_run_name']}\n"

  for fail_scorer in fail_case['failed_scorers']:

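
Because example fields now hang off result.data_object, downstream code that previously read result.input or result.actual_output has to go through the nested object, as merge_results and assert_test do above. A small sketch of the access pattern (values illustrative):

from typing import List
from judgeval.data import Example, ScoringResult

def summarize(results: List[ScoringResult]) -> None:
    for result in results:
        if result.data_object is None:
            # merge_results raises in this case; a report can simply skip it.
            continue
        # 0.0.26 read result.input / result.actual_output directly.
        print(result.data_object.input, "->", result.data_object.actual_output,
              "success:", result.success)

example = Example(input="What is 2 + 2?", actual_output="4")
summarize([ScoringResult(success=True, scorers_data=None, data_object=example)])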
judgeval/scorers/score.py CHANGED
@@ -13,7 +13,6 @@ from judgeval.data import (
  Example,
  ScoringResult,
  generate_scoring_result,
- create_process_example,
  create_scorer_data,
  )
  from judgeval.scorers import JudgevalScorer
@@ -400,7 +399,6 @@ async def a_eval_examples_helper(
  scorer.error = None # Reset scorer error

  # scoring the Example
- process_example = create_process_example(example) # Creates process example to track progress
  scoring_start_time = time.perf_counter()
  await score_with_indicator(
  scorers=scorers,
@@ -411,22 +409,22 @@ async def a_eval_examples_helper(
  ) # execute the scoring functions of each scorer on the example

  # Now that all the scoring functions of each scorer have executed, we collect
- # the results and update the process example with the scorer data
+ # the results and update the ScoringResult with the scorer data
+ success = True
+ scorer_data_list = []
  for scorer in scorers:
  # At this point, the scorer has been executed and already contains data.
  if getattr(scorer, 'skipped', False):
  continue
  scorer_data = create_scorer_data(scorer) # Fetch scorer data from completed scorer evaluation
- process_example.update_scorer_data(scorer_data) # Update process example with the same scorer data
+ success = success and scorer_data.success
+ scorer_data_list.append(scorer_data)

- test_end_time = time.perf_counter()
- run_duration = test_end_time - scoring_start_time
+ scoring_end_time = time.perf_counter()
+ run_duration = scoring_end_time - scoring_start_time

- process_example.update_run_duration(run_duration) # Update process example with execution time duration
-
- # Generate the scoring result and store it safely (to avoid race conditions)
- result = generate_scoring_result(process_example)
- scoring_results[score_index] = result
+ scoring_result = generate_scoring_result(example, scorer_data_list, run_duration, success)
+ scoring_results[score_index] = scoring_result

  if pbar is not None:
  pbar.update(1)
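
The helper no longer routes results through a ProcessExample; it aggregates ScorerData objects and an overall success flag itself and hands them to generate_scoring_result. A stripped-down sketch of that aggregation, with already-executed scorer objects assumed (the actual scoring call is omitted):

import time
from judgeval.data import Example, ScoringResult, create_scorer_data, generate_scoring_result

def collect_results(example: Example, scorers) -> ScoringResult:
    # Mirrors the new a_eval_examples_helper flow: gather ScorerData from each
    # finished scorer and AND together the per-scorer successes.
    scoring_start_time = time.perf_counter()
    success = True
    scorer_data_list = []
    for scorer in scorers:
        if getattr(scorer, "skipped", False):
            continue
        scorer_data = create_scorer_data(scorer)
        success = success and scorer_data.success
        scorer_data_list.append(scorer_data)
    run_duration = time.perf_counter() - scoring_start_time
    # Keyword arguments make the mapping onto generate_scoring_result explicit.
    return generate_scoring_result(example, success=success,
                                   scorers_data=scorer_data_list,
                                   run_duration=run_duration)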
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: judgeval
- Version: 0.0.26
+ Version: 0.0.27
  Summary: Judgeval Package
  Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
  Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues