judgeval 0.0.26__py3-none-any.whl → 0.0.28__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
judgeval/constants.py CHANGED
@@ -41,14 +41,16 @@ ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
  JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
  JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
  JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull/"
+ JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
  JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
- JUDGMENT_DATASETS_PULL_ALL_API_URL = f"{ROOT_API}/datasets/get_all_stats/"
- JUDGMENT_DATASETS_EDIT_API_URL = f"{ROOT_API}/datasets/edit/"
+ JUDGMENT_DATASETS_PROJECT_STATS_API_URL = f"{ROOT_API}/datasets/fetch_stats_by_project/"
+ JUDGMENT_DATASETS_INSERT_API_URL = f"{ROOT_API}/datasets/insert_examples/"
  JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
  JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_eval_results/"
  JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_names/"
  JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
  JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/delete/"
+ JUDGMENT_PROJECT_CREATE_API_URL = f"{ROOT_API}/projects/add/"
  JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
  JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
  JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
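
The dataset routes above are now project-scoped, and a project-creation route joins the existing project-delete route. As a quick, non-authoritative sketch of how these constants resolve: the URL values simply mirror the lines above, and ROOT_API honors the JUDGMENT_API_URL environment variable per the hunk header.

    # usage sketch, not part of the package diff
    import os

    ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
    JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
    JUDGMENT_PROJECT_CREATE_API_URL = f"{ROOT_API}/projects/add/"

    # With JUDGMENT_API_URL unset this prints https://api.judgmentlabs.ai/datasets/delete/
    print(JUDGMENT_DATASETS_DELETE_API_URL)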
judgeval/data/__init__.py CHANGED
@@ -1,13 +1,10 @@
  from judgeval.data.example import Example, ExampleParams
- from judgeval.data.api_example import ProcessExample, create_process_example
  from judgeval.data.scorer_data import ScorerData, create_scorer_data
  from judgeval.data.result import ScoringResult, generate_scoring_result
 
  __all__ = [
  "Example",
  "ExampleParams",
- "ProcessExample",
- "create_process_example",
  "ScorerData",
  "create_scorer_data",
  "ScoringResult",
@@ -2,6 +2,7 @@ from typing import List, Optional, Dict, Any, Union
  from pydantic import BaseModel, ConfigDict, model_validator
 
  from judgeval.data.example import Example
+ from judgeval.data.custom_example import CustomExample
  from judgeval.data.scorer_data import ScorerData
  from judgeval.common.logger import debug, error
 
@@ -12,13 +13,13 @@ class ProcessExample(BaseModel):
  internal operations and keeping track of the evaluation process.
  """
  name: str
- input: Optional[str] = None
- actual_output: Optional[Union[str, List[str]]] = None
- expected_output: Optional[Union[str, List[str]]] = None
- context: Optional[list] = None
- retrieval_context: Optional[list] = None
- tools_called: Optional[list] = None
- expected_tools: Optional[list] = None
+ # input: Optional[str] = None
+ # actual_output: Optional[Union[str, List[str]]] = None
+ # expected_output: Optional[Union[str, List[str]]] = None
+ # context: Optional[list] = None
+ # retrieval_context: Optional[list] = None
+ # tools_called: Optional[list] = None
+ # expected_tools: Optional[list] = None
 
  # make these optional, not all test cases in a conversation will be evaluated
  success: Optional[bool] = None
@@ -57,10 +58,10 @@ class ProcessExample(BaseModel):
 
  def update_run_duration(self, run_duration: float):
  self.run_duration = run_duration
-
 
- def create_process_example(
- example: Example,
+
+ def create_process_custom_example(
+ example: CustomExample,
  ) -> ProcessExample:
  """
  When an LLM Test Case is executed, we track its progress using an ProcessExample.
@@ -79,13 +80,6 @@ def create_process_example(
  debug(f"Creating ProcessExample for: {name}")
  process_ex = ProcessExample(
  name=name,
- input=example.input,
- actual_output=example.actual_output,
- expected_output=example.expected_output,
- context=example.context,
- retrieval_context=example.retrieval_context,
- tools_called=example.tools_called,
- expected_tools=example.expected_tools,
  success=success,
  scorers_data=scorers_data,
  run_duration=None,
@@ -94,5 +88,4 @@ def create_process_example(
  additional_metadata=example.additional_metadata,
  trace_id=example.trace_id
  )
- return process_ex
-
+ return process_ex
@@ -7,8 +7,9 @@ from judgeval.common.logger import debug, error, warning, info
  from judgeval.constants import (
  JUDGMENT_DATASETS_PUSH_API_URL,
  JUDGMENT_DATASETS_PULL_API_URL,
- JUDGMENT_DATASETS_PULL_ALL_API_URL,
- JUDGMENT_DATASETS_EDIT_API_URL,
+ JUDGMENT_DATASETS_PROJECT_STATS_API_URL,
+ JUDGMENT_DATASETS_DELETE_API_URL,
+ JUDGMENT_DATASETS_INSERT_API_URL,
  JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
  )
  from judgeval.data import Example
@@ -25,7 +26,7 @@ class EvalDatasetClient:
  def create_dataset(self) -> EvalDataset:
  return EvalDataset(judgment_api_key=self.judgment_api_key)
 
- def push(self, dataset: EvalDataset, alias: str, overwrite: Optional[bool] = False) -> bool:
+ def push(self, dataset: EvalDataset, alias: str, project_name: str, overwrite: Optional[bool] = False) -> bool:
  debug(f"Pushing dataset with alias '{alias}' (overwrite={overwrite})")
  if overwrite:
  warning(f"Overwrite enabled for alias '{alias}'")
@@ -53,7 +54,8 @@ class EvalDatasetClient:
  total=100,
  )
  content = {
- "alias": alias,
+ "dataset_alias": alias,
+ "project_name": project_name,
  "examples": [e.to_dict() for e in dataset.examples],
  "overwrite": overwrite,
  }
@@ -88,7 +90,7 @@ class EvalDatasetClient:
  )
  return True
 
- def pull(self, alias: str) -> EvalDataset:
+ def pull(self, alias: str, project_name: str) -> EvalDataset:
  debug(f"Pulling dataset with alias '{alias}'")
  """
  Pulls the dataset from Judgment platform
@@ -96,7 +98,7 @@ class EvalDatasetClient:
  Mock request:
  {
  "alias": alias,
- "user_id": user_id
+ "project_name": project_name
  }
  ==>
  {
@@ -118,7 +120,8 @@ class EvalDatasetClient:
  total=100,
  )
  request_body = {
- "alias": alias,
+ "dataset_alias": alias,
+ "project_name": project_name
  }
 
  try:
@@ -139,24 +142,58 @@ class EvalDatasetClient:
 
  info(f"Successfully pulled dataset with alias '{alias}'")
  payload = response.json()
+
  dataset.examples = [Example(**e) for e in payload.get("examples", [])]
- dataset._alias = payload.get("_alias")
- dataset._id = payload.get("_id")
+ dataset._alias = payload.get("alias")
+ dataset._id = payload.get("id")
  progress.update(
  task_id,
  description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
  )
 
  return dataset
+
+ def delete(self, alias: str, project_name: str) -> bool:
+ with Progress(
+ SpinnerColumn(style="rgb(106,0,255)"),
+ TextColumn("[progress.description]{task.description}"),
+ transient=False,
+ ) as progress:
+ task_id = progress.add_task(
+ f"Deleting [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] from Judgment...",
+ total=100,
+ )
+ request_body = {
+ "dataset_alias": alias,
+ "project_name": project_name
+ }
 
- def pull_all_user_dataset_stats(self) -> dict:
- debug(f"Pulling user datasets stats for user_id: {self.judgment_api_key}'")
+ try:
+ response = requests.post(
+ JUDGMENT_DATASETS_DELETE_API_URL,
+ json=request_body,
+ headers={
+ "Content-Type": "application/json",
+ "Authorization": f"Bearer {self.judgment_api_key}",
+ "X-Organization-Id": self.organization_id
+ },
+ verify=True
+ )
+ response.raise_for_status()
+ except requests.exceptions.RequestException as e:
+ error(f"Error deleting dataset: {str(e)}")
+ raise
+
+ return True
+
+ def pull_project_dataset_stats(self, project_name: str) -> dict:
+ debug(f"Pulling project datasets stats for project_name: {project_name}'")
  """
- Pulls the user datasets stats from Judgment platform
+ Pulls the project datasets stats from Judgment platform
 
  Mock request:
  {
- "user_id": user_id
+ "project_name": project_name
  }
  ==>
  {
@@ -177,11 +214,12 @@ class EvalDatasetClient:
  total=100,
  )
  request_body = {
+ "project_name": project_name
  }
 
  try:
  response = requests.post(
- JUDGMENT_DATASETS_PULL_ALL_API_URL,
+ JUDGMENT_DATASETS_PROJECT_STATS_API_URL,
  json=request_body,
  headers={
  "Content-Type": "application/json",
@@ -205,7 +243,7 @@ class EvalDatasetClient:
 
  return payload
 
- def edit_dataset(self, alias: str, examples: List[Example]) -> bool:
+ def insert_dataset(self, alias: str, examples: List[Example], project_name: str) -> bool:
  """
  Edits the dataset on Judgment platform by adding new examples
 
@@ -213,7 +251,7 @@ class EvalDatasetClient:
  {
  "alias": alias,
  "examples": [...],
- "judgment_api_key": self.judgment_api_key
+ "project_name": project_name
  }
  """
  with Progress(
@@ -227,13 +265,14 @@ class EvalDatasetClient:
  )
 
  content = {
- "alias": alias,
+ "dataset_alias": alias,
  "examples": [e.to_dict() for e in examples],
+ "project_name": project_name
  }
 
  try:
  response = requests.post(
- JUDGMENT_DATASETS_EDIT_API_URL,
+ JUDGMENT_DATASETS_INSERT_API_URL,
  json=content,
  headers={
  "Content-Type": "application/json",
@@ -250,7 +289,7 @@ class EvalDatasetClient:
  info(f"Successfully edited dataset '{alias}'")
  return True
 
- def export_jsonl(self, alias: str) -> requests.Response:
+ def export_jsonl(self, alias: str, project_name: str) -> requests.Response:
  """Export dataset in JSONL format from Judgment platform"""
  debug(f"Exporting dataset with alias '{alias}' as JSONL")
  with Progress(
@@ -265,7 +304,7 @@ class EvalDatasetClient:
  try:
  response = requests.post(
  JUDGMENT_DATASETS_EXPORT_JSONL_API_URL,
- json={"alias": alias},
+ json={"dataset_alias": alias, "project_name": project_name},
  headers={
  "Content-Type": "application/json",
  "Authorization": f"Bearer {self.judgment_api_key}",
judgeval/data/result.py CHANGED
@@ -1,10 +1,11 @@
  from dataclasses import dataclass
  from typing import List, Union, Optional, Dict, Any, Union
+ from judgeval.common.logger import debug, error
+ from pydantic import BaseModel
+ from judgeval.data import ScorerData, Example
 
- from judgeval.data import ScorerData, ProcessExample
 
- @dataclass
- class ScoringResult:
+ class ScoringResult(BaseModel):
  """
  A ScoringResult contains the output of one or more scorers applied to a single example.
  Ie: One input, one actual_output, one expected_output, etc..., and 1+ scorer (Faithfulness, Hallucination, Summarization, etc...)
@@ -13,69 +14,44 @@ class ScoringResult:
  success (bool): Whether the evaluation was successful.
  This means that all scorers applied to this example returned a success.
  scorer_data (List[ScorerData]): The scorers data for the evaluated example
- input (Optional[str]): The input to the example
- actual_output (Optional[str]): The actual output of the example
- expected_output (Optional[str]): The expected output of the example
- context (Optional[List[str]]): The context of the example
- retrieval_context (Optional[List[str]]): The retrieval context of the example
- additional_metadata (Optional[Dict[str, Any]]): The additional metadata of the example
- tools_called (Optional[List[str]]): The tools called by the example
- expected_tools (Optional[List[str]]): The expected tools of the example
- trace_id (Optional[str]): The trace id of the example
+ data_object (Optional[Example]): The original example object that was used to create the ScoringResult, can be Example, CustomExample (future), WorkflowRun (future)
 
  """
  # Fields for scoring outputs
  success: bool # used for unit testing
  scorers_data: Union[List[ScorerData], None]
+ name: Optional[str] = None
 
- # Inputs from the original example
- input: Optional[str] = None
- actual_output: Optional[Union[str, List[str]]] = None
- expected_output: Optional[Union[str, List[str]]] = None
- context: Optional[List[str]] = None
- retrieval_context: Optional[List[str]] = None
- additional_metadata: Optional[Dict[str, Any]] = None
- tools_called: Optional[List[str]] = None
- expected_tools: Optional[List[str]] = None
+ # The original example object that was used to create the ScoringResult
+ data_object: Optional[Example] = None #can be Example, CustomExample (future), WorkflowRun (future)
  trace_id: Optional[str] = None
 
- example_id: Optional[str] = None
- eval_run_name: Optional[str] = None
+ # Additional fields for internal use
+ run_duration: Optional[float] = None
+ evaluation_cost: Optional[float] = None
 
  def to_dict(self) -> dict:
  """Convert the ScoringResult instance to a dictionary, properly serializing scorer_data."""
  return {
  "success": self.success,
  "scorers_data": [scorer_data.to_dict() for scorer_data in self.scorers_data] if self.scorers_data else None,
- "input": self.input,
- "actual_output": self.actual_output,
- "expected_output": self.expected_output,
- "context": self.context,
- "retrieval_context": self.retrieval_context,
- "additional_metadata": self.additional_metadata,
- "tools_called": self.tools_called,
- "expected_tools": self.expected_tools,
- "trace_id": self.trace_id,
- "example_id": self.example_id
+ "data_object": self.data_object.to_dict() if self.data_object else None,
  }
-
+
  def __str__(self) -> str:
  return f"ScoringResult(\
  success={self.success}, \
  scorer_data={self.scorers_data}, \
- input={self.input}, \
- actual_output={self.actual_output}, \
- expected_output={self.expected_output}, \
- context={self.context}, \
- retrieval_context={self.retrieval_context}, \
- additional_metadata={self.additional_metadata}, \
- tools_called={self.tools_called}, \
- expected_tools={self.expected_tools}, \
- trace_id={self.trace_id})"
+ data_object={self.data_object}, \
+ run_duration={self.run_duration}, \
+ evaluation_cost={self.evaluation_cost})"
 
 
  def generate_scoring_result(
- process_example: ProcessExample,
+ example: Example,
+ scorers_data: List[ScorerData],
+ run_duration: float,
+ success: bool,
  ) -> ScoringResult:
  """
  Creates a final ScoringResult object for an evaluation run based on the results from a completed LLMApiTestCase.
@@ -83,16 +59,18 @@ def generate_scoring_result(
  When an LLMTestCase is executed, it turns into an LLMApiTestCase and the progress of the evaluation run is tracked.
  At the end of the evaluation run, we create a TestResult object out of the completed LLMApiTestCase.
  """
- return ScoringResult(
- success=process_example.success,
- scorers_data=process_example.scorers_data,
- input=process_example.input,
- actual_output=process_example.actual_output,
- expected_output=process_example.expected_output,
- context=process_example.context,
- retrieval_context=process_example.retrieval_context,
- additional_metadata=process_example.additional_metadata,
- tools_called=process_example.tools_called,
- expected_tools=process_example.expected_tools,
- trace_id=process_example.trace_id
+ if example.name is not None:
+ name = example.name
+ else:
+ name = "Test Case Placeholder"
+ debug(f"No name provided for example, using default name: {name}")
+ debug(f"Creating ScoringResult for: {name}")
+ scoring_result = ScoringResult(
+ name=name,
+ data_object=example,
+ success=success,
+ scorers_data=scorers_data,
+ run_duration=run_duration,
+ evaluation_cost=None,
  )
+ return scoring_result
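
The net effect in result.py: ScoringResult becomes a pydantic model that keeps the evaluated example in data_object rather than copying its fields, and generate_scoring_result now takes the example plus scorer outputs directly. A minimal sketch of the new call shape, assuming Example accepts these optional keyword fields; values are placeholders:

    # usage sketch, not part of the package diff
    from judgeval.data import Example, generate_scoring_result

    example = Example(input="2 + 2 = ?", actual_output="4", name="arithmetic-check")
    result = generate_scoring_result(
        example=example,
        scorers_data=[],   # normally the List[ScorerData] produced by the scorers
        run_duration=1.25,
        success=True,
    )
    print(result.name, result.data_object.input, result.run_duration)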
@@ -34,6 +34,7 @@ class EvaluationRun(BaseModel):
  model: Union[str, List[str], JudgevalJudge]
  aggregator: Optional[str] = None
  metadata: Optional[Dict[str, Any]] = None
+ trace_span_id: Optional[str] = None
  # API Key will be "" until user calls client.run_eval(), then API Key will be set
  judgment_api_key: Optional[str] = ""
  override: Optional[bool] = False
@@ -27,7 +27,8 @@ from judgeval.constants import (
  JUDGMENT_EVAL_FETCH_API_URL,
  JUDGMENT_EVAL_DELETE_API_URL,
  JUDGMENT_EVAL_DELETE_PROJECT_API_URL,
- JUDGMENT_PROJECT_DELETE_API_URL
+ JUDGMENT_PROJECT_DELETE_API_URL,
+ JUDGMENT_PROJECT_CREATE_API_URL
  )
  from judgeval.common.exceptions import JudgmentAPIError
  from pydantic import BaseModel
@@ -43,8 +44,16 @@ class DeleteEvalRunRequestBody(BaseModel):
  project_name: str
  judgment_api_key: str
 
+ class SingletonMeta(type):
+ _instances = {}
 
- class JudgmentClient:
+ def __call__(cls, *args, **kwargs):
+ if cls not in cls._instances:
+ instance = super().__call__(*args, **kwargs)
+ cls._instances[cls] = instance
+ return cls._instances[cls]
+
+ class JudgmentClient(metaclass=SingletonMeta):
  def __init__(self, judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"), organization_id: str = os.getenv("JUDGMENT_ORG_ID")):
  self.judgment_api_key = judgment_api_key
  self.organization_id = organization_id
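
JudgmentClient is now built on a SingletonMeta metaclass, so constructing it repeatedly yields the same instance and __init__ only runs once. A standalone sketch of that metaclass pattern; the Client class here is illustrative, not the package's:

    # standalone sketch of the singleton pattern shown above
    class SingletonMeta(type):
        _instances = {}

        def __call__(cls, *args, **kwargs):
            if cls not in cls._instances:
                cls._instances[cls] = super().__call__(*args, **kwargs)
            return cls._instances[cls]


    class Client(metaclass=SingletonMeta):
        def __init__(self, api_key=None):
            self.api_key = api_key


    a = Client(api_key="key-1")
    b = Client(api_key="key-2")  # existing instance returned; __init__ does not run again
    assert a is b and a.api_key == "key-1"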
@@ -56,8 +65,8 @@ class JudgmentClient:
  # May be bad to output their invalid API key...
  raise JudgmentAPIError(f"Issue with passed in Judgment API key: {response}")
  else:
- print(f"Successfully initialized JudgmentClient, welcome back {response.get('detail', {}).get('user_name', 'user')}!")
-
+ print(f"Successfully initialized JudgmentClient!")
+
  def a_run_evaluation(
  self,
  examples: List[Example],
@@ -267,7 +276,7 @@ class JudgmentClient:
  def create_dataset(self) -> EvalDataset:
  return self.eval_dataset_client.create_dataset()
 
- def push_dataset(self, alias: str, dataset: EvalDataset, overwrite: Optional[bool] = False) -> bool:
+ def push_dataset(self, alias: str, dataset: EvalDataset, project_name: str, overwrite: Optional[bool] = False) -> bool:
  """
  Uploads an `EvalDataset` to the Judgment platform for storage.
 
@@ -281,9 +290,9 @@ class JudgmentClient:
  """
  # Set judgment_api_key just in case it was not set
  dataset.judgment_api_key = self.judgment_api_key
- return self.eval_dataset_client.push(dataset, alias, overwrite)
+ return self.eval_dataset_client.push(dataset, alias, project_name, overwrite)
 
- def pull_dataset(self, alias: str) -> EvalDataset:
+ def pull_dataset(self, alias: str, project_name: str) -> EvalDataset:
  """
  Retrieves a saved `EvalDataset` from the Judgment platform.
 
@@ -293,25 +302,31 @@ class JudgmentClient:
  Returns:
  EvalDataset: The retrieved dataset
  """
- return self.eval_dataset_client.pull(alias)
+ return self.eval_dataset_client.pull(alias, project_name)
+
+ def delete_dataset(self, alias: str, project_name: str) -> bool:
+ """
+ Deletes a saved `EvalDataset` from the Judgment platform.
+ """
+ return self.eval_dataset_client.delete(alias, project_name)
 
- def pull_all_user_dataset_stats(self) -> dict:
+ def pull_project_dataset_stats(self, project_name: str) -> dict:
  """
- Retrieves all dataset stats from the Judgment platform for the user.
+ Retrieves all dataset stats from the Judgment platform for the project.
 
  Args:
- alias (str): The name of the dataset to retrieve
+ project_name (str): The name of the project to retrieve
 
  Returns:
- EvalDataset: The retrieved dataset
+ dict: The retrieved dataset stats
  """
- return self.eval_dataset_client.pull_all_user_dataset_stats()
+ return self.eval_dataset_client.pull_project_dataset_stats(project_name)
 
- def edit_dataset(self, alias: str, examples: List[Example]) -> bool:
+ def insert_dataset(self, alias: str, examples: List[Example], project_name: str) -> bool:
  """
  Edits the dataset on Judgment platform by adding new examples
  """
- return self.eval_dataset_client.edit_dataset(alias, examples)
+ return self.eval_dataset_client.insert_dataset(alias, examples, project_name)
 
  # Maybe add option where you can pass in the EvaluationRun object and it will pull the eval results from the backend
  def pull_eval(self, project_name: str, eval_run_name: str) -> List[Dict[str, Union[str, List[ScoringResult]]]]:
@@ -402,6 +417,23 @@ class JudgmentClient:
  raise ValueError(f"Error deleting eval results: {response.json()}")
  return response.json()
 
+ def create_project(self, project_name: str) -> bool:
+ """
+ Creates a project on the server.
+ """
+ response = requests.post(JUDGMENT_PROJECT_CREATE_API_URL,
+ json={
+ "project_name": project_name,
+ },
+ headers={
+ "Content-Type": "application/json",
+ "Authorization": f"Bearer {self.judgment_api_key}",
+ "X-Organization-Id": self.organization_id
+ })
+ if response.status_code != requests.codes.ok:
+ raise ValueError(f"Error creating project: {response.json()}")
+ return response.json()
+
  def delete_project(self, project_name: str) -> bool:
  """
  Deletes a project from the server. Which also deletes all evaluations and traces associated with the project.
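
At the JudgmentClient level the same project scoping flows through the dataset helpers, and create_project complements the existing delete_project. A hedged end-to-end sketch against the signatures above; the project and alias names are placeholders, and credentials come from JUDGMENT_API_KEY and JUDGMENT_ORG_ID per __init__:

    # usage sketch, not part of the package diff; names are placeholders
    client = JudgmentClient()  # reads JUDGMENT_API_KEY / JUDGMENT_ORG_ID from the environment

    client.create_project("demo-project")

    dataset = client.create_dataset()
    client.push_dataset(alias="qa-regression", dataset=dataset, project_name="demo-project")

    pulled = client.pull_dataset("qa-regression", project_name="demo-project")
    stats = client.pull_project_dataset_stats("demo-project")
    client.insert_dataset("qa-regression", examples=pulled.examples, project_name="demo-project")

    client.delete_dataset("qa-regression", project_name="demo-project")
    client.delete_project("demo-project")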
@@ -117,21 +117,23 @@ def merge_results(api_results: List[ScoringResult], local_results: List[ScoringR
 
  # Each ScoringResult in api and local have all the same fields besides `scorers_data`
  for api_result, local_result in zip(api_results, local_results):
- if api_result.input != local_result.input:
+ if not (api_result.data_object and local_result.data_object):
+ raise ValueError("Data object is None in one of the results.")
+ if api_result.data_object.input != local_result.data_object.input:
  raise ValueError("The API and local results are not aligned.")
- if api_result.actual_output != local_result.actual_output:
+ if api_result.data_object.actual_output != local_result.data_object.actual_output:
  raise ValueError("The API and local results are not aligned.")
- if api_result.expected_output != local_result.expected_output:
+ if api_result.data_object.expected_output != local_result.data_object.expected_output:
  raise ValueError("The API and local results are not aligned.")
- if api_result.context != local_result.context:
+ if api_result.data_object.context != local_result.data_object.context:
  raise ValueError("The API and local results are not aligned.")
- if api_result.retrieval_context != local_result.retrieval_context:
+ if api_result.data_object.retrieval_context != local_result.data_object.retrieval_context:
  raise ValueError("The API and local results are not aligned.")
- if api_result.additional_metadata != local_result.additional_metadata:
+ if api_result.data_object.additional_metadata != local_result.data_object.additional_metadata:
  raise ValueError("The API and local results are not aligned.")
- if api_result.tools_called != local_result.tools_called:
+ if api_result.data_object.tools_called != local_result.data_object.tools_called:
  raise ValueError("The API and local results are not aligned.")
- if api_result.expected_tools != local_result.expected_tools:
+ if api_result.data_object.expected_tools != local_result.data_object.expected_tools:
  raise ValueError("The API and local results are not aligned.")
 
 
@@ -422,23 +424,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
 
  # Convert the response data to `ScoringResult` objects
  debug("Processing API results")
- for idx, result in enumerate(response_data["results"]):
- with example_logging_context(evaluation_run.examples[idx].timestamp, evaluation_run.examples[idx].example_id):
- for scorer in judgment_scorers:
- debug(f"Processing API result for example {idx} and scorer {scorer.score_type}")
- # filter for key-value pairs that are used to initialize ScoringResult
- # there may be some stuff in here that doesn't belong in ScoringResult
- # TODO: come back and refactor this to have ScoringResult take in **kwargs
- filtered_result = {k: v for k, v in result.items() if k in ScoringResult.__annotations__}
-
- # Convert scorers_data dicts to ScorerData objects
- if "scorers_data" in filtered_result and filtered_result["scorers_data"]:
- filtered_result["scorers_data"] = [
- ScorerData(**scorer_dict)
- for scorer_dict in filtered_result["scorers_data"]
- ]
-
- api_results.append(ScoringResult(**filtered_result))
+ api_results = [ScoringResult(**result) for result in response_data["results"]]
  # Run local evals
  if local_scorers: # List[JudgevalScorer]
  # We should be removing local scorers soon
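
Because ScoringResult is now a pydantic BaseModel whose data_object nests the example, each API result row can be parsed in a single constructor call, replacing the removed field-filtering loop. A minimal sketch of that parse, assuming the rows use the field names from result.py and that Example is itself a pydantic model so the nested dict is coerced automatically; the row contents are placeholders:

    # illustrative parse, not part of the package diff
    from judgeval.data import ScoringResult

    response_rows = [
        {
            "success": True,
            "scorers_data": None,  # scorer payloads omitted for brevity
            "data_object": {"input": "What is 2 + 2?", "actual_output": "4"},
        },
    ]
    api_results = [ScoringResult(**row) for row in response_rows]
    print(api_results[0].data_object.actual_output)  # -> 4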
@@ -477,7 +463,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
  # judgment_api_key=evaluation_run.judgment_api_key,
  # organization_id=evaluation_run.organization_id
  # )
-
+ # print(merged_results)
  if evaluation_run.log_results:
  pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, merged_results, evaluation_run)
  rprint(pretty_str)
@@ -504,15 +490,14 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
 
  # Create a test case context with all relevant fields
  test_case = {
- 'input': result.input,
- 'actual_output': result.actual_output,
- 'expected_output': result.expected_output,
- 'context': result.context,
- 'retrieval_context': result.retrieval_context,
- 'additional_metadata': result.additional_metadata,
- 'tools_called': result.tools_called,
- 'expected_tools': result.expected_tools,
- 'eval_run_name': result.eval_run_name,
+ 'input': result.data_object.input,
+ 'actual_output': result.data_object.actual_output,
+ 'expected_output': result.data_object.expected_output,
+ 'context': result.data_object.context,
+ 'retrieval_context': result.data_object.retrieval_context,
+ 'additional_metadata': result.data_object.additional_metadata,
+ 'tools_called': result.data_object.tools_called,
+ 'expected_tools': result.data_object.expected_tools,
  'failed_scorers': []
  }
  if result.scorers_data:
@@ -533,7 +518,6 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
  error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
  error_msg += f"Tools Called: {fail_case['tools_called']}\n"
  error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"
- error_msg += f"Eval Run Name: {fail_case['eval_run_name']}\n"
 
  for fail_scorer in fail_case['failed_scorers']: