judgeval 0.2.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
judgeval/dataset.py ADDED
@@ -0,0 +1,192 @@
+import datetime
+import orjson
+import os
+import yaml
+from dataclasses import dataclass
+from typing import List, Literal, Optional
+
+from judgeval.data import Example, Trace
+from judgeval.utils.file_utils import get_examples_from_yaml, get_examples_from_json
+from judgeval.common.api.api import JudgmentApiClient
+from judgeval.common.logger import judgeval_logger
+
+
+@dataclass
+class Dataset:
+    examples: List[Example]
+    traces: List[Trace]
+    name: str
+    project_name: str
+    judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or ""
+    organization_id: str = os.getenv("JUDGMENT_ORG_ID") or ""
+
+    @classmethod
+    def get(
+        cls,
+        name: str,
+        project_name: str,
+    ):
+        client = JudgmentApiClient(cls.judgment_api_key, cls.organization_id)
+        dataset = client.pull_dataset(name, project_name)
+        if not dataset:
+            judgeval_logger.error(f"Dataset {name} not found in project {project_name}")
+            raise ValueError(f"Dataset {name} not found in project {project_name}")
+        examples = dataset.get("examples", [])
+        for e in examples:
+            if isinstance(e, dict) and isinstance(e.get("data"), dict):
+                e.update(e.pop("data"))
+        return cls(
+            name=name,
+            project_name=project_name,
+            examples=[Example(**e) for e in examples],
+            traces=[Trace(**t) for t in dataset.get("traces", [])],
+        )
+
+    @classmethod
+    def create(
+        cls,
+        name: str,
+        project_name: str,
+        examples: Optional[List[Example]] = None,
+        traces: Optional[List[Trace]] = None,
+        overwrite: bool = False,
+    ):
+        if examples and traces:
+            raise ValueError("Only one of examples or traces must be provided")
+
+        if not examples:
+            examples = []
+
+        if not traces:
+            traces = []
+
+        client = JudgmentApiClient(cls.judgment_api_key, cls.organization_id)
+        client.push_dataset(
+            name,
+            project_name,
+            examples=[e.model_dump() for e in examples],
+            traces=[t.model_dump() for t in traces],
+            overwrite=overwrite,
+        )
+        return cls(
+            name=name,
+            project_name=project_name,
+            examples=examples,
+            traces=traces,
+        )
+
+    def add_from_json(self, file_path: str) -> None:
+        """
+        Adds examples from a JSON file.
+
+        The JSON file is expected to have the following format:
+        [
+            {
+                "key_01": "value_01",
+                "key_02": "value_02"
+            },
+            {
+                "key_11": "value_11",
+                "key_12": "value_12",
+                "key_13": "value_13"
+            },
+            ...
+        ]
+        """
+        examples = get_examples_from_json(file_path)
+        self.add_examples(examples)
+
+    def add_from_yaml(self, file_path: str) -> None:
+        """
+        Adds examples from a YAML file.
+
+        The YAML file is expected to have the following format:
+        - key_01: value_01
+          key_02: value_02
+        - key_11: value_11
+          key_12: value_12
+          key_13: value_13
+        ...
+        """
+
+        examples = get_examples_from_yaml(file_path)
+        self.add_examples(examples)
+
+    def add_examples(self, examples: List[Example]) -> None:
+        client = JudgmentApiClient(self.judgment_api_key, self.organization_id)
+        client.append_examples(
+            dataset_alias=self.name,
+            project_name=self.project_name,
+            examples=[e.model_dump() for e in examples],
+        )
+
+    def add_traces(self, traces: List[Trace]) -> None:
+        client = JudgmentApiClient(self.judgment_api_key, self.organization_id)
+        client.append_traces(
+            dataset_alias=self.name,
+            project_name=self.project_name,
+            traces=[t.model_dump() for t in traces],
+        )
+
+    def save_as(
+        self,
+        file_type: Literal["json", "yaml"],
+        dir_path: str,
+        save_name: str | None = None,
+    ) -> None:
+        """
+        Saves the dataset as a file. Save only the examples.
+
+        Args:
+            file_type (Literal["json", "csv"]): The file type to save the dataset as.
+            dir_path (str): The directory path to save the file to.
+            save_name (str, optional): The name of the file to save. Defaults to None.
+        """
+        if not os.path.exists(dir_path):
+            os.makedirs(dir_path)
+        file_name = (
+            datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+            if save_name is None
+            else save_name
+        )
+        complete_path = os.path.join(dir_path, f"{file_name}.{file_type}")
+        if file_type == "json":
+            with open(complete_path, "wb") as file:
+                file.write(
+                    orjson.dumps(
+                        {
+                            "examples": [e.to_dict() for e in self.examples],
+                        },
+                        option=orjson.OPT_INDENT_2,
+                    )
+                )
+        elif file_type == "yaml":
+            with open(complete_path, "w") as file:
+                yaml_data = {
+                    "examples": [e.to_dict() for e in self.examples],
+                }
+                yaml.dump(yaml_data, file, default_flow_style=False)
+        else:
+            ACCEPTABLE_FILE_TYPES = ["json", "yaml"]
+            raise TypeError(
+                f"Invalid file type: {file_type}. Please choose from {ACCEPTABLE_FILE_TYPES}"
+            )
+
+    def delete(self):
+        client = JudgmentApiClient(self.judgment_api_key, self.organization_id)
+        client.delete_dataset(self.name, self.project_name)
+
+    def __iter__(self):
+        return iter(self.examples)
+
+    def __len__(self):
+        return len(self.examples)
+
+    def __str__(self):
+        return (
+            f"{self.__class__.__name__}("
+            f"examples={self.examples}, "
+            f"traces={self.traces}, "
+            f"name={self.name}"
+            f")"
+        )
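The new judgeval/dataset.py module centers dataset management on a single Dataset dataclass. A minimal usage sketch, not taken from the diff itself: it assumes JUDGMENT_API_KEY and JUDGMENT_ORG_ID are set in the environment and that Example accepts arbitrary keyword fields (as the flat JSON/YAML formats above suggest); the dataset and file names are hypothetical.

    from judgeval.data import Example
    from judgeval.dataset import Dataset

    # Create a dataset on the Judgment platform from in-memory examples.
    dataset = Dataset.create(
        name="qa_regression",              # hypothetical dataset name
        project_name="default_project",
        examples=[Example(input="What is 2 + 2?", expected_output="4")],
    )

    # Append examples from a flat-list JSON file, then keep a local copy.
    dataset.add_from_json("new_examples.json")   # hypothetical path
    dataset.save_as("yaml", dir_path="./exports", save_name="qa_regression")

    # Pull the dataset back down later and iterate over its examples.
    for example in Dataset.get(name="qa_regression", project_name="default_project"):
        print(example)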
@@ -36,6 +36,7 @@ class EvaluationRun(BaseModel):
         data["scorers"] = [
             scorer.model_dump() for scorer in self.scorers
         ]  # Pydantic has problems with properly calling model_dump() on the scorers, so we need to do it manually
+        data["examples"] = [example.model_dump() for example in self.examples]
 
         return data
 
@@ -22,7 +22,7 @@ class LiteLLMJudge(JudgevalJudge):
     def generate(
         self,
         input: Union[str, List[Mapping[str, str]]],
-        schema: pydantic.BaseModel = None,
+        schema: Union[pydantic.BaseModel, None] = None,
     ) -> str:
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
@@ -42,7 +42,7 @@ class LiteLLMJudge(JudgevalJudge):
     async def a_generate(
         self,
         input: Union[str, List[Mapping[str, str]]],
-        schema: pydantic.BaseModel = None,
+        schema: Union[pydantic.BaseModel, None] = None,
     ) -> str:
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
@@ -18,8 +18,8 @@ from judgeval.common.logger import judgeval_logger
 
 def build_dynamic_mixture_prompt(
     judge_responses: List[str],
-    custom_system_prompt: str | None = None,
-    custom_conversation_history: List[dict] | None = None,
+    custom_system_prompt: Union[str, None] = None,
+    custom_conversation_history: Union[List[dict], None] = None,
 ) -> List[dict]:
     """
     Dynamically builds a prompt to mix judge responses together for the Mixture of Judges model.
@@ -178,8 +178,8 @@ class MixtureOfJudges(JudgevalJudge):
     def generate(
         self,
         input: Union[str, List[dict]],
-        response_schema: pydantic.BaseModel = None,
-        aggregation_schema: pydantic.BaseModel = None,
+        response_schema: Union[pydantic.BaseModel, None] = None,
+        aggregation_schema: Union[pydantic.BaseModel, None] = None,
         **kwargs,
     ) -> str:
         """
@@ -230,8 +230,8 @@ class MixtureOfJudges(JudgevalJudge):
     async def a_generate(
         self,
         input: Union[str, List[dict]],
-        response_schema: pydantic.BaseModel = None,
-        aggregation_schema: pydantic.BaseModel = None,
+        response_schema: Union[pydantic.BaseModel, None] = None,
+        aggregation_schema: Union[pydantic.BaseModel, None] = None,
         **kwargs,
     ) -> str:
         """
@@ -11,6 +11,7 @@ from judgeval.common.utils import (
     afetch_together_api_response,
 )
 from judgeval.common.logger import judgeval_logger
+from judgeval.constants import DEFAULT_TOGETHER_MODEL
 
 BASE_CONVERSATION = [
     {"role": "system", "content": "You are a helpful assistant."},
@@ -18,13 +19,15 @@ BASE_CONVERSATION = [
 
 
 class TogetherJudge(JudgevalJudge):
-    def __init__(self, model: str = "Qwen/Qwen2.5-72B-Instruct-Turbo", **kwargs):
+    def __init__(self, model: str = DEFAULT_TOGETHER_MODEL, **kwargs):
         self.model = model
         self.kwargs = kwargs
         super().__init__(model_name=model)
 
     # TODO: Fix cost for generate and a_generate
-    def generate(self, input: Union[str, List[dict]], schema: BaseModel = None) -> str:
+    def generate(
+        self, input: Union[str, List[dict]], schema: Union[BaseModel, None] = None
+    ) -> str:
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
             return fetch_together_api_response(
@@ -40,7 +43,7 @@ class TogetherJudge(JudgevalJudge):
             raise TypeError("Input must be a string or a list of dictionaries.")
 
     async def a_generate(
-        self, input: Union[str, List[dict]], schema: BaseModel = None
+        self, input: Union[str, List[dict]], schema: Union[BaseModel, None] = None
     ) -> str:
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
@@ -6,7 +6,6 @@ import os
 from uuid import uuid4
 from typing import Optional, List, Dict, Any, Union, Callable
 
-from judgeval.data.datasets import EvalDataset, EvalDatasetClient
 from judgeval.data import (
     ScoringResult,
     Example,
@@ -25,11 +24,11 @@ from judgeval.run_evaluation import (
 from judgeval.data.trace_run import TraceRun
 from judgeval.common.api import JudgmentApiClient
 from judgeval.common.exceptions import JudgmentAPIError
-from langchain_core.callbacks import BaseCallbackHandler
 from judgeval.common.tracer import Tracer
 from judgeval.common.utils import validate_api_key
 from pydantic import BaseModel
 from judgeval.common.logger import judgeval_logger
+from judgeval.integrations.langgraph import JudgevalCallbackHandler
 
 
 class EvalRunRequestBody(BaseModel):
@@ -71,7 +70,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         self.judgment_api_key = api_key
         self.organization_id = organization_id
         self.api_client = JudgmentApiClient(api_key, organization_id)
-        self.eval_dataset_client = EvalDatasetClient(api_key, organization_id)
 
         # Verify API key is valid
         result, response = validate_api_key(api_key)
@@ -86,7 +84,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         scorers: List[Union[APIScorerConfig, BaseScorer]],
         examples: Optional[List[Example]] = None,
         function: Optional[Callable] = None,
-        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
+        tracer: Optional[Union[Tracer, JudgevalCallbackHandler]] = None,
         traces: Optional[List[Trace]] = None,
         tools: Optional[List[Dict[str, Any]]] = None,
         project_name: str = "default_project",
@@ -178,70 +176,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         except Exception as e:
             raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
 
-    def create_dataset(self) -> EvalDataset:
-        return self.eval_dataset_client.create_dataset()
-
-    def push_dataset(
-        self,
-        alias: str,
-        dataset: EvalDataset,
-        project_name: str,
-        overwrite: Optional[bool] = False,
-    ) -> bool:
-        """
-        Uploads an `EvalDataset` to the Judgment platform for storage.
-
-        Args:
-            alias (str): The name to use for the dataset
-            dataset (EvalDataset): The dataset to upload to Judgment
-            overwrite (Optional[bool]): Whether to overwrite the dataset if it already exists
-
-        Returns:
-            bool: Whether the dataset was successfully uploaded
-        """
-        # Set judgment_api_key just in case it was not set
-        dataset.judgment_api_key = self.judgment_api_key
-        return self.eval_dataset_client.push(dataset, alias, project_name, overwrite)
-
-    def append_dataset(
-        self, alias: str, examples: List[Example], project_name: str
-    ) -> bool:
-        """
-        Appends an `EvalDataset` to the Judgment platform for storage.
-        """
-        return self.eval_dataset_client.append_examples(alias, examples, project_name)
-
-    def pull_dataset(self, alias: str, project_name: str) -> EvalDataset:
-        """
-        Retrieves a saved `EvalDataset` from the Judgment platform.
-
-        Args:
-            alias (str): The name of the dataset to retrieve
-
-        Returns:
-            EvalDataset: The retrieved dataset
-        """
-        return self.eval_dataset_client.pull(alias, project_name)
-
-    def delete_dataset(self, alias: str, project_name: str) -> bool:
-        """
-        Deletes a saved `EvalDataset` from the Judgment platform.
-        """
-        return self.eval_dataset_client.delete(alias, project_name)
-
-    def pull_project_dataset_stats(self, project_name: str) -> dict:
-        """
-        Retrieves all dataset stats from the Judgment platform for the project.
-
-        Args:
-            project_name (str): The name of the project to retrieve
-
-        Returns:
-            dict: The retrieved dataset stats
-        """
-        return self.eval_dataset_client.pull_project_dataset_stats(project_name)
-
-    # Maybe add option where you can pass in the EvaluationRun object and it will pull the eval results from the backend
     def pull_eval(
         self, project_name: str, eval_run_name: str
     ) -> List[Dict[str, Union[str, List[ScoringResult]]]]:
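The dataset helpers removed from JudgmentClient above appear to be superseded by the new Dataset class added in judgeval/dataset.py. A rough migration sketch under that assumption; the 0.2.0 calls are shown in comments, and the alias/project names are placeholders.

    from judgeval.data import Example
    from judgeval.dataset import Dataset

    examples = [Example(input="What is 2 + 2?", expected_output="4")]

    # 0.2.0: client.push_dataset(alias, dataset, project_name, overwrite=True)
    dataset = Dataset.create(
        name="my_dataset", project_name="my_project", examples=examples, overwrite=True
    )

    # 0.2.0: client.pull_dataset(alias, project_name)
    dataset = Dataset.get(name="my_dataset", project_name="my_project")

    # 0.2.0: client.append_dataset(alias, examples, project_name)
    dataset.add_examples(examples)

    # 0.2.0: client.delete_dataset(alias, project_name)
    dataset.delete()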
@@ -262,8 +196,12 @@
         """
         Creates a project on the server.
         """
-        self.api_client.create_project(project_name)
-        return True
+        try:
+            self.api_client.create_project(project_name)
+            return True
+        except Exception as e:
+            judgeval_logger.error(f"Error creating project: {e}")
+            return False
 
     def delete_project(self, project_name: str) -> bool:
         """
@@ -314,7 +252,7 @@
         scorers: List[Union[APIScorerConfig, BaseScorer]],
         examples: Optional[List[Example]] = None,
         function: Optional[Callable] = None,
-        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
+        tracer: Optional[Union[Tracer, JudgevalCallbackHandler]] = None,
         traces: Optional[List[Trace]] = None,
         tools: Optional[List[Dict[str, Any]]] = None,
         model: Optional[str] = "gpt-4.1",
@@ -1,7 +1,7 @@
 import asyncio
 import concurrent.futures
 import time
-import json
+import orjson
 import sys
 import threading
 from typing import List, Dict, Union, Optional, Callable, Tuple, Any
@@ -20,7 +20,7 @@ from judgeval.common.logger import judgeval_logger
 from judgeval.evaluation_run import EvaluationRun
 from judgeval.data.trace_run import TraceRun
 from judgeval.common.tracer import Tracer
-from langchain_core.callbacks import BaseCallbackHandler
+from judgeval.integrations.langgraph import JudgevalCallbackHandler
 
 
 def safe_run_async(coro):
@@ -191,6 +191,24 @@ def check_eval_run_name_exists(
         raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
 
 
+def check_example_keys(
+    keys: List[str],
+    eval_name: str,
+    project_name: str,
+    judgment_api_key: str,
+    organization_id: str,
+) -> None:
+    """
+    Checks if the current experiment (if one exists) has the same keys for example
+    """
+    api_client = JudgmentApiClient(judgment_api_key, organization_id)
+    try:
+        api_client.check_example_keys(keys, eval_name, project_name)
+    except Exception as e:
+        judgeval_logger.error(f"Failed to check if example keys match: {str(e)}")
+        raise JudgmentAPIError(f"Failed to check if example keys match: {str(e)}")
+
+
 def log_evaluation_results(
     scoring_results: List[ScoringResult],
     run: Union[EvaluationRun, TraceRun],
@@ -245,7 +263,9 @@ def check_examples(
                     f"[yellow]⚠️ WARNING:[/yellow] Example is missing required parameters for scorer [bold]{scorer.score_type.value}[/bold]"
                 )
                 rprint(f"Missing parameters: {', '.join(missing_params)}")
-                rprint(f"Example: {json.dumps(example.model_dump(), indent=2)}")
+                rprint(
+                    f"Example: {orjson.dumps(example.model_dump(), option=orjson.OPT_INDENT_2).decode('utf-8')}"
+                )
                 rprint("-" * 40)
                 prompt_user = True
 
@@ -262,7 +282,7 @@ def run_trace_eval(
     judgment_api_key: str,
     override: bool = False,
     function: Optional[Callable] = None,
-    tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
+    tracer: Optional[Union[Tracer, JudgevalCallbackHandler]] = None,
     examples: Optional[List[Example]] = None,
 ) -> List[ScoringResult]:
     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
@@ -394,7 +414,7 @@ def _poll_evaluation_until_complete(
     expected_scorer_data_count: int,
     poll_interval_seconds: float = 5,
     max_failures: int = 5,
-    max_poll_count: int = 24,  # This should be equivalent to 120 seconds
+    max_poll_count: int = 60,  # This should be equivalent to 5 minutes
 ) -> Tuple[List[ScoringResult], str]:
     """
     Polls until the evaluation is complete and returns the results.
@@ -500,6 +520,14 @@ def run_eval(
     Returns:
         List[ScoringResult]: A list of ScoringResult objects
     """
+    # Check that every example has the same keys
+    keys = evaluation_run.examples[0].get_fields().keys()
+    for example in evaluation_run.examples:
+        current_keys = example.get_fields().keys()
+        if current_keys != keys:
+            raise ValueError(
+                f"All examples must have the same keys: {current_keys} != {keys}"
+            )
 
     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
     if not override and not evaluation_run.append:
@@ -520,9 +548,14 @@ def run_eval(
             False,
         )
 
-    # Set example IDs if not already set
-    for idx, example in enumerate(evaluation_run.examples):
-        example.example_index = idx  # Set numeric index
+    # Ensure that current experiment (if one exists) has the same keys for example
+    check_example_keys(
+        keys=list(keys),
+        eval_name=evaluation_run.eval_name,
+        project_name=evaluation_run.project_name,
+        judgment_api_key=judgment_api_key,
+        organization_id=evaluation_run.organization_id,
+    )
 
     judgment_scorers: List[APIScorerConfig] = []
     local_scorers: List[BaseScorer] = []
@@ -601,7 +634,6 @@ def run_eval(
     send_results = [
        scoring_result.model_dump(warnings=False) for scoring_result in results
     ]
-
     url = log_evaluation_results(send_results, evaluation_run, judgment_api_key)
     rprint(
         f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
judgeval/scorers/score.py CHANGED
@@ -30,15 +30,19 @@ async def safe_a_score_example(
     Args:
         scorer (BaseScorer): The `BaseScorer` to use for scoring the example.
         example (Example): The `Example` to be scored.
-
-        ignore_errors (bool): Whether to ignore errors during the evaluation.
-            If set to false, any error will be raised and stop the evaluation.
-            If set to true, the error will be stored in the `error` attribute of the `BaseScorer` and the `success` attribute will be set to False.
-
-        skip_on_missing_params (bool): Whether to skip the test case if required parameters are missing.
     """
     try:
-        scorer.score = await scorer.a_score_example(example)
+        score = await scorer.a_score_example(example)
+        if score is None:
+            raise Exception("a_score_example need to return a score")
+        elif score < 0:
+            judgeval_logger.warning("score cannot be less than 0 , setting to 0")
+            score = 0
+        elif score > 1:
+            judgeval_logger.warning("score cannot be greater than 1 , setting to 1")
+            score = 1
+        else:
+            scorer.score = score
         scorer.success = scorer.success_check()
     except Exception as e:
         judgeval_logger.error(f"Error during scoring: {str(e)}")
judgeval/scorers/utils.py CHANGED
@@ -4,7 +4,7 @@ Util functions for Scorer objects
 
 import asyncio
 import nest_asyncio
-import json
+import orjson
 import re
 from typing import List, Optional
 
@@ -48,8 +48,8 @@ def parse_response_json(llm_response: str, scorer: Optional[BaseScorer] = None)
    )  # Remove trailing comma if present
 
    try:
-        return json.loads(json_str)
-    except json.JSONDecodeError:
+        return orjson.loads(json_str)
+    except orjson.JSONDecodeError:
        error_str = "Evaluation LLM outputted an invalid JSON. Please use a stronger evaluation model."
        if scorer is not None:
            scorer.error = error_str
@@ -1,4 +1,5 @@
 import yaml
+import orjson
 from typing import List
 from judgeval.common.logger import judgeval_logger
 
@@ -9,37 +10,19 @@ def get_examples_from_yaml(file_path: str) -> List[Example] | None:
    """
    Adds examples from a YAML file.
 
-    The format of the YAML file is expected to be a dictionary with one key: "examples".
-    The value of the key is a list of dictionaries, where each dictionary represents an example.
-
    The YAML file is expected to have the following format:
-    examples:
-      - input: "test input"
-        actual_output: "test output"
-        expected_output: "expected output"
-        context:
-          - "context1"
-          - "context2"
-        retrieval_context:
-          - "retrieval1"
-        additional_metadata:
-          key: "value"
-        tools_called:
-          - "tool1"
-        expected_tools:
-          - {tool_name: "tool1", parameters: {"query": "test query 1"}}
-          - {tool_name: "tool2", parameters: {"query": "test query 2"}}
-        name: "test example"
-        example_id: null
-        timestamp: "20241230_160117"
-        trace_id: "123"
+    - key_01: value_01
+      key_02: value_02
+    - key_11: value_11
+      key_12: value_12
+      key_13: value_13
+    ...
    """
    try:
        with open(file_path, "r") as file:
            payload = yaml.safe_load(file)
            if payload is None:
                raise ValueError("The YAML file is empty.")
-        examples = payload.get("examples", [])
    except FileNotFoundError:
        judgeval_logger.error(f"YAML file not found: {file_path}")
        raise FileNotFoundError(f"The file {file_path} was not found.")
@@ -47,5 +30,37 @@ def get_examples_from_yaml(file_path: str) -> List[Example] | None:
        judgeval_logger.error(f"Invalid YAML file: {file_path}")
        raise ValueError(f"The file {file_path} is not a valid YAML file.")
 
-    new_examples = [Example(**e) for e in examples]
+    new_examples = [Example(**e) for e in payload]
+    return new_examples
+
+
+def get_examples_from_json(file_path: str) -> List[Example] | None:
+    """
+    Adds examples from a JSON file.
+
+    The JSON file is expected to have the following format:
+    [
+        {
+            "key_01": "value_01",
+            "key_02": "value_02"
+        },
+        {
+            "key_11": "value_11",
+            "key_12": "value_12",
+            "key_13": "value_13"
+        },
+        ...
+    ]
+    """
+    try:
+        with open(file_path, "rb") as file:
+            payload = orjson.loads(file.read())
+    except FileNotFoundError:
+        judgeval_logger.error(f"JSON file not found: {file_path}")
+        raise FileNotFoundError(f"The file {file_path} was not found.")
+    except orjson.JSONDecodeError:
+        judgeval_logger.error(f"Invalid JSON file: {file_path}")
+        raise ValueError(f"The file {file_path} is not a valid JSON file.")
+
+    new_examples = [Example(**e) for e in payload]
    return new_examples
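get_examples_from_yaml and get_examples_from_json now expect a flat list of example dictionaries rather than a top-level "examples" key. A small sketch of loading the new format, with illustrative field names and the assumption that Example accepts arbitrary keyword fields:

    from judgeval.utils.file_utils import get_examples_from_yaml

    # Write a file in the new flat-list format (field names are illustrative).
    with open("examples.yaml", "w") as f:
        f.write(
            "- question: What is 2 + 2?\n"
            "  answer: '4'\n"
            "- question: Capital of France?\n"
            "  answer: Paris\n"
        )

    examples = get_examples_from_yaml("examples.yaml")
    print(len(examples))  # 2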