judgeval 0.0.44__py3-none-any.whl → 0.0.46__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. judgeval/__init__.py +5 -4
  2. judgeval/clients.py +6 -6
  3. judgeval/common/__init__.py +7 -2
  4. judgeval/common/exceptions.py +2 -3
  5. judgeval/common/logger.py +74 -49
  6. judgeval/common/s3_storage.py +30 -23
  7. judgeval/common/tracer.py +1273 -939
  8. judgeval/common/utils.py +416 -244
  9. judgeval/constants.py +73 -61
  10. judgeval/data/__init__.py +1 -1
  11. judgeval/data/custom_example.py +3 -2
  12. judgeval/data/datasets/dataset.py +80 -54
  13. judgeval/data/datasets/eval_dataset_client.py +131 -181
  14. judgeval/data/example.py +67 -43
  15. judgeval/data/result.py +11 -9
  16. judgeval/data/scorer_data.py +4 -2
  17. judgeval/data/tool.py +25 -16
  18. judgeval/data/trace.py +57 -29
  19. judgeval/data/trace_run.py +5 -11
  20. judgeval/evaluation_run.py +22 -82
  21. judgeval/integrations/langgraph.py +546 -184
  22. judgeval/judges/base_judge.py +1 -2
  23. judgeval/judges/litellm_judge.py +33 -11
  24. judgeval/judges/mixture_of_judges.py +128 -78
  25. judgeval/judges/together_judge.py +22 -9
  26. judgeval/judges/utils.py +14 -5
  27. judgeval/judgment_client.py +259 -271
  28. judgeval/rules.py +169 -142
  29. judgeval/run_evaluation.py +462 -305
  30. judgeval/scorers/api_scorer.py +20 -11
  31. judgeval/scorers/exceptions.py +1 -0
  32. judgeval/scorers/judgeval_scorer.py +77 -58
  33. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +46 -15
  34. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +3 -2
  35. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +3 -2
  36. judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +12 -11
  37. judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +7 -5
  38. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +3 -2
  39. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +3 -2
  40. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +5 -2
  41. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +2 -1
  42. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +17 -8
  43. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +3 -2
  44. judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +3 -2
  45. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +3 -2
  46. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +3 -2
  47. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +8 -9
  48. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +4 -4
  49. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +5 -5
  50. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +5 -2
  51. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +9 -10
  52. judgeval/scorers/prompt_scorer.py +48 -37
  53. judgeval/scorers/score.py +86 -53
  54. judgeval/scorers/utils.py +11 -7
  55. judgeval/tracer/__init__.py +1 -1
  56. judgeval/utils/alerts.py +23 -12
  57. judgeval/utils/{data_utils.py → file_utils.py} +5 -9
  58. judgeval/utils/requests.py +29 -0
  59. judgeval/version_check.py +5 -2
  60. {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/METADATA +79 -135
  61. judgeval-0.0.46.dist-info/RECORD +69 -0
  62. judgeval-0.0.44.dist-info/RECORD +0 -68
  63. {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/WHEEL +0 -0
  64. {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/licenses/LICENSE.md +0 -0
judgeval/constants.py CHANGED
@@ -6,13 +6,15 @@ from enum import Enum
 import litellm
 import os
 
-class APIScorer(str, Enum):
+
+class APIScorer(str, Enum):
     """
     Collection of proprietary scorers implemented by Judgment.
 
     These are ready-made evaluation scorers that can be used to evaluate
     Examples via the Judgment API.
     """
+
     FAITHFULNESS = "faithfulness"
     ANSWER_RELEVANCY = "answer_relevancy"
     ANSWER_CORRECTNESS = "answer_correctness"
@@ -30,6 +32,7 @@ class APIScorer(str, Enum):
     TOOL_ORDER = "tool_order"
     CLASSIFIER = "classifier"
     TOOL_DEPENDENCY = "tool_dependency"
+
     @classmethod
     def _missing_(cls, value):
         # Handle case-insensitive lookup
@@ -37,7 +40,10 @@ class APIScorer(str, Enum):
         if member.value == value.lower():
             return member
 
-UNBOUNDED_SCORERS = set([APIScorer.COMPARISON])  # scorers whose scores are not bounded between 0-1
+
+UNBOUNDED_SCORERS = set(
+    [APIScorer.COMPARISON]
+)  # scorers whose scores are not bounded between 0-1
 
 ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
 # API URLs
@@ -52,87 +58,93 @@ JUDGMENT_DATASETS_PROJECT_STATS_API_URL = f"{ROOT_API}/datasets/fetch_stats_by_p
 JUDGMENT_DATASETS_INSERT_API_URL = f"{ROOT_API}/datasets/insert_examples/"
 JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
 JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_experiment_run/"
-JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_names/"
+JUDGMENT_EVAL_DELETE_API_URL = (
+    f"{ROOT_API}/delete_eval_results_by_project_and_run_names/"
+)
 JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
 JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/delete/"
 JUDGMENT_PROJECT_CREATE_API_URL = f"{ROOT_API}/projects/add/"
 JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
 JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
 JUDGMENT_TRACES_UPSERT_API_URL = f"{ROOT_API}/traces/upsert/"
-JUDGMENT_TRACES_USAGE_CHECK_API_URL = f"{ROOT_API}/traces/usage/check/"
-JUDGMENT_TRACES_USAGE_UPDATE_API_URL = f"{ROOT_API}/traces/usage/update/"
 JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
 JUDGMENT_TRACES_ADD_ANNOTATION_API_URL = f"{ROOT_API}/traces/add_annotation/"
 JUDGMENT_TRACES_SPANS_BATCH_API_URL = f"{ROOT_API}/traces/spans/batch/"
-JUDGMENT_TRACES_EVALUATION_RUNS_BATCH_API_URL = f"{ROOT_API}/traces/evaluation_runs/batch/"
+JUDGMENT_TRACES_EVALUATION_RUNS_BATCH_API_URL = (
+    f"{ROOT_API}/traces/evaluation_runs/batch/"
+)
 JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
 JUDGMENT_GET_EVAL_STATUS_API_URL = f"{ROOT_API}/get_evaluation_status/"
 # RabbitMQ
-RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
+RABBITMQ_HOST = os.getenv(
+    "RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com"
+)
 RABBITMQ_PORT = os.getenv("RABBITMQ_PORT", 5672)
 RABBITMQ_QUEUE = os.getenv("RABBITMQ_QUEUE", "task_queue")
 # Models
 LITELLM_SUPPORTED_MODELS = set(litellm.model_list)
 
 TOGETHER_SUPPORTED_MODELS = [
-    "meta-llama/Meta-Llama-3-70B-Instruct-Turbo",
-    "Qwen/Qwen2-VL-72B-Instruct",
-    "meta-llama/Llama-Vision-Free",
-    "Gryphe/MythoMax-L2-13b",
-    "Qwen/Qwen2.5-72B-Instruct-Turbo",
-    "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
-    "deepseek-ai/DeepSeek-R1",
-    "meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo",
-    "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
-    "google/gemma-2-27b-it",
-    "mistralai/Mistral-Small-24B-Instruct-2501",
-    "mistralai/Mixtral-8x22B-Instruct-v0.1",
-    "meta-llama/Meta-Llama-3-8B-Instruct-Turbo",
-    "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO",
-    "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-classifier",
-    "deepseek-ai/DeepSeek-V3",
-    "Qwen/Qwen2-72B-Instruct",
-    "meta-llama/Meta-Llama-3-8B-Instruct-Lite",
-    "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
-    "upstage/SOLAR-10.7B-Instruct-v1.0",
-    "togethercomputer/MoA-1",
-    "Qwen/QwQ-32B-Preview",
-    "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
-    "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
-    "mistralai/Mistral-7B-Instruct-v0.2",
-    "databricks/dbrx-instruct",
-    "meta-llama/Llama-3-8b-chat-hf",
-    "google/gemma-2b-it",
-    "meta-llama/Meta-Llama-3-70B-Instruct-Lite",
-    "google/gemma-2-9b-it",
-    "meta-llama/Llama-3.3-70B-Instruct-Turbo",
-    "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-p",
-    "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
-    "Gryphe/MythoMax-L2-13b-Lite",
-    "meta-llama/Llama-2-7b-chat-hf",
-    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
-    "meta-llama/Llama-2-13b-chat-hf",
-    "scb10x/scb10x-llama3-typhoon-v1-5-8b-instruct",
-    "scb10x/scb10x-llama3-typhoon-v1-5x-4f316",
-    "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
-    "Qwen/Qwen2.5-Coder-32B-Instruct",
-    "microsoft/WizardLM-2-8x22B",
-    "mistralai/Mistral-7B-Instruct-v0.3",
-    "scb10x/scb10x-llama3-1-typhoon2-60256",
-    "Qwen/Qwen2.5-7B-Instruct-Turbo",
-    "scb10x/scb10x-llama3-1-typhoon-18370",
-    "meta-llama/Llama-3.2-3B-Instruct-Turbo",
-    "meta-llama/Llama-3-70b-chat-hf",
-    "mistralai/Mixtral-8x7B-Instruct-v0.1",
-    "togethercomputer/MoA-1-Turbo",
-    "deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free",
-    "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
-    "mistralai/Mistral-7B-Instruct-v0.1"
+    "meta-llama/Meta-Llama-3-70B-Instruct-Turbo",
+    "Qwen/Qwen2-VL-72B-Instruct",
+    "meta-llama/Llama-Vision-Free",
+    "Gryphe/MythoMax-L2-13b",
+    "Qwen/Qwen2.5-72B-Instruct-Turbo",
+    "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+    "deepseek-ai/DeepSeek-R1",
+    "meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo",
+    "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
+    "google/gemma-2-27b-it",
+    "mistralai/Mistral-Small-24B-Instruct-2501",
+    "mistralai/Mixtral-8x22B-Instruct-v0.1",
+    "meta-llama/Meta-Llama-3-8B-Instruct-Turbo",
+    "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO",
+    "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-classifier",
+    "deepseek-ai/DeepSeek-V3",
+    "Qwen/Qwen2-72B-Instruct",
+    "meta-llama/Meta-Llama-3-8B-Instruct-Lite",
+    "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
+    "upstage/SOLAR-10.7B-Instruct-v1.0",
+    "togethercomputer/MoA-1",
+    "Qwen/QwQ-32B-Preview",
+    "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
+    "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
+    "mistralai/Mistral-7B-Instruct-v0.2",
+    "databricks/dbrx-instruct",
+    "meta-llama/Llama-3-8b-chat-hf",
+    "google/gemma-2b-it",
+    "meta-llama/Meta-Llama-3-70B-Instruct-Lite",
+    "google/gemma-2-9b-it",
+    "meta-llama/Llama-3.3-70B-Instruct-Turbo",
+    "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-p",
+    "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+    "Gryphe/MythoMax-L2-13b-Lite",
+    "meta-llama/Llama-2-7b-chat-hf",
+    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+    "meta-llama/Llama-2-13b-chat-hf",
+    "scb10x/scb10x-llama3-typhoon-v1-5-8b-instruct",
+    "scb10x/scb10x-llama3-typhoon-v1-5x-4f316",
+    "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
+    "Qwen/Qwen2.5-Coder-32B-Instruct",
+    "microsoft/WizardLM-2-8x22B",
+    "mistralai/Mistral-7B-Instruct-v0.3",
+    "scb10x/scb10x-llama3-1-typhoon2-60256",
+    "Qwen/Qwen2.5-7B-Instruct-Turbo",
+    "scb10x/scb10x-llama3-1-typhoon-18370",
+    "meta-llama/Llama-3.2-3B-Instruct-Turbo",
+    "meta-llama/Llama-3-70b-chat-hf",
+    "mistralai/Mixtral-8x7B-Instruct-v0.1",
+    "togethercomputer/MoA-1-Turbo",
+    "deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free",
+    "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
+    "mistralai/Mistral-7B-Instruct-v0.1",
 ]
 
 JUDGMENT_SUPPORTED_MODELS = {"osiris-large", "osiris-mini", "osiris"}
 
-ACCEPTABLE_MODELS = set(litellm.model_list) | set(TOGETHER_SUPPORTED_MODELS) | JUDGMENT_SUPPORTED_MODELS
+ACCEPTABLE_MODELS = (
+    set(litellm.model_list) | set(TOGETHER_SUPPORTED_MODELS) | JUDGMENT_SUPPORTED_MODELS
+)
 
 ## System settings
 MAX_WORKER_THREADS = 10
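
Side note on the `_missing_` hook retained above: because `APIScorer` subclasses `str, Enum`, the hook makes value lookup case-insensitive. A minimal standalone sketch of that behavior (trimmed to two members; not the packaged module):

    from enum import Enum

    class APIScorer(str, Enum):
        FAITHFULNESS = "faithfulness"
        ANSWER_RELEVANCY = "answer_relevancy"

        @classmethod
        def _missing_(cls, value):
            # Enum calls _missing_ when the raw value lookup fails;
            # retrying against value.lower() makes lookup case-insensitive.
            for member in cls:
                if member.value == value.lower():
                    return member

    assert APIScorer("FAITHFULNESS") is APIScorer.FAITHFULNESS
    assert APIScorer("Answer_Relevancy") is APIScorer.ANSWER_RELEVANCY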
judgeval/data/__init__.py CHANGED
@@ -15,5 +15,5 @@ __all__ = [
     "generate_scoring_result",
     "Trace",
     "TraceSpan",
-    "TraceUsage"
+    "TraceUsage",
 ]
judgeval/data/custom_example.py CHANGED
@@ -1,7 +1,8 @@
 from pydantic import BaseModel, Field
-from typing import Optional, Union, List, Dict, Any
+from typing import Optional, List, Dict, Any
 from uuid import uuid4
 
+
 class CustomExample(BaseModel):
     input: Optional[Dict[str, Any]] = None
     actual_output: Optional[Dict[str, Any]] = None
@@ -15,4 +16,4 @@ class CustomExample(BaseModel):
     example_id: str = Field(default_factory=lambda: str(uuid4()))
     example_index: Optional[int] = None
     timestamp: Optional[str] = None
-    trace_id: Optional[str] = None
\ No newline at end of file
+    trace_id: Optional[str] = None
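
For context, `CustomExample` is the dict-valued counterpart of `Example`; a construction sketch (field values invented, import path inferred from the file layout):

    from judgeval.data.custom_example import CustomExample

    ex = CustomExample(
        input={"question": "What is 2 + 2?"},
        actual_output={"answer": "4"},
    )
    print(ex.example_id)  # auto-filled by the uuid4 default_factory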
judgeval/data/datasets/dataset.py CHANGED
@@ -9,6 +9,8 @@ from typing import List, Union, Literal, Optional
 
 from judgeval.data import Example, Trace
 from judgeval.common.logger import debug, error, warning, info
+from judgeval.utils.file_utils import get_examples_from_yaml
+
 
 @dataclass
 class EvalDataset:
@@ -18,12 +20,14 @@ class EvalDataset:
     _id: Union[str, None] = field(default=None)
     judgment_api_key: str = field(default="")
     organization_id: str = field(default="")
-    def __init__(self,
-                 judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"),
-                 organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
-                 examples: Optional[List[Example]] = None,
-                 traces: Optional[List[Trace]] = None
-                 ):
+
+    def __init__(
+        self,
+        judgment_api_key: str = os.getenv("JUDGMENT_API_KEY", ""),
+        organization_id: str = os.getenv("JUDGMENT_ORG_ID", ""),
+        examples: Optional[List[Example]] = None,
+        traces: Optional[List[Trace]] = None,
+    ):
         if not judgment_api_key:
             warning("No judgment_api_key provided")
         self.examples = examples or []
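
One behavioral nuance of the signature change above: the `os.getenv(..., "")` defaults are captured once, when the module is imported, so the environment variables must be set beforehand. A sketch (key values are placeholders):

    import os

    # Set these before importing judgeval; default arguments are
    # evaluated when the __init__ signature is first executed.
    os.environ["JUDGMENT_API_KEY"] = "sk-placeholder"
    os.environ["JUDGMENT_ORG_ID"] = "org-placeholder"

    from judgeval.data.datasets.dataset import EvalDataset

    ds = EvalDataset()  # falls back to "" (and logs a warning) if the env vars are unset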
@@ -88,14 +92,14 @@ class EvalDataset:
         new_examples = [Example(**e) for e in examples]
         for e in new_examples:
             self.add_example(e)
-    
+
     def add_from_csv(
-        self, 
+        self,
         file_path: str,
         header_mapping: dict,
         primary_delimiter: str = ",",
-        secondary_delimiter: str = ";"
-    ) -> None:
+        secondary_delimiter: str = ";",
+    ) -> None:
         """
         Add Examples from a CSV file.
 
@@ -111,9 +115,9 @@ class EvalDataset:
             raise ModuleNotFoundError(
                 "Please install pandas to use this method. 'pip install pandas'"
            )
-        
+
         # Pandas naturally reads numbers in data files as ints, not strings (can lead to unexpected behavior)
-        df = pd.read_csv(file_path, dtype={'trace_id': str}, sep=primary_delimiter)
+        df = pd.read_csv(file_path, dtype={"trace_id": str}, sep=primary_delimiter)
         """
         The user should pass in a dict mapping from Judgment Example headers to their custom defined headers.
         Available headers for Example objects are as follows:
@@ -131,42 +135,55 @@ class EvalDataset:
         This can be adjusted using the `secondary_delimiter` parameter.
         """
         examples = []
-        
+
         def process_csv_row(value, header):
             """
             Maps a singular value in the CSV file to the appropriate type based on the header.
             If value exists and can be split into type List[*], we will split upon the user's provided secondary delimiter.
             """
             # check that the CSV value is not null for entry
-            null_replacement = dict() if header == 'additional_metadata' else None
-            if pd.isna(value) or value == '':
+            null_replacement = dict() if header == "additional_metadata" else None
+            if pd.isna(value) or value == "":
                 return null_replacement
             try:
-                value = ast.literal_eval(value) if header == 'additional_metadata' else str(value)
+                value = (
+                    ast.literal_eval(value)
+                    if header == "additional_metadata"
+                    else str(value)
+                )
             except (ValueError, SyntaxError):
                 value = str(value)
-            if header in ["context", "retrieval_context", "tools_called", "expected_tools"]:
+            if header in [
+                "context",
+                "retrieval_context",
+                "tools_called",
+                "expected_tools",
+            ]:
                 # attempt to split the value by the secondary delimiter
                 value = value.split(secondary_delimiter)
-            
+
             return value
-        
+
         for _, row in df.iterrows():
             data = {
-                header: process_csv_row(
-                    row[header_mapping[header]], header
-                )
+                header: process_csv_row(row[header_mapping[header]], header)
                 for header in header_mapping
             }
             if "example" in header_mapping and row[header_mapping["example"]]:
                 if "name" in header_mapping:
-                    data["name"] = row[header_mapping["name"]] if pd.notna(row[header_mapping["name"]]) else None
+                    data["name"] = (
+                        row[header_mapping["name"]]
+                        if pd.notna(row[header_mapping["name"]])
+                        else None
+                    )
                 # every Example has `input` and `actual_output` fields
                 if data["input"] is not None and data["actual_output"] is not None:
                     e = Example(**data)
                     examples.append(e)
                 else:
-                    raise ValueError("Every example must have an 'input' and 'actual_output' field.")
+                    raise ValueError(
+                        "Every example must have an 'input' and 'actual_output' field."
+                    )
 
         for e in examples:
             self.add_example(e)
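
To make the `header_mapping` contract in the docstring above concrete: keys are `Example` field names, values are the CSV's own column headers, and list-valued fields (`context`, `retrieval_context`, `tools_called`, `expected_tools`) are split on `secondary_delimiter`. A usage sketch with invented file and column names:

    from judgeval.data.datasets.dataset import EvalDataset

    ds = EvalDataset()
    ds.add_from_csv(
        "qa_runs.csv",  # hypothetical file
        header_mapping={
            "input": "question",            # Example field -> CSV column
            "actual_output": "model_answer",
            "retrieval_context": "chunks",  # "a;b;c" -> ["a", "b", "c"]
            "example": "is_example",        # rows flagged truthy become Examples
        },
        primary_delimiter=",",
        secondary_delimiter=";",
    )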
@@ -201,32 +218,25 @@ class EvalDataset:
             timestamp: "20241230_160117"
             trace_id: "123"
         """
-        try:
-            with open(file_path, "r") as file:
-                payload = yaml.safe_load(file)
-                if payload is None:
-                    raise ValueError("The YAML file is empty.")
-                examples = payload.get("examples", [])
-        except FileNotFoundError:
-            error(f"YAML file not found: {file_path}")
-            raise FileNotFoundError(f"The file {file_path} was not found.")
-        except yaml.YAMLError:
-            error(f"Invalid YAML file: {file_path}")
-            raise ValueError(f"The file {file_path} is not a valid YAML file.")
+        examples = get_examples_from_yaml(file_path)
 
         info(f"Added {len(examples)} examples from YAML")
-        new_examples = [Example(**e) for e in examples]
-        for e in new_examples:
+        for e in examples:
             self.add_example(e)
 
     def add_example(self, e: Example) -> None:
         self.examples.append(e)
         # TODO if we need to add rank, then we need to do it here
-    
+
     def add_trace(self, t: Trace) -> None:
         self.traces.append(t)
 
-    def save_as(self, file_type: Literal["json", "csv", "yaml"], dir_path: str, save_name: str = None) -> None:
+    def save_as(
+        self,
+        file_type: Literal["json", "csv", "yaml"],
+        dir_path: str,
+        save_name: str | None = None,
+    ) -> None:
         """
         Saves the dataset as a file. Save only the examples.
 
@@ -237,7 +247,11 @@ class EvalDataset:
         """
         if not os.path.exists(dir_path):
             os.makedirs(dir_path)
-        file_name = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") if save_name is None else save_name
+        file_name = (
+            datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+            if save_name is None
+            else save_name
+        )
         complete_path = os.path.join(dir_path, f"{file_name}.{file_type}")
         if file_type == "json":
             with open(complete_path, "w") as file:
@@ -251,12 +265,23 @@ class EvalDataset:
         elif file_type == "csv":
             with open(complete_path, "w", newline="") as file:
                 writer = csv.writer(file)
-                writer.writerow([
-                    "input", "actual_output", "expected_output", "context", \
-                    "retrieval_context", "additional_metadata", "tools_called", \
-                    "expected_tools", "name", "comments", "source_file", "example", \
-                    "trace_id"
-                ])
+                writer.writerow(
+                    [
+                        "input",
+                        "actual_output",
+                        "expected_output",
+                        "context",
+                        "retrieval_context",
+                        "additional_metadata",
+                        "tools_called",
+                        "expected_tools",
+                        "name",
+                        "comments",
+                        "source_file",
+                        "example",
+                        "trace_id",
+                    ]
+                )
                 for e in self.examples:
                     writer.writerow(
                         [
@@ -274,8 +299,7 @@ class EvalDataset:
                             True,  # Adding an Example
                         ]
                     )
-            
-
+
         elif file_type == "yaml":
             with open(complete_path, "w") as file:
                 yaml_data = {
@@ -300,14 +324,16 @@ class EvalDataset:
                 yaml.dump(yaml_data, file, default_flow_style=False)
         else:
             ACCEPTABLE_FILE_TYPES = ["json", "csv", "yaml"]
-            raise TypeError(f"Invalid file type: {file_type}. Please choose from {ACCEPTABLE_FILE_TYPES}")
-    
+            raise TypeError(
+                f"Invalid file type: {file_type}. Please choose from {ACCEPTABLE_FILE_TYPES}"
+            )
+
     def __iter__(self):
         return iter(self.examples)
-    
+
     def __len__(self):
         return len(self.examples)
-    
+
     def __str__(self):
         return (
             f"{self.__class__.__name__}("
@@ -316,4 +342,4 @@ class EvalDataset:
             f"_alias={self._alias}, "
             f"_id={self._id}"
             f")"
-        )
\ No newline at end of file
+        )
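
Taken together, `save_as` and the YAML loader (the method whose body changed above; its name is not shown in the hunk) give a save/load round trip. A sketch, assuming the loader is `add_from_yaml` and continuing the `ds` from the CSV example:

    ds.save_as("yaml", dir_path="./exports", save_name="smoke_test")
    # -> ./exports/smoke_test.yaml (timestamped name if save_name is omitted)

    restored = EvalDataset()
    restored.add_from_yaml("./exports/smoke_test.yaml")  # assumed method name
    assert len(restored) == len(ds)  # __len__ counts examples only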