judgeval 0.0.44__py3-none-any.whl → 0.0.46__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +5 -4
- judgeval/clients.py +6 -6
- judgeval/common/__init__.py +7 -2
- judgeval/common/exceptions.py +2 -3
- judgeval/common/logger.py +74 -49
- judgeval/common/s3_storage.py +30 -23
- judgeval/common/tracer.py +1273 -939
- judgeval/common/utils.py +416 -244
- judgeval/constants.py +73 -61
- judgeval/data/__init__.py +1 -1
- judgeval/data/custom_example.py +3 -2
- judgeval/data/datasets/dataset.py +80 -54
- judgeval/data/datasets/eval_dataset_client.py +131 -181
- judgeval/data/example.py +67 -43
- judgeval/data/result.py +11 -9
- judgeval/data/scorer_data.py +4 -2
- judgeval/data/tool.py +25 -16
- judgeval/data/trace.py +57 -29
- judgeval/data/trace_run.py +5 -11
- judgeval/evaluation_run.py +22 -82
- judgeval/integrations/langgraph.py +546 -184
- judgeval/judges/base_judge.py +1 -2
- judgeval/judges/litellm_judge.py +33 -11
- judgeval/judges/mixture_of_judges.py +128 -78
- judgeval/judges/together_judge.py +22 -9
- judgeval/judges/utils.py +14 -5
- judgeval/judgment_client.py +259 -271
- judgeval/rules.py +169 -142
- judgeval/run_evaluation.py +462 -305
- judgeval/scorers/api_scorer.py +20 -11
- judgeval/scorers/exceptions.py +1 -0
- judgeval/scorers/judgeval_scorer.py +77 -58
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +46 -15
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +12 -11
- judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +7 -5
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +5 -2
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +2 -1
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +17 -8
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +8 -9
- judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +5 -5
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +5 -2
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +9 -10
- judgeval/scorers/prompt_scorer.py +48 -37
- judgeval/scorers/score.py +86 -53
- judgeval/scorers/utils.py +11 -7
- judgeval/tracer/__init__.py +1 -1
- judgeval/utils/alerts.py +23 -12
- judgeval/utils/{data_utils.py → file_utils.py} +5 -9
- judgeval/utils/requests.py +29 -0
- judgeval/version_check.py +5 -2
- {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/METADATA +79 -135
- judgeval-0.0.46.dist-info/RECORD +69 -0
- judgeval-0.0.44.dist-info/RECORD +0 -68
- {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/WHEEL +0 -0
- {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/licenses/LICENSE.md +0 -0
judgeval/constants.py
CHANGED
@@ -6,13 +6,15 @@ from enum import Enum
 import litellm
 import os
 
-class APIScorer(str, Enum):
+
+class APIScorer(str, Enum):
     """
     Collection of proprietary scorers implemented by Judgment.
 
     These are ready-made evaluation scorers that can be used to evaluate
     Examples via the Judgment API.
     """
+
     FAITHFULNESS = "faithfulness"
     ANSWER_RELEVANCY = "answer_relevancy"
     ANSWER_CORRECTNESS = "answer_correctness"
@@ -30,6 +32,7 @@ class APIScorer(str, Enum):
     TOOL_ORDER = "tool_order"
     CLASSIFIER = "classifier"
     TOOL_DEPENDENCY = "tool_dependency"
+
     @classmethod
     def _missing_(cls, value):
         # Handle case-insensitive lookup
@@ -37,7 +40,10 @@ class APIScorer(str, Enum):
             if member.value == value.lower():
                 return member
 
-UNBOUNDED_SCORERS = set([APIScorer.COMPARISON])  # scorers whose scores are not bounded between 0-1
+
+UNBOUNDED_SCORERS = set(
+    [APIScorer.COMPARISON]
+)  # scorers whose scores are not bounded between 0-1
 
 ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
 # API URLs
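The `_missing_` hook above gives the enum case-insensitive lookup, and the new `UNBOUNDED_SCORERS` constant marks scorers whose scores are not confined to the 0-1 range. A minimal standalone sketch of the same pattern (a two-member reimplementation for illustration, not an import from judgeval):

from enum import Enum

class APIScorer(str, Enum):
    # Two members shown; the real enum lists every Judgment scorer.
    FAITHFULNESS = "faithfulness"
    COMPARISON = "comparison"

    @classmethod
    def _missing_(cls, value):
        # Enum calls _missing_ when plain value lookup fails; retrying with
        # value.lower() makes lookups like APIScorer("FAITHFULNESS") succeed.
        for member in cls:
            if member.value == value.lower():
                return member

UNBOUNDED_SCORERS = set([APIScorer.COMPARISON])  # scores may fall outside 0-1

assert APIScorer("FAITHFULNESS") is APIScorer.FAITHFULNESS
assert APIScorer.COMPARISON in UNBOUNDED_SCORERS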
@@ -52,87 +58,93 @@ JUDGMENT_DATASETS_PROJECT_STATS_API_URL = f"{ROOT_API}/datasets/fetch_stats_by_p
 JUDGMENT_DATASETS_INSERT_API_URL = f"{ROOT_API}/datasets/insert_examples/"
 JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
 JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_experiment_run/"
-JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_names/"
+JUDGMENT_EVAL_DELETE_API_URL = (
+    f"{ROOT_API}/delete_eval_results_by_project_and_run_names/"
+)
 JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
 JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/delete/"
 JUDGMENT_PROJECT_CREATE_API_URL = f"{ROOT_API}/projects/add/"
 JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
 JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
 JUDGMENT_TRACES_UPSERT_API_URL = f"{ROOT_API}/traces/upsert/"
-JUDGMENT_TRACES_USAGE_CHECK_API_URL = f"{ROOT_API}/traces/usage/check/"
-JUDGMENT_TRACES_USAGE_UPDATE_API_URL = f"{ROOT_API}/traces/usage/update/"
 JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
 JUDGMENT_TRACES_ADD_ANNOTATION_API_URL = f"{ROOT_API}/traces/add_annotation/"
 JUDGMENT_TRACES_SPANS_BATCH_API_URL = f"{ROOT_API}/traces/spans/batch/"
-JUDGMENT_TRACES_EVALUATION_RUNS_BATCH_API_URL = f"{ROOT_API}/traces/evaluation_runs/batch/"
+JUDGMENT_TRACES_EVALUATION_RUNS_BATCH_API_URL = (
+    f"{ROOT_API}/traces/evaluation_runs/batch/"
+)
 JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
 JUDGMENT_GET_EVAL_STATUS_API_URL = f"{ROOT_API}/get_evaluation_status/"
 # RabbitMQ
-RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
+RABBITMQ_HOST = os.getenv(
+    "RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com"
+)
 RABBITMQ_PORT = os.getenv("RABBITMQ_PORT", 5672)
 RABBITMQ_QUEUE = os.getenv("RABBITMQ_QUEUE", "task_queue")
 # Models
 LITELLM_SUPPORTED_MODELS = set(litellm.model_list)
 
 TOGETHER_SUPPORTED_MODELS = [
-    [53 removed lines: the previous TOGETHER_SUPPORTED_MODELS entries, not rendered in this diff view]
+    "meta-llama/Meta-Llama-3-70B-Instruct-Turbo",
+    "Qwen/Qwen2-VL-72B-Instruct",
+    "meta-llama/Llama-Vision-Free",
+    "Gryphe/MythoMax-L2-13b",
+    "Qwen/Qwen2.5-72B-Instruct-Turbo",
+    "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+    "deepseek-ai/DeepSeek-R1",
+    "meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo",
+    "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
+    "google/gemma-2-27b-it",
+    "mistralai/Mistral-Small-24B-Instruct-2501",
+    "mistralai/Mixtral-8x22B-Instruct-v0.1",
+    "meta-llama/Meta-Llama-3-8B-Instruct-Turbo",
+    "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO",
+    "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-classifier",
+    "deepseek-ai/DeepSeek-V3",
+    "Qwen/Qwen2-72B-Instruct",
+    "meta-llama/Meta-Llama-3-8B-Instruct-Lite",
+    "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
+    "upstage/SOLAR-10.7B-Instruct-v1.0",
+    "togethercomputer/MoA-1",
+    "Qwen/QwQ-32B-Preview",
+    "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
+    "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
+    "mistralai/Mistral-7B-Instruct-v0.2",
+    "databricks/dbrx-instruct",
+    "meta-llama/Llama-3-8b-chat-hf",
+    "google/gemma-2b-it",
+    "meta-llama/Meta-Llama-3-70B-Instruct-Lite",
+    "google/gemma-2-9b-it",
+    "meta-llama/Llama-3.3-70B-Instruct-Turbo",
+    "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-p",
+    "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+    "Gryphe/MythoMax-L2-13b-Lite",
+    "meta-llama/Llama-2-7b-chat-hf",
+    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+    "meta-llama/Llama-2-13b-chat-hf",
+    "scb10x/scb10x-llama3-typhoon-v1-5-8b-instruct",
+    "scb10x/scb10x-llama3-typhoon-v1-5x-4f316",
+    "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
+    "Qwen/Qwen2.5-Coder-32B-Instruct",
+    "microsoft/WizardLM-2-8x22B",
+    "mistralai/Mistral-7B-Instruct-v0.3",
+    "scb10x/scb10x-llama3-1-typhoon2-60256",
+    "Qwen/Qwen2.5-7B-Instruct-Turbo",
+    "scb10x/scb10x-llama3-1-typhoon-18370",
+    "meta-llama/Llama-3.2-3B-Instruct-Turbo",
+    "meta-llama/Llama-3-70b-chat-hf",
+    "mistralai/Mixtral-8x7B-Instruct-v0.1",
+    "togethercomputer/MoA-1-Turbo",
+    "deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free",
+    "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
+    "mistralai/Mistral-7B-Instruct-v0.1",
 ]
 
 JUDGMENT_SUPPORTED_MODELS = {"osiris-large", "osiris-mini", "osiris"}
 
-ACCEPTABLE_MODELS = set(litellm.model_list) | set(TOGETHER_SUPPORTED_MODELS) | JUDGMENT_SUPPORTED_MODELS
+ACCEPTABLE_MODELS = (
+    set(litellm.model_list) | set(TOGETHER_SUPPORTED_MODELS) | JUDGMENT_SUPPORTED_MODELS
+)
 
 ## System settings
 MAX_WORKER_THREADS = 10
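With `ACCEPTABLE_MODELS` built as an explicit set union, checking whether a model name is usable becomes a constant-time membership test. A hedged sketch of that check with stand-in values (`validate_model` is a hypothetical helper; the real sets come from litellm and the lists above):

LITELLM_SUPPORTED_MODELS = {"gpt-4o"}  # stand-in for set(litellm.model_list)
TOGETHER_SUPPORTED_MODELS = ["deepseek-ai/DeepSeek-V3"]  # excerpt of the list above
JUDGMENT_SUPPORTED_MODELS = {"osiris-large", "osiris-mini", "osiris"}

ACCEPTABLE_MODELS = (
    LITELLM_SUPPORTED_MODELS | set(TOGETHER_SUPPORTED_MODELS) | JUDGMENT_SUPPORTED_MODELS
)

def validate_model(name: str) -> None:
    # Hypothetical helper: fail fast on unsupported model names.
    if name not in ACCEPTABLE_MODELS:
        raise ValueError(f"Unsupported model: {name}")

validate_model("osiris")  # passes silently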
judgeval/data/__init__.py
CHANGED
judgeval/data/custom_example.py
CHANGED
@@ -1,7 +1,8 @@
 from pydantic import BaseModel, Field
-from typing import Optional, Dict, Any
+from typing import Optional, List, Dict, Any
 from uuid import uuid4
 
+
 class CustomExample(BaseModel):
     input: Optional[Dict[str, Any]] = None
     actual_output: Optional[Dict[str, Any]] = None
@@ -15,4 +16,4 @@ class CustomExample(BaseModel):
     example_id: str = Field(default_factory=lambda: str(uuid4()))
     example_index: Optional[int] = None
     timestamp: Optional[str] = None
-    trace_id: Optional[str] = None
\ No newline at end of file
+    trace_id: Optional[str] = None
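`CustomExample` is a pydantic model whose `input` and `actual_output` are free-form dictionaries, with `example_id` defaulting to a fresh UUID4. A hedged construction sketch (assumes judgeval 0.0.46 is installed; only fields visible in these hunks are used):

from judgeval.data.custom_example import CustomExample

ex = CustomExample(
    input={"question": "What is the capital of France?"},
    actual_output={"answer": "Paris"},
)
print(ex.example_id)  # auto-generated UUID4 string
print(ex.trace_id)    # None until associated with a trace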
judgeval/data/datasets/dataset.py
CHANGED
@@ -9,6 +9,8 @@ from typing import List, Union, Literal, Optional
 
 from judgeval.data import Example, Trace
 from judgeval.common.logger import debug, error, warning, info
+from judgeval.utils.file_utils import get_examples_from_yaml
+
 
 @dataclass
 class EvalDataset:
@@ -18,12 +20,14 @@ class EvalDataset:
     _id: Union[str, None] = field(default=None)
     judgment_api_key: str = field(default="")
     organization_id: str = field(default="")
-    [6 removed lines: the previous __init__ definition, not rendered in this diff view]
+
+    def __init__(
+        self,
+        judgment_api_key: str = os.getenv("JUDGMENT_API_KEY", ""),
+        organization_id: str = os.getenv("JUDGMENT_ORG_ID", ""),
+        examples: Optional[List[Example]] = None,
+        traces: Optional[List[Trace]] = None,
+    ):
         if not judgment_api_key:
             warning("No judgment_api_key provided")
         self.examples = examples or []
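The explicit `__init__` defaults credentials to the `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables and falls back to an empty list when `examples` is omitted (`self.examples = examples or []`). A hedged usage sketch with placeholder credentials; note that the env-var defaults are captured when the module is imported:

import os

# Set credentials before importing judgeval, since the signature defaults
# call os.getenv at definition time (placeholder values shown).
os.environ.setdefault("JUDGMENT_API_KEY", "sk-placeholder")
os.environ.setdefault("JUDGMENT_ORG_ID", "org-placeholder")

from judgeval.data import Example
from judgeval.data.datasets.dataset import EvalDataset

ds = EvalDataset(examples=[Example(input="What is 2 + 2?", actual_output="4")])
print(len(ds))  # 1, via the __len__ shown later in this diff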
@@ -88,14 +92,14 @@ class EvalDataset:
         new_examples = [Example(**e) for e in examples]
         for e in new_examples:
             self.add_example(e)
-
+
     def add_from_csv(
-        self,
+        self,
         file_path: str,
         header_mapping: dict,
         primary_delimiter: str = ",",
-        secondary_delimiter: str = ";"
-
+        secondary_delimiter: str = ";",
+    ) -> None:
         """
         Add Examples from a CSV file.
 
@@ -111,9 +115,9 @@ class EvalDataset:
             raise ModuleNotFoundError(
                 "Please install pandas to use this method. 'pip install pandas'"
             )
-
+
         # Pandas naturally reads numbers in data files as ints, not strings (can lead to unexpected behavior)
-        df = pd.read_csv(file_path, dtype={
+        df = pd.read_csv(file_path, dtype={"trace_id": str}, sep=primary_delimiter)
         """
         The user should pass in a dict mapping from Judgment Example headers to their custom defined headers.
         Available headers for Example objects are as follows:
@@ -131,42 +135,55 @@ class EvalDataset:
         This can be adjusted using the `secondary_delimiter` parameter.
         """
         examples = []
-
+
         def process_csv_row(value, header):
             """
             Maps a singular value in the CSV file to the appropriate type based on the header.
             If value exists and can be split into type List[*], we will split upon the user's provided secondary delimiter.
             """
             # check that the CSV value is not null for entry
-            null_replacement = dict() if header ==
-            if pd.isna(value) or value ==
+            null_replacement = dict() if header == "additional_metadata" else None
+            if pd.isna(value) or value == "":
                 return null_replacement
             try:
-                value =
+                value = (
+                    ast.literal_eval(value)
+                    if header == "additional_metadata"
+                    else str(value)
+                )
             except (ValueError, SyntaxError):
                 value = str(value)
-            if header in [
+            if header in [
+                "context",
+                "retrieval_context",
+                "tools_called",
+                "expected_tools",
+            ]:
                 # attempt to split the value by the secondary delimiter
                 value = value.split(secondary_delimiter)
-
+
             return value
-
+
         for _, row in df.iterrows():
             data = {
-                header: process_csv_row(
-                    row[header_mapping[header]], header
-                )
+                header: process_csv_row(row[header_mapping[header]], header)
                 for header in header_mapping
             }
             if "example" in header_mapping and row[header_mapping["example"]]:
                 if "name" in header_mapping:
-                    data["name"] =
+                    data["name"] = (
+                        row[header_mapping["name"]]
+                        if pd.notna(row[header_mapping["name"]])
+                        else None
+                    )
                 # every Example has `input` and `actual_output` fields
                 if data["input"] is not None and data["actual_output"] is not None:
                     e = Example(**data)
                     examples.append(e)
                 else:
-                    raise ValueError(
+                    raise ValueError(
+                        "Every example must have an 'input' and 'actual_output' field."
+                    )
 
         for e in examples:
             self.add_example(e)
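As reformatted above, `add_from_csv` maps Example field names to the caller's CSV column names through `header_mapping`, parses `additional_metadata` with `ast.literal_eval`, splits the list-valued fields (`context`, `retrieval_context`, `tools_called`, `expected_tools`) on `secondary_delimiter`, and raises a `ValueError` for rows missing `input` or `actual_output`. A hedged call sketch (requires pandas; file and column names invented):

from judgeval.data.datasets.dataset import EvalDataset

ds = EvalDataset()
ds.add_from_csv(
    "qa_pairs.csv",  # hypothetical file
    header_mapping={
        "input": "question",      # Example field -> CSV column
        "actual_output": "answer",
        "context": "sources",     # split on the secondary delimiter
        "example": "is_example",  # truthy cells mark rows to ingest as Examples
    },
    primary_delimiter=",",
    secondary_delimiter=";",
)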
@@ -201,32 +218,25 @@ class EvalDataset:
                 timestamp: "20241230_160117"
                 trace_id: "123"
         """
-        try:
-            with open(file_path, "r") as file:
-                payload = yaml.safe_load(file)
-                if payload is None:
-                    raise ValueError("The YAML file is empty.")
-            examples = payload.get("examples", [])
-        except FileNotFoundError:
-            error(f"YAML file not found: {file_path}")
-            raise FileNotFoundError(f"The file {file_path} was not found.")
-        except yaml.YAMLError:
-            error(f"Invalid YAML file: {file_path}")
-            raise ValueError(f"The file {file_path} is not a valid YAML file.")
+        examples = get_examples_from_yaml(file_path)
 
         info(f"Added {len(examples)} examples from YAML")
-
-        for e in new_examples:
+        for e in examples:
             self.add_example(e)
 
     def add_example(self, e: Example) -> None:
         self.examples.append(e)
         # TODO if we need to add rank, then we need to do it here
-
+
     def add_trace(self, t: Trace) -> None:
         self.traces.append(t)
 
-    def save_as(
+    def save_as(
+        self,
+        file_type: Literal["json", "csv", "yaml"],
+        dir_path: str,
+        save_name: str | None = None,
+    ) -> None:
         """
         Saves the dataset as a file. Save only the examples.
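The inline `yaml.safe_load` block, along with its `FileNotFoundError` and `yaml.YAMLError` handling, moves into `get_examples_from_yaml` in the renamed `judgeval/utils/file_utils.py`. The rewrite also fixes a latent bug: the old loop iterated over `new_examples`, a name belonging to the JSON path, instead of `examples`. Assuming the enclosing method is the dataset's YAML loader (its `def` line is outside this hunk), usage stays a one-liner:

from judgeval.data.datasets.dataset import EvalDataset

ds = EvalDataset()
# Expects a top-level `examples:` list in the YAML file, per the docstring
# excerpt above; the method name and path here are assumptions.
ds.add_from_yaml("datasets/qa_examples.yaml")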
@@ -237,7 +247,11 @@ class EvalDataset:
         """
         if not os.path.exists(dir_path):
             os.makedirs(dir_path)
-        file_name =
+        file_name = (
+            datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+            if save_name is None
+            else save_name
+        )
         complete_path = os.path.join(dir_path, f"{file_name}.{file_type}")
         if file_type == "json":
             with open(complete_path, "w") as file:
@@ -251,12 +265,23 @@ class EvalDataset:
         elif file_type == "csv":
             with open(complete_path, "w", newline="") as file:
                 writer = csv.writer(file)
-                writer.writerow(
-                    [5 removed lines: the previous header-row list, not rendered in this diff view]
+                writer.writerow(
+                    [
+                        "input",
+                        "actual_output",
+                        "expected_output",
+                        "context",
+                        "retrieval_context",
+                        "additional_metadata",
+                        "tools_called",
+                        "expected_tools",
+                        "name",
+                        "comments",
+                        "source_file",
+                        "example",
+                        "trace_id",
+                    ]
+                )
                 for e in self.examples:
                     writer.writerow(
                         [
@@ -274,8 +299,7 @@ class EvalDataset:
                             True,  # Adding an Example
                         ]
                     )
-
-
+
         elif file_type == "yaml":
             with open(complete_path, "w") as file:
                 yaml_data = {
@@ -300,14 +324,16 @@ class EvalDataset:
                 yaml.dump(yaml_data, file, default_flow_style=False)
         else:
             ACCEPTABLE_FILE_TYPES = ["json", "csv", "yaml"]
-            raise TypeError(
-
+            raise TypeError(
+                f"Invalid file type: {file_type}. Please choose from {ACCEPTABLE_FILE_TYPES}"
+            )
+
     def __iter__(self):
         return iter(self.examples)
-
+
     def __len__(self):
         return len(self.examples)
-
+
     def __str__(self):
         return (
             f"{self.__class__.__name__}("
@@ -316,4 +342,4 @@ class EvalDataset:
             f"_alias={self._alias}, "
             f"_id={self._id}"
             f")"
-        )
\ No newline at end of file
+        )
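`save_as` now takes its parameters on separate lines and accepts an optional `save_name`; when it is omitted, the file name falls back to a `%Y%m%d_%H%M%S` timestamp. A hedged round trip (directory and names invented):

from judgeval.data import Example
from judgeval.data.datasets.dataset import EvalDataset

ds = EvalDataset(examples=[Example(input="hi", actual_output="hello")])
ds.save_as("json", dir_path="exports")                 # exports/<timestamp>.json
ds.save_as("csv", dir_path="exports", save_name="qa")  # exports/qa.csv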