judgeval 0.2.0__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/api/api.py +38 -7
- judgeval/common/api/constants.py +9 -1
- judgeval/common/storage/s3_storage.py +2 -3
- judgeval/common/tracer/core.py +66 -32
- judgeval/common/tracer/otel_span_processor.py +4 -50
- judgeval/common/tracer/span_transformer.py +16 -10
- judgeval/common/utils.py +46 -38
- judgeval/constants.py +2 -0
- judgeval/data/example.py +9 -37
- judgeval/data/judgment_types.py +23 -45
- judgeval/data/result.py +8 -14
- judgeval/data/scripts/openapi_transform.py +5 -5
- judgeval/data/trace.py +3 -4
- judgeval/dataset.py +192 -0
- judgeval/evaluation_run.py +1 -0
- judgeval/judges/litellm_judge.py +2 -2
- judgeval/judges/mixture_of_judges.py +6 -6
- judgeval/judges/together_judge.py +6 -3
- judgeval/judgment_client.py +9 -71
- judgeval/run_evaluation.py +41 -9
- judgeval/scorers/score.py +11 -7
- judgeval/scorers/utils.py +3 -3
- judgeval/utils/file_utils.py +40 -25
- {judgeval-0.2.0.dist-info → judgeval-0.3.1.dist-info}/METADATA +10 -6
- {judgeval-0.2.0.dist-info → judgeval-0.3.1.dist-info}/RECORD +27 -29
- judgeval/data/datasets/__init__.py +0 -4
- judgeval/data/datasets/dataset.py +0 -341
- judgeval/data/datasets/eval_dataset_client.py +0 -214
- {judgeval-0.2.0.dist-info → judgeval-0.3.1.dist-info}/WHEEL +0 -0
- {judgeval-0.2.0.dist-info → judgeval-0.3.1.dist-info}/licenses/LICENSE.md +0 -0
@@ -1,214 +0,0 @@
|
|
1
|
-
from typing import Optional, List
|
2
|
-
from rich.progress import Progress, SpinnerColumn, TextColumn
|
3
|
-
from judgeval.common.logger import judgeval_logger
|
4
|
-
from judgeval.common.api import JudgmentApiClient
|
5
|
-
from judgeval.data import Example, Trace
|
6
|
-
from judgeval.data.datasets import EvalDataset
|
7
|
-
|
8
|
-
|
9
|
-
class EvalDatasetClient:
    """Client for pushing, pulling and deleting evaluation datasets on Judgment.

    Every remote operation is delegated to ``self.api_client``. The methods
    here wrap each call in a Rich progress spinner, log any failure before
    re-raising it, and copy response fields onto ``EvalDataset`` objects.
    """

    # Rich style strings shared by all progress messages.
    _ACCENT_STYLE = "rgb(106,0,255)"
    _SUCCESS_STYLE = "rgb(25,227,160)"

    def __init__(self, judgment_api_key: str, organization_id: str):
        """Build the underlying ``JudgmentApiClient`` from the credentials."""
        self.api_client = JudgmentApiClient(judgment_api_key, organization_id)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @classmethod
    def _new_progress(cls) -> Progress:
        """Return the spinner + description progress display used everywhere."""
        return Progress(
            SpinnerColumn(style=cls._ACCENT_STYLE),
            TextColumn("[progress.description]{task.description}"),
            transient=False,
        )

    @classmethod
    def _highlight(cls, name: str) -> str:
        """Return ``name`` quoted and wrapped in the accent colour.

        The name is escaped first so that square brackets in a user-supplied
        alias are shown literally instead of being parsed as Rich markup
        (an unmatched closing tag would otherwise raise ``MarkupError``).
        """
        # Local import: keeps this block self-contained without touching the
        # module-level import list.
        from rich.markup import escape

        style = cls._ACCENT_STYLE
        return f"[{style}]'{escape(str(name))}'[/{style}]"

    @classmethod
    def _mark_done(cls, progress: Progress, task_id) -> None:
        """Append a green "Done!" to the current description of ``task_id``."""
        current = progress.tasks[task_id].description
        progress.update(
            task_id,
            description=f"{current} [{cls._SUCCESS_STYLE}]Done!",
        )

    @staticmethod
    def _first_present(payload: dict, *keys: str):
        """Return the first non-``None`` value found under ``keys``, else ``None``."""
        for key in keys:
            value = payload.get(key)
            if value is not None:
                return value
        return None

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def create_dataset(self) -> EvalDataset:
        """Return a new, empty ``EvalDataset`` bound to this client's API key."""
        return EvalDataset(judgment_api_key=self.api_client.api_key)

    def push(
        self,
        dataset: EvalDataset,
        alias: str,
        project_name: str,
        overwrite: Optional[bool] = False,
    ) -> bool:
        """Push ``dataset`` to the Judgment platform under ``alias``.

        Mock request:
        dataset = {
            "alias": alias,
            "examples": [...],
            "overwrite": overwrite
        } ==>
        {
            "_alias": alias,
            "_id": "..."  # ID of the dataset
        }

        On success the returned ``_alias`` and ``_id`` are stored on
        ``dataset``.

        Args:
            dataset: Dataset whose ``examples`` and ``traces`` are uploaded.
            alias: Name to store the dataset under.
            project_name: Project the dataset belongs to.
            overwrite: Forwarded to the API; ``None`` is sent as ``False``.

        Returns:
            ``True`` when the push completed.

        Raises:
            Exception: Whatever ``api_client.push_dataset`` raised; the error
                is logged and then re-raised unchanged.
        """
        if overwrite:
            judgeval_logger.warning(f"Overwrite enabled for alias '{alias}'")

        with self._new_progress() as progress:
            task_id = progress.add_task(
                f"Pushing {self._highlight(alias)} to Judgment...",
                total=100,
            )
            try:
                payload = self.api_client.push_dataset(
                    dataset_alias=alias,
                    project_name=project_name,
                    examples=[e.to_dict() for e in dataset.examples],
                    traces=[t.model_dump() for t in dataset.traces],
                    overwrite=overwrite or False,
                )
            except Exception as e:
                judgeval_logger.error(f"Error during push: {e}")
                raise
            dataset._alias = payload.get("_alias")
            dataset._id = payload.get("_id")
            self._mark_done(progress, task_id)
        return True

    def append_examples(
        self, alias: str, examples: List[Example], project_name: str
    ) -> bool:
        """Append ``examples`` to the existing dataset ``alias``.

        Mock request:
        dataset = {
            "alias": alias,
            "examples": [...],
            "project_name": project_name
        } ==>
        {
            "_alias": alias,
            "_id": "..."  # ID of the dataset
        }

        Args:
            alias: Name of the dataset to extend.
            examples: Examples to add; each is serialised with ``to_dict()``.
            project_name: Project the dataset belongs to.

        Returns:
            ``True`` when the append completed.

        Raises:
            Exception: Whatever ``api_client.append_examples`` raised; the
                error is logged and then re-raised unchanged.
        """
        with self._new_progress() as progress:
            task_id = progress.add_task(
                f"Appending {self._highlight(alias)} to Judgment...",
                total=100,
            )
            try:
                self.api_client.append_examples(
                    dataset_alias=alias,
                    project_name=project_name,
                    examples=[e.to_dict() for e in examples],
                )
            except Exception as e:
                judgeval_logger.error(f"Error during append: {e}")
                raise
            self._mark_done(progress, task_id)
        return True

    def pull(self, alias: str, project_name: str) -> EvalDataset:
        """Pull the dataset ``alias`` from the Judgment platform.

        Mock request:
        {
            "alias": alias,
            "project_name": project_name
        }
        ==>
        {
            "examples": [...],
            "_alias": alias,
            "_id": "..."  # ID of the dataset
        }

        Args:
            alias: Name of the dataset to fetch.
            project_name: Project the dataset belongs to.

        Returns:
            A new ``EvalDataset`` populated with the payload's ``examples``
            and ``traces`` (each defaulting to an empty list) plus the
            dataset alias and id.

        Raises:
            Exception: Whatever ``api_client.pull_dataset`` raised; the error
                is logged and then re-raised unchanged.
        """
        dataset = self.create_dataset()

        with self._new_progress() as progress:
            task_id = progress.add_task(
                f"Pulling {self._highlight(alias)} from Judgment...",
                total=100,
            )
            try:
                payload = self.api_client.pull_dataset(
                    dataset_alias=alias,
                    project_name=project_name,
                )
            except Exception as e:
                judgeval_logger.error(f"Error pulling dataset: {e}")
                raise
            dataset.examples = [Example(**e) for e in payload.get("examples", [])]
            dataset.traces = [Trace(**t) for t in payload.get("traces", [])]
            # NOTE(review): this code historically read "alias"/"id", while the
            # documented response (and push) use "_alias"/"_id". Accept either,
            # preferring the historical keys - confirm against the real API.
            dataset._alias = self._first_present(payload, "alias", "_alias")
            dataset._id = self._first_present(payload, "id", "_id")
            self._mark_done(progress, task_id)

        return dataset

    def delete(self, alias: str, project_name: str) -> bool:
        """Delete the dataset ``alias`` from the Judgment platform.

        Args:
            alias: Name of the dataset to delete.
            project_name: Project the dataset belongs to.

        Returns:
            ``True`` when the delete call completed.

        Raises:
            Exception: Whatever ``api_client.delete_dataset`` raised; the
                error is logged and then re-raised unchanged.
        """
        with self._new_progress() as progress:
            task_id = progress.add_task(
                f"Deleting {self._highlight(alias)} from Judgment...",
                total=100,
            )
            try:
                self.api_client.delete_dataset(
                    dataset_alias=alias,
                    project_name=project_name,
                )
            except Exception as e:
                judgeval_logger.error(f"Error deleting dataset: {e}")
                raise
            # Report completion the same way every other operation does.
            self._mark_done(progress, task_id)

        return True

    def pull_project_dataset_stats(self, project_name: str) -> dict:
        """Pull per-dataset statistics for ``project_name``.

        Mock request:
        {
            "project_name": project_name
        }
        ==>
        {
            "test_dataset_1": {"examples_count": len(dataset1.examples)},
            "test_dataset_2": {"examples_count": len(dataset2.examples)},
            ...
        }

        Args:
            project_name: Project whose dataset statistics are requested.

        Returns:
            The payload returned by ``api_client.get_project_dataset_stats``,
            unmodified.

        Raises:
            Exception: Whatever ``api_client.get_project_dataset_stats``
                raised; the error is logged and then re-raised unchanged.
        """
        with self._new_progress() as progress:
            task_id = progress.add_task(
                # NOTE(review): the quoted "' datasets'" looks like a leftover
                # placeholder; kept verbatim because the intent is unclear.
                "Pulling [rgb(106,0,255)]' datasets'[/rgb(106,0,255)] from Judgment...",
                total=100,
            )
            try:
                payload = self.api_client.get_project_dataset_stats(project_name)
            except Exception as e:
                judgeval_logger.error(f"Error pulling dataset: {e}")
                raise
            self._mark_done(progress, task_id)

        return payload
|
File without changes
|
File without changes
|