judgeval 0.0.12__py3-none-any.whl → 0.0.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +25 -2
- judgeval/constants.py +2 -0
- judgeval/data/datasets/dataset.py +2 -1
- judgeval/data/datasets/eval_dataset_client.py +106 -9
- judgeval/data/example.py +13 -5
- judgeval/judgment_client.py +29 -6
- judgeval/run_evaluation.py +16 -5
- {judgeval-0.0.12.dist-info → judgeval-0.0.13.dist-info}/METADATA +1 -1
- {judgeval-0.0.12.dist-info → judgeval-0.0.13.dist-info}/RECORD +11 -11
- {judgeval-0.0.12.dist-info → judgeval-0.0.13.dist-info}/WHEEL +0 -0
- {judgeval-0.0.12.dist-info → judgeval-0.0.13.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/tracer.py
CHANGED
@@ -199,10 +199,11 @@ class TraceManagerClient:
             JUDGMENT_TRACES_FETCH_API_URL,
             json={
                 "trace_id": trace_id,
-                "judgment_api_key": self.judgment_api_key,
+                # "judgment_api_key": self.judgment_api_key,
             },
             headers={
                 "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.judgment_api_key}"
             }
         )

@@ -225,6 +226,7 @@ class TraceManagerClient:
             json=trace_data,
             headers={
                 "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.judgment_api_key}"
             }
         )

@@ -248,6 +250,7 @@ class TraceManagerClient:
             },
             headers={
                 "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.judgment_api_key}"
             }
         )

@@ -263,11 +266,12 @@ class TraceManagerClient:
         response = requests.delete(
             JUDGMENT_TRACES_DELETE_API_URL,
             json={
-                "judgment_api_key": self.judgment_api_key,
+                # "judgment_api_key": self.judgment_api_key,
                 "trace_ids": trace_ids,
             },
             headers={
                 "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.judgment_api_key}"
             }
         )

@@ -576,6 +580,25 @@ class TraceClient:
 
         self.trace_manager_client.save_trace(trace_data, empty_save)
 
+
+        # Save trace data by making POST request to API
+        response = requests.post(
+            JUDGMENT_TRACES_SAVE_API_URL,
+            json=trace_data,
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.tracer.api_key}"  # Bearer token format
+            }
+        )
+
+        if response.status_code == HTTPStatus.BAD_REQUEST:
+            raise ValueError(f"Failed to save trace data: Check your Trace name for conflicts, set overwrite=True to overwrite existing traces: {response.text}")
+        elif response.status_code != HTTPStatus.OK:
+            raise ValueError(f"Failed to save trace data: {response.text}")
+
+        if not empty_save and "ui_results_url" in response.json():
+            rprint(f"\n🔍 You can view your trace data here: [rgb(106,0,255)]{response.json()['ui_results_url']}[/]\n")
+
         return self.trace_id, trace_data
 
     def delete(self):
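The common thread in these hunks is the auth scheme: 0.0.12 carried judgment_api_key inside the JSON body, while 0.0.13 sends it as a standard Bearer token (the old body key is only commented out, not removed). A minimal before/after sketch of the request shape, using an invented key and trace ID purely for illustration:

    import requests
    from judgeval.constants import JUDGMENT_TRACES_DELETE_API_URL

    api_key = "your-judgment-api-key"  # placeholder, not a real key
    trace_ids = ["abc123"]             # invented trace ID

    # 0.0.12 style: the key travels in the request body
    requests.delete(JUDGMENT_TRACES_DELETE_API_URL,
                    json={"judgment_api_key": api_key, "trace_ids": trace_ids},
                    headers={"Content-Type": "application/json"})

    # 0.0.13 style: the key travels as a Bearer token; the body carries only data
    requests.delete(JUDGMENT_TRACES_DELETE_API_URL,
                    json={"trace_ids": trace_ids},
                    headers={"Content-Type": "application/json",
                             "Authorization": f"Bearer {api_key}"})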
judgeval/constants.py
CHANGED
@@ -36,7 +36,9 @@ ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
 JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
 JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
 JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull/"
+JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
 JUDGMENT_DATASETS_PULL_ALL_API_URL = f"{ROOT_API}/datasets/get_all_stats/"
+JUDGMENT_DATASETS_EDIT_API_URL = f"{ROOT_API}/datasets/edit/"
 JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
 JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_eval_results/"
 JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_name/"
judgeval/data/datasets/dataset.py
CHANGED
@@ -162,7 +162,8 @@ class EvalDataset:
             "additional_metadata": ast.literal_eval(row["additional_metadata"]) if pd.notna(row["additional_metadata"]) else dict(),
             "tools_called": row["tools_called"].split(";") if pd.notna(row["tools_called"]) else [],
             "expected_tools": row["expected_tools"].split(";") if pd.notna(row["expected_tools"]) else [],
-            "trace_id": row["trace_id"] if pd.notna(row["trace_id"]) else None
+            "trace_id": row["trace_id"] if pd.notna(row["trace_id"]) else None,
+            "example_id": str(row["example_id"]) if pd.notna(row["example_id"]) else None
         }
         if row["example"]:
             data["name"] = row["name"] if pd.notna(row["name"]) else None
judgeval/data/datasets/eval_dataset_client.py
CHANGED
@@ -1,5 +1,5 @@
 
-from typing import Optional
+from typing import Optional, List
 import requests
 from rich.progress import Progress, SpinnerColumn, TextColumn
 

@@ -7,7 +7,9 @@ from judgeval.common.logger import debug, error, warning, info
 from judgeval.constants import (
     JUDGMENT_DATASETS_PUSH_API_URL,
     JUDGMENT_DATASETS_PULL_API_URL,
-    JUDGMENT_DATASETS_PULL_ALL_API_URL
+    JUDGMENT_DATASETS_PULL_ALL_API_URL,
+    JUDGMENT_DATASETS_EDIT_API_URL,
+    JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
 )
 from judgeval.data import Example
 from judgeval.data.datasets import EvalDataset

@@ -23,7 +25,7 @@ class EvalDatasetClient:
     def create_dataset(self) -> EvalDataset:
         return EvalDataset(judgment_api_key=self.judgment_api_key)
 
-    def push(self, dataset: EvalDataset, alias: str,overwrite: Optional[bool] = False) -> bool:
+    def push(self, dataset: EvalDataset, alias: str, overwrite: Optional[bool] = False) -> bool:
         debug(f"Pushing dataset with alias '{alias}' (overwrite={overwrite})")
         if overwrite:
             warning(f"Overwrite enabled for alias '{alias}'")

@@ -56,12 +58,16 @@ class EvalDatasetClient:
             "ground_truths": [g.to_dict() for g in dataset.ground_truths],
             "examples": [e.to_dict() for e in dataset.examples],
             "overwrite": overwrite,
-            "judgment_api_key": dataset.judgment_api_key
+            # "judgment_api_key": dataset.judgment_api_key
         }
         try:
             response = requests.post(
                 JUDGMENT_DATASETS_PUSH_API_URL,
-                json=content
+                json=content,
+                headers={
+                    "Content-Type": "application/json",
+                    "Authorization": f"Bearer {self.judgment_api_key}"
+                }
             )
             if response.status_code == 500:
                 error(f"Server error during push: {content.get('message')}")

@@ -115,13 +121,17 @@ class EvalDatasetClient:
         )
         request_body = {
             "alias": alias,
-            "judgment_api_key": self.judgment_api_key
+            # "judgment_api_key": self.judgment_api_key
         }
 
         try:
             response = requests.post(
                 JUDGMENT_DATASETS_PULL_API_URL,
-                json=request_body
+                json=request_body,
+                headers={
+                    "Content-Type": "application/json",
+                    "Authorization": f"Bearer {self.judgment_api_key}"
+                }
             )
             response.raise_for_status()
         except requests.exceptions.RequestException as e:

@@ -169,13 +179,17 @@ class EvalDatasetClient:
             total=100,
         )
         request_body = {
-            "judgment_api_key": self.judgment_api_key
+            # "judgment_api_key": self.judgment_api_key
         }
 
         try:
             response = requests.post(
                 JUDGMENT_DATASETS_PULL_ALL_API_URL,
-                json=request_body
+                json=request_body,
+                headers={
+                    "Content-Type": "application/json",
+                    "Authorization": f"Bearer {self.judgment_api_key}"
+                }
             )
             response.raise_for_status()
         except requests.exceptions.RequestException as e:

@@ -191,3 +205,86 @@ class EvalDatasetClient:
         )
 
         return payload
+
+    def edit_dataset(self, alias: str, examples: List[Example], ground_truths: List[GroundTruthExample]) -> bool:
+        """
+        Edits the dataset on Judgment platform by adding new examples and ground truths
+
+        Mock request:
+        {
+            "alias": alias,
+            "examples": [...],
+            "ground_truths": [...],
+            "judgment_api_key": self.judgment_api_key
+        }
+        """
+        with Progress(
+            SpinnerColumn(style="rgb(106,0,255)"),
+            TextColumn("[progress.description]{task.description}"),
+            transient=False,
+        ) as progress:
+            task_id = progress.add_task(
+                f"Editing dataset [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] on Judgment...",
+                total=100,
+            )
+
+            content = {
+                "alias": alias,
+                "examples": [e.to_dict() for e in examples],
+                "ground_truths": [g.to_dict() for g in ground_truths],
+                "judgment_api_key": self.judgment_api_key
+            }
+
+            try:
+                response = requests.post(
+                    JUDGMENT_DATASETS_EDIT_API_URL,
+                    json=content
+                )
+                response.raise_for_status()
+            except requests.exceptions.RequestException as e:
+                error(f"Error editing dataset: {str(e)}")
+                return False
+
+            info(f"Successfully edited dataset '{alias}'")
+            return True
+
+    def export_jsonl(self, alias: str) -> requests.Response:
+        """Export dataset in JSONL format from Judgment platform"""
+        debug(f"Exporting dataset with alias '{alias}' as JSONL")
+        with Progress(
+            SpinnerColumn(style="rgb(106,0,255)"),
+            TextColumn("[progress.description]{task.description}"),
+            transient=False,
+        ) as progress:
+            task_id = progress.add_task(
+                f"Exporting [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] as JSONL...",
+                total=100,
+            )
+            try:
+                response = requests.post(
+                    JUDGMENT_DATASETS_EXPORT_JSONL_API_URL,
+                    json={"alias": alias},
+                    headers={
+                        "Content-Type": "application/json",
+                        "Authorization": f"Bearer {self.judgment_api_key}"
+                    },
+                    stream=True
+                )
+                response.raise_for_status()
+            except requests.exceptions.HTTPError as err:
+                if err.response.status_code == 404:
+                    error(f"Dataset not found: {alias}")
+                else:
+                    error(f"HTTP error during export: {err}")
+                raise
+            except Exception as e:
+                error(f"Error during export: {str(e)}")
+                raise
+
+            info(f"Successfully exported dataset with alias '{alias}'")
+            progress.update(
+                task_id,
+                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
+            )
+
+        return response
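The two new client methods round out dataset management: edit_dataset appends examples and ground truths to an existing alias, and export_jsonl streams the dataset back down. A usage sketch, assuming `client` is an already-constructed EvalDatasetClient; the alias and output path are invented:

    # export_jsonl returns a streaming requests.Response (note stream=True above),
    # so the body can be written to disk chunk by chunk
    response = client.export_jsonl("my-dataset")
    with open("my-dataset.jsonl", "wb") as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)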
judgeval/data/example.py
CHANGED
@@ -4,9 +4,11 @@ Classes for representing examples in a dataset.
 
 
 from typing import TypeVar, Optional, Any, Dict, List
-from pydantic import BaseModel
+from uuid import uuid4
+from pydantic import BaseModel, Field
 from enum import Enum
 from datetime import datetime
+import time
 
 
 Input = TypeVar('Input')

@@ -33,15 +35,19 @@ class Example(BaseModel):
     tools_called: Optional[List[str]] = None
     expected_tools: Optional[List[str]] = None
     name: Optional[str] = None
-    example_id:
+    example_id: str = Field(default_factory=lambda: str(uuid4()))
+    example_index: Optional[int] = None
     timestamp: Optional[str] = None
     trace_id: Optional[str] = None
 
     def __init__(self, **data):
-
+        if 'example_id' not in data:
+            data['example_id'] = str(uuid4())
         # Set timestamp if not provided
-        if
-
+        if 'timestamp' not in data:
+            data['timestamp'] = datetime.now().strftime("%Y%m%d_%H%M%S")
+        super().__init__(**data)
+
 
     def to_dict(self):
         return {

@@ -55,6 +61,7 @@ class Example(BaseModel):
             "expected_tools": self.expected_tools,
             "name": self.name,
             "example_id": self.example_id,
+            "example_index": self.example_index,
             "timestamp": self.timestamp,
             "trace_id": self.trace_id
         }

@@ -71,6 +78,7 @@ class Example(BaseModel):
             f"expected_tools={self.expected_tools}, "
             f"name={self.name}, "
             f"example_id={self.example_id}, "
+            f"example_index={self.example_index}, "
             f"timestamp={self.timestamp}, "
             f"trace_id={self.trace_id})"
         )
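The net effect is that example_id is now always a UUID string assigned at construction, with run ordering tracked separately in the new example_index field. A quick behavioral sketch (field values invented):

    from judgeval.data import Example

    ex = Example(input="What is 2 + 2?", actual_output="4")
    print(ex.example_id)     # auto-generated UUID string
    print(ex.example_index)  # None until an evaluation run assigns a position
    print(ex.timestamp)      # auto-filled as "%Y%m%d_%H%M%S" if not provided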
judgeval/judgment_client.py
CHANGED
@@ -6,7 +6,7 @@ from typing import Optional, List, Dict, Any, Union
 import requests
 
 from judgeval.constants import ROOT_API
-from judgeval.data.datasets import EvalDataset, EvalDatasetClient
+from judgeval.data.datasets import EvalDataset, EvalDatasetClient, GroundTruthExample
 from judgeval.data import (
     ScoringResult,
     Example

@@ -164,6 +164,11 @@ class JudgmentClient:
         """
         return self.eval_dataset_client.pull_all_user_dataset_stats()
 
+    def edit_dataset(self, alias: str, examples: List[Example], ground_truths: List[GroundTruthExample]) -> bool:
+        """
+        Edits the dataset on Judgment platform by adding new examples and ground truths
+        """
+        return self.eval_dataset_client.edit_dataset(alias, examples, ground_truths)
 
     # Maybe add option where you can pass in the EvaluationRun object and it will pull the eval results from the backend
     def pull_eval(self, project_name: str, eval_run_name: str) -> List[Dict[str, Union[str, List[ScoringResult]]]]:

@@ -182,6 +187,10 @@ class JudgmentClient:
             eval_name=eval_run_name,
             judgment_api_key=self.judgment_api_key)
         eval_run = requests.post(JUDGMENT_EVAL_FETCH_API_URL,
+                                 headers={
+                                     "Content-Type": "application/json",
+                                     "Authorization": f"Bearer {self.judgment_api_key}"
+                                 },
                                 json=eval_run_request_body.model_dump())
         if eval_run.status_code != requests.codes.ok:
             raise ValueError(f"Error fetching eval results: {eval_run.json()}")

@@ -213,6 +222,7 @@ class JudgmentClient:
             json=eval_run_request_body.model_dump(),
             headers={
                 "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.judgment_api_key}"
             })
         if response.status_code != requests.codes.ok:
             raise ValueError(f"Error deleting eval results: {response.json()}")

@@ -235,6 +245,7 @@ class JudgmentClient:
             },
             headers={
                 "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.judgment_api_key}"
             })
         if response.status_code != requests.codes.ok:
             raise ValueError(f"Error deleting eval results: {response.json()}")

@@ -246,7 +257,11 @@ class JudgmentClient:
         """
         response = requests.post(
             f"{ROOT_API}/validate_api_key/",
-
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.judgment_api_key}",
+            },
+            json={}  # Empty body now
         )
         if response.status_code == 200:
             return True, response.json()

@@ -268,12 +283,16 @@ class JudgmentClient:
         """
         request_body = {
             "slug": slug,
-            "judgment_api_key": self.judgment_api_key
+            # "judgment_api_key": self.judgment_api_key
         }
 
         response = requests.post(
             f"{ROOT_API}/fetch_scorer/",
-            json=request_body
+            json=request_body,
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.judgment_api_key}"
+            }
         )
 
         if response.status_code == 500:

@@ -306,13 +325,17 @@ class JudgmentClient:
             "name": scorer.name,
             "conversation": scorer.conversation,
             "options": scorer.options,
-            "judgment_api_key": self.judgment_api_key,
+            # "judgment_api_key": self.judgment_api_key,
             "slug": slug
         }
 
         response = requests.post(
             f"{ROOT_API}/save_scorer/",
-            json=request_body
+            json=request_body,
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.judgment_api_key}"
+            }
         )
 
         if response.status_code == 500:
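JudgmentClient.edit_dataset is a thin wrapper over the dataset client method shown earlier. A hedged usage sketch: the constructor arguments are assumed (the diff does not show how JudgmentClient is instantiated), and the alias and example contents are invented:

    from judgeval.judgment_client import JudgmentClient
    from judgeval.data import Example

    client = JudgmentClient()  # constructor args assumed; presumably takes or loads a Judgment API key
    ok = client.edit_dataset(
        alias="my-dataset",                                     # invented alias
        examples=[Example(input="hi", actual_output="hello")],
        ground_truths=[],                                       # list of GroundTruthExample; empty for brevity
    )
    print("edited" if ok else "edit failed")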
judgeval/run_evaluation.py
CHANGED
@@ -47,7 +47,12 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
     try:
         # submit API request to execute evals
         payload = evaluation_run.model_dump(warnings=False)
-        response = requests.post(
+        response = requests.post(
+            JUDGMENT_EVAL_API_URL, headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {evaluation_run.judgment_api_key}"
+            },
+            json=payload)
         response_data = response.json()
     except Exception as e:
         error(f"Error: {e}")

@@ -151,6 +156,10 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
     try:
         response = requests.post(
             f"{ROOT_API}/eval-run-name-exists/",
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {judgment_api_key}"
+            },
             json={
                 "eval_name": eval_name,
                 "project_name": project_name,

@@ -188,6 +197,10 @@ def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run:
     try:
         res = requests.post(
             JUDGMENT_EVAL_LOG_API_URL,
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {evaluation_run.judgment_api_key}"
+            },
             json={
                 "results": [result.to_dict() for result in merged_results],
                 "judgment_api_key": evaluation_run.judgment_api_key,

@@ -247,12 +260,10 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
     # Set example IDs if not already set
     debug("Initializing examples with IDs and timestamps")
     for idx, example in enumerate(evaluation_run.examples):
-
-        example.example_id = idx
-        debug(f"Set example ID {idx} for input: {example.input[:50]}...")
+        example.example_index = idx  # Set numeric index
         example.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
         with example_logging_context(example.timestamp, example.example_id):
-            debug(f"Initialized example {example.example_id}")
+            debug(f"Initialized example {example.example_id} (index: {example.example_index})")
             debug(f"Input: {example.input}")
             debug(f"Actual output: {example.actual_output}")
             if example.expected_output:
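Note the change in ID semantics: run_eval no longer overwrites example_id with the loop index; the UUID from Example.__init__ survives, and the position goes into the new example_index field. A minimal sketch of the 0.0.13 loop behavior on a plain list of examples (inputs invented):

    from datetime import datetime
    from judgeval.data import Example

    examples = [Example(input="a", actual_output="b"),
                Example(input="c", actual_output="d")]
    for idx, example in enumerate(examples):
        example.example_index = idx  # per-run position
        example.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        # example.example_id keeps its UUID, so the same example stays traceable across runs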
{judgeval-0.0.12.dist-info → judgeval-0.0.13.dist-info}/RECORD
CHANGED
@@ -1,22 +1,22 @@
 judgeval/__init__.py,sha256=xiiG4CkeaOtey4fusCd9CBz0BVqzTIbV-K2EFIU0rUM,283
 judgeval/clients.py,sha256=Ns5ljrgPPXUMo7fSPJxO12H64lcPyKeQPIVG_RMi2cM,1162
-judgeval/constants.py,sha256=
+judgeval/constants.py,sha256=43hGesvBbX1uzc4KXvjLCVdd6cyZRMSnEJp11oA7h74,2794
 judgeval/evaluation_run.py,sha256=ev-IbL34SwRv8lwB4KHfYag1jYo6b049R8mmwNBqmnM,5923
-judgeval/judgment_client.py,sha256=
-judgeval/run_evaluation.py,sha256=
+judgeval/judgment_client.py,sha256=7vaarj6zXQmQ44m0cVCe72S4e92eZ4tK8sqNTnx4FLQ,14957
+judgeval/run_evaluation.py,sha256=vl6TcwJVH2jN60Gja1E1tPI3Jvv6YNeNMTDVTcWkqZY,20520
 judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
 judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
 judgeval/common/logger.py,sha256=QXN3UMymmKu2iMEMEgATLBnMDjGr_pE2iOSEFoICgg8,6092
-judgeval/common/tracer.py,sha256=
+judgeval/common/tracer.py,sha256=szU7mhyMIoG9EvPIb6dtxv7ix83WVuv7TtVX31FWMoQ,33582
 judgeval/common/utils.py,sha256=3WRyyX0tvnnj_VAVlEdtZrfzyWj6zfX04xdpCtE1m5Y,33736
 judgeval/data/__init__.py,sha256=YferxwmUqoBi18hrdgro0BD0h4pt20LAqISeUzGMcVU,474
 judgeval/data/api_example.py,sha256=vwWFbI6eJr5VgURCRbuSiMtEXLUbTCih_BcaqEBy-pg,4108
-judgeval/data/example.py,sha256=
+judgeval/data/example.py,sha256=r_ZA_Fq0k-1xSutSLURwj0-Ug0C_yQl4GQlqtDxbYT0,2771
 judgeval/data/result.py,sha256=8FIO-bFKPegZuByKRjA2_sumjb8oGWQ5ZeQ1RVz5z2w,4393
 judgeval/data/scorer_data.py,sha256=pYljblCPZrlMIv5Eg7R-clnmsqzUBAwokKjZpwa0DXE,3280
 judgeval/data/datasets/__init__.py,sha256=eO6ayeM_bTGwIt0eDSlTBIIBvXvIWRWWSfYZrZROPiQ,265
-judgeval/data/datasets/dataset.py,sha256=
-judgeval/data/datasets/eval_dataset_client.py,sha256=
+judgeval/data/datasets/dataset.py,sha256=6-BhkGiwMmvROxnFbefgzsFZy7wAaLi9kiTQ6p0h_xk,11928
+judgeval/data/datasets/eval_dataset_client.py,sha256=6wybPyt0BjrMQcOl3cTkcY3c9Pbm_K1fnpMiuzh56E4,11006
 judgeval/data/datasets/ground_truth.py,sha256=OTBs3VZe-Wp0vEXEsq14GPZHYtpWT16bhGQTycIvkKc,2057
 judgeval/data/datasets/utils.py,sha256=lQxyl7mevct7JcDSyIrU_8QOzT-EYPWEvoUiAeOdeek,2502
 judgeval/judges/__init__.py,sha256=tyQ5KY88Kp1Ctfw2IJxnVEpy8DnFCtmy04JdPOpp-As,339

@@ -78,7 +78,7 @@ judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarizat
 judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py,sha256=JUB3TMqS1OHr6PqpIGqkyiBNbyfUaw7lZuUATjU3_ek,168
 judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py,sha256=CYGRJY5EuyICYzHrmFdLykwXakX8AC7G3Bhj7p6szfY,5493
 judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
-judgeval-0.0.12.dist-info/METADATA,sha256=
-judgeval-0.0.12.dist-info/WHEEL,sha256=
-judgeval-0.0.12.dist-info/licenses/LICENSE.md,sha256=
-judgeval-0.0.12.dist-info/RECORD,,
+judgeval-0.0.13.dist-info/METADATA,sha256=6BQFdiV0_9Oe119PBqfNnmgX1ZWXjN-_6x0q9lVvnDg,1283
+judgeval-0.0.13.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.0.13.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.0.13.dist-info/RECORD,,
{judgeval-0.0.12.dist-info → judgeval-0.0.13.dist-info}/WHEEL
File without changes
{judgeval-0.0.12.dist-info → judgeval-0.0.13.dist-info}/licenses/LICENSE.md
File without changes