judgeval 0.0.12__py3-none-any.whl → 0.0.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +19 -7
- judgeval/constants.py +2 -0
- judgeval/data/datasets/dataset.py +5 -3
- judgeval/data/datasets/eval_dataset_client.py +114 -10
- judgeval/data/example.py +20 -5
- judgeval/evaluation_run.py +1 -0
- judgeval/judgment_client.py +40 -11
- judgeval/run_evaluation.py +23 -8
- {judgeval-0.0.12.dist-info → judgeval-0.0.14.dist-info}/METADATA +1 -1
- {judgeval-0.0.12.dist-info → judgeval-0.0.14.dist-info}/RECORD +12 -12
- {judgeval-0.0.12.dist-info → judgeval-0.0.14.dist-info}/WHEEL +0 -0
- {judgeval-0.0.12.dist-info → judgeval-0.0.14.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/tracer.py
CHANGED
@@ -188,8 +188,9 @@ class TraceManagerClient:
|
|
188
188
|
- Saving a trace
|
189
189
|
- Deleting a trace
|
190
190
|
"""
|
191
|
-
def __init__(self, judgment_api_key: str):
|
191
|
+
def __init__(self, judgment_api_key: str, organization_id: str):
|
192
192
|
self.judgment_api_key = judgment_api_key
|
193
|
+
self.organization_id = organization_id
|
193
194
|
|
194
195
|
def fetch_trace(self, trace_id: str):
|
195
196
|
"""
|
@@ -199,10 +200,11 @@ class TraceManagerClient:
|
|
199
200
|
JUDGMENT_TRACES_FETCH_API_URL,
|
200
201
|
json={
|
201
202
|
"trace_id": trace_id,
|
202
|
-
"judgment_api_key": self.judgment_api_key,
|
203
203
|
},
|
204
204
|
headers={
|
205
205
|
"Content-Type": "application/json",
|
206
|
+
"Authorization": f"Bearer {self.judgment_api_key}",
|
207
|
+
"X-Organization-Id": self.organization_id
|
206
208
|
}
|
207
209
|
)
|
208
210
|
|
@@ -225,6 +227,8 @@ class TraceManagerClient:
|
|
225
227
|
json=trace_data,
|
226
228
|
headers={
|
227
229
|
"Content-Type": "application/json",
|
230
|
+
"Authorization": f"Bearer {self.judgment_api_key}",
|
231
|
+
"X-Organization-Id": self.organization_id
|
228
232
|
}
|
229
233
|
)
|
230
234
|
|
@@ -243,11 +247,12 @@ class TraceManagerClient:
|
|
243
247
|
response = requests.delete(
|
244
248
|
JUDGMENT_TRACES_DELETE_API_URL,
|
245
249
|
json={
|
246
|
-
"judgment_api_key": self.judgment_api_key,
|
247
250
|
"trace_ids": [trace_id],
|
248
251
|
},
|
249
252
|
headers={
|
250
253
|
"Content-Type": "application/json",
|
254
|
+
"Authorization": f"Bearer {self.judgment_api_key}",
|
255
|
+
"X-Organization-Id": self.organization_id
|
251
256
|
}
|
252
257
|
)
|
253
258
|
|
@@ -263,11 +268,12 @@ class TraceManagerClient:
|
|
263
268
|
response = requests.delete(
|
264
269
|
JUDGMENT_TRACES_DELETE_API_URL,
|
265
270
|
json={
|
266
|
-
"judgment_api_key": self.judgment_api_key,
|
267
271
|
"trace_ids": trace_ids,
|
268
272
|
},
|
269
273
|
headers={
|
270
274
|
"Content-Type": "application/json",
|
275
|
+
"Authorization": f"Bearer {self.judgment_api_key}",
|
276
|
+
"X-Organization-Id": self.organization_id
|
271
277
|
}
|
272
278
|
)
|
273
279
|
|
@@ -290,7 +296,7 @@ class TraceClient:
|
|
290
296
|
self.span_type = None
|
291
297
|
self._current_span: Optional[TraceEntry] = None
|
292
298
|
self.overwrite = overwrite
|
293
|
-
self.trace_manager_client = TraceManagerClient(tracer.api_key) # Manages DB operations for trace data
|
299
|
+
self.trace_manager_client = TraceManagerClient(tracer.api_key, tracer.organization_id) # Manages DB operations for trace data
|
294
300
|
|
295
301
|
@contextmanager
|
296
302
|
def span(self, name: str, span_type: SpanType = "span"):
|
@@ -367,6 +373,7 @@ class TraceClient:
|
|
367
373
|
raise ValueError(f"Failed to load scorers: {str(e)}")
|
368
374
|
|
369
375
|
eval_run = EvaluationRun(
|
376
|
+
organization_id=self.tracer.organization_id,
|
370
377
|
log_results=log_results,
|
371
378
|
project_name=self.project_name,
|
372
379
|
eval_name=f"{self.name.capitalize()}-"
|
@@ -542,7 +549,6 @@ class TraceClient:
|
|
542
549
|
# Create trace document
|
543
550
|
trace_data = {
|
544
551
|
"trace_id": self.trace_id,
|
545
|
-
"api_key": self.tracer.api_key,
|
546
552
|
"name": self.name,
|
547
553
|
"project_name": self.project_name,
|
548
554
|
"created_at": datetime.fromtimestamp(self.start_time).isoformat(),
|
@@ -564,6 +570,8 @@ class TraceClient:
|
|
564
570
|
channel = connection.channel()
|
565
571
|
|
566
572
|
channel.queue_declare(queue=RABBITMQ_QUEUE, durable=True)
|
573
|
+
trace_data["judgment_api_key"] = self.tracer.api_key
|
574
|
+
trace_data["organization_id"] = self.tracer.organization_id
|
567
575
|
|
568
576
|
channel.basic_publish(
|
569
577
|
exchange='',
|
@@ -589,14 +597,18 @@ class Tracer:
|
|
589
597
|
cls._instance = super(Tracer, cls).__new__(cls)
|
590
598
|
return cls._instance
|
591
599
|
|
592
|
-
def __init__(self, api_key: str = os.getenv("JUDGMENT_API_KEY"), project_name: str = "default_project"):
|
600
|
+
def __init__(self, api_key: str = os.getenv("JUDGMENT_API_KEY"), project_name: str = "default_project", organization_id: str = os.getenv("ORGANIZATION_ID")):
|
593
601
|
if not hasattr(self, 'initialized'):
|
594
602
|
if not api_key:
|
595
603
|
raise ValueError("Tracer must be configured with a Judgment API key")
|
596
604
|
|
605
|
+
if not organization_id:
|
606
|
+
raise ValueError("Tracer must be configured with an Organization ID")
|
607
|
+
|
597
608
|
self.api_key: str = api_key
|
598
609
|
self.project_name: str = project_name
|
599
610
|
self.client: JudgmentClient = JudgmentClient(judgment_api_key=api_key)
|
611
|
+
self.organization_id: str = organization_id
|
600
612
|
self.depth: int = 0
|
601
613
|
self._current_trace: Optional[str] = None
|
602
614
|
self.initialized: bool = True
|
judgeval/constants.py
CHANGED
@@ -36,7 +36,9 @@ ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
|
|
36
36
|
JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
|
37
37
|
JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
|
38
38
|
JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull/"
|
39
|
+
JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
|
39
40
|
JUDGMENT_DATASETS_PULL_ALL_API_URL = f"{ROOT_API}/datasets/get_all_stats/"
|
41
|
+
JUDGMENT_DATASETS_EDIT_API_URL = f"{ROOT_API}/datasets/edit/"
|
40
42
|
JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
|
41
43
|
JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_eval_results/"
|
42
44
|
JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_name/"
|
@@ -17,9 +17,10 @@ class EvalDataset:
|
|
17
17
|
_alias: Union[str, None] = field(default=None)
|
18
18
|
_id: Union[str, None] = field(default=None)
|
19
19
|
judgment_api_key: str = field(default="")
|
20
|
-
|
20
|
+
organization_id: str = field(default="")
|
21
21
|
def __init__(self,
|
22
22
|
judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"),
|
23
|
+
organization_id: str = os.getenv("ORGANIZATION_ID"),
|
23
24
|
ground_truths: List[GroundTruthExample] = [],
|
24
25
|
examples: List[Example] = [],
|
25
26
|
):
|
@@ -31,7 +32,7 @@ class EvalDataset:
|
|
31
32
|
self._alias = None
|
32
33
|
self._id = None
|
33
34
|
self.judgment_api_key = judgment_api_key
|
34
|
-
|
35
|
+
self.organization_id = organization_id
|
35
36
|
|
36
37
|
def add_from_json(self, file_path: str) -> None:
|
37
38
|
debug(f"Loading dataset from JSON file: {file_path}")
|
@@ -162,7 +163,8 @@ class EvalDataset:
|
|
162
163
|
"additional_metadata": ast.literal_eval(row["additional_metadata"]) if pd.notna(row["additional_metadata"]) else dict(),
|
163
164
|
"tools_called": row["tools_called"].split(";") if pd.notna(row["tools_called"]) else [],
|
164
165
|
"expected_tools": row["expected_tools"].split(";") if pd.notna(row["expected_tools"]) else [],
|
165
|
-
"trace_id": row["trace_id"] if pd.notna(row["trace_id"]) else None
|
166
|
+
"trace_id": row["trace_id"] if pd.notna(row["trace_id"]) else None,
|
167
|
+
"example_id": str(row["example_id"]) if pd.notna(row["example_id"]) else None
|
166
168
|
}
|
167
169
|
if row["example"]:
|
168
170
|
data["name"] = row["name"] if pd.notna(row["name"]) else None
|
@@ -1,5 +1,5 @@
|
|
1
1
|
|
2
|
-
from typing import Optional
|
2
|
+
from typing import Optional, List
|
3
3
|
import requests
|
4
4
|
from rich.progress import Progress, SpinnerColumn, TextColumn
|
5
5
|
|
@@ -7,7 +7,9 @@ from judgeval.common.logger import debug, error, warning, info
|
|
7
7
|
from judgeval.constants import (
|
8
8
|
JUDGMENT_DATASETS_PUSH_API_URL,
|
9
9
|
JUDGMENT_DATASETS_PULL_API_URL,
|
10
|
-
JUDGMENT_DATASETS_PULL_ALL_API_URL
|
10
|
+
JUDGMENT_DATASETS_PULL_ALL_API_URL,
|
11
|
+
JUDGMENT_DATASETS_EDIT_API_URL,
|
12
|
+
JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
|
11
13
|
)
|
12
14
|
from judgeval.data import Example
|
13
15
|
from judgeval.data.datasets import EvalDataset
|
@@ -17,13 +19,14 @@ from judgeval.data.datasets.ground_truth import GroundTruthExample
|
|
17
19
|
|
18
20
|
|
19
21
|
class EvalDatasetClient:
|
20
|
-
def __init__(self, judgment_api_key: str):
|
22
|
+
def __init__(self, judgment_api_key: str, organization_id: str):
|
21
23
|
self.judgment_api_key = judgment_api_key
|
24
|
+
self.organization_id = organization_id
|
22
25
|
|
23
26
|
def create_dataset(self) -> EvalDataset:
|
24
27
|
return EvalDataset(judgment_api_key=self.judgment_api_key)
|
25
28
|
|
26
|
-
def push(self, dataset: EvalDataset, alias: str,overwrite: Optional[bool] = False) -> bool:
|
29
|
+
def push(self, dataset: EvalDataset, alias: str, overwrite: Optional[bool] = False) -> bool:
|
27
30
|
debug(f"Pushing dataset with alias '{alias}' (overwrite={overwrite})")
|
28
31
|
if overwrite:
|
29
32
|
warning(f"Overwrite enabled for alias '{alias}'")
|
@@ -56,12 +59,16 @@ class EvalDatasetClient:
|
|
56
59
|
"ground_truths": [g.to_dict() for g in dataset.ground_truths],
|
57
60
|
"examples": [e.to_dict() for e in dataset.examples],
|
58
61
|
"overwrite": overwrite,
|
59
|
-
"judgment_api_key": dataset.judgment_api_key
|
60
62
|
}
|
61
63
|
try:
|
62
64
|
response = requests.post(
|
63
65
|
JUDGMENT_DATASETS_PUSH_API_URL,
|
64
|
-
json=content
|
66
|
+
json=content,
|
67
|
+
headers={
|
68
|
+
"Content-Type": "application/json",
|
69
|
+
"Authorization": f"Bearer {self.judgment_api_key}",
|
70
|
+
"X-Organization-Id": self.organization_id
|
71
|
+
}
|
65
72
|
)
|
66
73
|
if response.status_code == 500:
|
67
74
|
error(f"Server error during push: {content.get('message')}")
|
@@ -115,13 +122,17 @@ class EvalDatasetClient:
|
|
115
122
|
)
|
116
123
|
request_body = {
|
117
124
|
"alias": alias,
|
118
|
-
"judgment_api_key": self.judgment_api_key
|
119
125
|
}
|
120
126
|
|
121
127
|
try:
|
122
128
|
response = requests.post(
|
123
129
|
JUDGMENT_DATASETS_PULL_API_URL,
|
124
|
-
json=request_body
|
130
|
+
json=request_body,
|
131
|
+
headers={
|
132
|
+
"Content-Type": "application/json",
|
133
|
+
"Authorization": f"Bearer {self.judgment_api_key}",
|
134
|
+
"X-Organization-Id": self.organization_id
|
135
|
+
}
|
125
136
|
)
|
126
137
|
response.raise_for_status()
|
127
138
|
except requests.exceptions.RequestException as e:
|
@@ -169,13 +180,17 @@ class EvalDatasetClient:
|
|
169
180
|
total=100,
|
170
181
|
)
|
171
182
|
request_body = {
|
172
|
-
"judgment_api_key": self.judgment_api_key
|
173
183
|
}
|
174
184
|
|
175
185
|
try:
|
176
186
|
response = requests.post(
|
177
187
|
JUDGMENT_DATASETS_PULL_ALL_API_URL,
|
178
|
-
json=request_body
|
188
|
+
json=request_body,
|
189
|
+
headers={
|
190
|
+
"Content-Type": "application/json",
|
191
|
+
"Authorization": f"Bearer {self.judgment_api_key}",
|
192
|
+
"X-Organization-Id": self.organization_id
|
193
|
+
}
|
179
194
|
)
|
180
195
|
response.raise_for_status()
|
181
196
|
except requests.exceptions.RequestException as e:
|
@@ -191,3 +206,92 @@ class EvalDatasetClient:
|
|
191
206
|
)
|
192
207
|
|
193
208
|
return payload
|
209
|
+
|
210
|
+
def edit_dataset(self, alias: str, examples: List[Example], ground_truths: List[GroundTruthExample]) -> bool:
|
211
|
+
"""
|
212
|
+
Edits the dataset on Judgment platform by adding new examples and ground truths
|
213
|
+
|
214
|
+
Mock request:
|
215
|
+
{
|
216
|
+
"alias": alias,
|
217
|
+
"examples": [...],
|
218
|
+
"ground_truths": [...],
|
219
|
+
"judgment_api_key": self.judgment_api_key
|
220
|
+
}
|
221
|
+
"""
|
222
|
+
with Progress(
|
223
|
+
SpinnerColumn(style="rgb(106,0,255)"),
|
224
|
+
TextColumn("[progress.description]{task.description}"),
|
225
|
+
transient=False,
|
226
|
+
) as progress:
|
227
|
+
task_id = progress.add_task(
|
228
|
+
f"Editing dataset [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] on Judgment...",
|
229
|
+
total=100,
|
230
|
+
)
|
231
|
+
|
232
|
+
content = {
|
233
|
+
"alias": alias,
|
234
|
+
"examples": [e.to_dict() for e in examples],
|
235
|
+
"ground_truths": [g.to_dict() for g in ground_truths],
|
236
|
+
"judgment_api_key": self.judgment_api_key
|
237
|
+
}
|
238
|
+
|
239
|
+
try:
|
240
|
+
response = requests.post(
|
241
|
+
JUDGMENT_DATASETS_EDIT_API_URL,
|
242
|
+
json=content,
|
243
|
+
headers={
|
244
|
+
"Content-Type": "application/json",
|
245
|
+
"Authorization": f"Bearer {self.judgment_api_key}",
|
246
|
+
"X-Organization-Id": self.organization_id
|
247
|
+
}
|
248
|
+
)
|
249
|
+
response.raise_for_status()
|
250
|
+
except requests.exceptions.RequestException as e:
|
251
|
+
error(f"Error editing dataset: {str(e)}")
|
252
|
+
return False
|
253
|
+
|
254
|
+
info(f"Successfully edited dataset '{alias}'")
|
255
|
+
return True
|
256
|
+
|
257
|
+
def export_jsonl(self, alias: str) -> requests.Response:
|
258
|
+
"""Export dataset in JSONL format from Judgment platform"""
|
259
|
+
debug(f"Exporting dataset with alias '{alias}' as JSONL")
|
260
|
+
with Progress(
|
261
|
+
SpinnerColumn(style="rgb(106,0,255)"),
|
262
|
+
TextColumn("[progress.description]{task.description}"),
|
263
|
+
transient=False,
|
264
|
+
) as progress:
|
265
|
+
task_id = progress.add_task(
|
266
|
+
f"Exporting [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] as JSONL...",
|
267
|
+
total=100,
|
268
|
+
)
|
269
|
+
try:
|
270
|
+
response = requests.post(
|
271
|
+
JUDGMENT_DATASETS_EXPORT_JSONL_API_URL,
|
272
|
+
json={"alias": alias},
|
273
|
+
headers={
|
274
|
+
"Content-Type": "application/json",
|
275
|
+
"Authorization": f"Bearer {self.judgment_api_key}",
|
276
|
+
"X-Organization-Id": self.organization_id
|
277
|
+
},
|
278
|
+
stream=True
|
279
|
+
)
|
280
|
+
response.raise_for_status()
|
281
|
+
except requests.exceptions.HTTPError as err:
|
282
|
+
if err.response.status_code == 404:
|
283
|
+
error(f"Dataset not found: {alias}")
|
284
|
+
else:
|
285
|
+
error(f"HTTP error during export: {err}")
|
286
|
+
raise
|
287
|
+
except Exception as e:
|
288
|
+
error(f"Error during export: {str(e)}")
|
289
|
+
raise
|
290
|
+
|
291
|
+
info(f"Successfully exported dataset with alias '{alias}'")
|
292
|
+
progress.update(
|
293
|
+
task_id,
|
294
|
+
description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
|
295
|
+
)
|
296
|
+
|
297
|
+
return response
|
judgeval/data/example.py
CHANGED
@@ -4,9 +4,11 @@ Classes for representing examples in a dataset.
|
|
4
4
|
|
5
5
|
|
6
6
|
from typing import TypeVar, Optional, Any, Dict, List
|
7
|
-
from
|
7
|
+
from uuid import uuid4
|
8
|
+
from pydantic import BaseModel, Field, field_validator
|
8
9
|
from enum import Enum
|
9
10
|
from datetime import datetime
|
11
|
+
import time
|
10
12
|
|
11
13
|
|
12
14
|
Input = TypeVar('Input')
|
@@ -33,15 +35,26 @@ class Example(BaseModel):
|
|
33
35
|
tools_called: Optional[List[str]] = None
|
34
36
|
expected_tools: Optional[List[str]] = None
|
35
37
|
name: Optional[str] = None
|
36
|
-
example_id:
|
38
|
+
example_id: str = Field(default_factory=lambda: str(uuid4()))
|
39
|
+
example_index: Optional[int] = None
|
37
40
|
timestamp: Optional[str] = None
|
38
41
|
trace_id: Optional[str] = None
|
39
42
|
|
43
|
+
@field_validator('input', 'actual_output', mode='before')
|
44
|
+
def convert_to_str(cls, value):
|
45
|
+
try:
|
46
|
+
return str(value)
|
47
|
+
except Exception:
|
48
|
+
return repr(value)
|
49
|
+
|
40
50
|
def __init__(self, **data):
|
41
|
-
|
51
|
+
if 'example_id' not in data:
|
52
|
+
data['example_id'] = str(uuid4())
|
42
53
|
# Set timestamp if not provided
|
43
|
-
if
|
44
|
-
|
54
|
+
if 'timestamp' not in data:
|
55
|
+
data['timestamp'] = datetime.now().strftime("%Y%m%d_%H%M%S")
|
56
|
+
super().__init__(**data)
|
57
|
+
|
45
58
|
|
46
59
|
def to_dict(self):
|
47
60
|
return {
|
@@ -55,6 +68,7 @@ class Example(BaseModel):
|
|
55
68
|
"expected_tools": self.expected_tools,
|
56
69
|
"name": self.name,
|
57
70
|
"example_id": self.example_id,
|
71
|
+
"example_index": self.example_index,
|
58
72
|
"timestamp": self.timestamp,
|
59
73
|
"trace_id": self.trace_id
|
60
74
|
}
|
@@ -71,6 +85,7 @@ class Example(BaseModel):
|
|
71
85
|
f"expected_tools={self.expected_tools}, "
|
72
86
|
f"name={self.name}, "
|
73
87
|
f"example_id={self.example_id}, "
|
88
|
+
f"example_index={self.example_index}, "
|
74
89
|
f"timestamp={self.timestamp}, "
|
75
90
|
f"trace_id={self.trace_id})"
|
76
91
|
)
|
judgeval/evaluation_run.py
CHANGED
@@ -24,6 +24,7 @@ class EvaluationRun(BaseModel):
|
|
24
24
|
|
25
25
|
# The user will specify whether they want log_results when they call run_eval
|
26
26
|
log_results: bool = False # NOTE: log_results has to be set first because it is used to validate project_name and eval_name
|
27
|
+
organization_id: Optional[str] = None
|
27
28
|
project_name: Optional[str] = None
|
28
29
|
eval_name: Optional[str] = None
|
29
30
|
examples: List[Example]
|
judgeval/judgment_client.py
CHANGED
@@ -6,7 +6,7 @@ from typing import Optional, List, Dict, Any, Union
|
|
6
6
|
import requests
|
7
7
|
|
8
8
|
from judgeval.constants import ROOT_API
|
9
|
-
from judgeval.data.datasets import EvalDataset, EvalDatasetClient
|
9
|
+
from judgeval.data.datasets import EvalDataset, EvalDatasetClient, GroundTruthExample
|
10
10
|
from judgeval.data import (
|
11
11
|
ScoringResult,
|
12
12
|
Example
|
@@ -34,9 +34,10 @@ class EvalRunRequestBody(BaseModel):
|
|
34
34
|
|
35
35
|
|
36
36
|
class JudgmentClient:
|
37
|
-
def __init__(self, judgment_api_key: str = os.getenv("JUDGMENT_API_KEY")):
|
37
|
+
def __init__(self, judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"), organization_id: str = os.getenv("ORGANIZATION_ID")):
|
38
38
|
self.judgment_api_key = judgment_api_key
|
39
|
-
self.
|
39
|
+
self.organization_id = organization_id
|
40
|
+
self.eval_dataset_client = EvalDatasetClient(judgment_api_key, organization_id)
|
40
41
|
|
41
42
|
# Verify API key is valid
|
42
43
|
result, response = self._validate_api_key()
|
@@ -78,7 +79,8 @@ class JudgmentClient:
|
|
78
79
|
model=model,
|
79
80
|
aggregator=aggregator,
|
80
81
|
metadata=metadata,
|
81
|
-
judgment_api_key=self.judgment_api_key
|
82
|
+
judgment_api_key=self.judgment_api_key,
|
83
|
+
organization_id=self.organization_id
|
82
84
|
)
|
83
85
|
return run_eval(eval, override)
|
84
86
|
except ValueError as e:
|
@@ -115,7 +117,8 @@ class JudgmentClient:
|
|
115
117
|
model=model,
|
116
118
|
aggregator=aggregator,
|
117
119
|
metadata=metadata,
|
118
|
-
judgment_api_key=self.judgment_api_key
|
120
|
+
judgment_api_key=self.judgment_api_key,
|
121
|
+
organization_id=self.organization_id
|
119
122
|
)
|
120
123
|
return run_eval(evaluation_run)
|
121
124
|
except ValueError as e:
|
@@ -164,6 +167,11 @@ class JudgmentClient:
|
|
164
167
|
"""
|
165
168
|
return self.eval_dataset_client.pull_all_user_dataset_stats()
|
166
169
|
|
170
|
+
def edit_dataset(self, alias: str, examples: List[Example], ground_truths: List[GroundTruthExample]) -> bool:
|
171
|
+
"""
|
172
|
+
Edits the dataset on Judgment platform by adding new examples and ground truths
|
173
|
+
"""
|
174
|
+
return self.eval_dataset_client.edit_dataset(alias, examples, ground_truths)
|
167
175
|
|
168
176
|
# Maybe add option where you can pass in the EvaluationRun object and it will pull the eval results from the backend
|
169
177
|
def pull_eval(self, project_name: str, eval_run_name: str) -> List[Dict[str, Union[str, List[ScoringResult]]]]:
|
@@ -182,6 +190,11 @@ class JudgmentClient:
|
|
182
190
|
eval_name=eval_run_name,
|
183
191
|
judgment_api_key=self.judgment_api_key)
|
184
192
|
eval_run = requests.post(JUDGMENT_EVAL_FETCH_API_URL,
|
193
|
+
headers={
|
194
|
+
"Content-Type": "application/json",
|
195
|
+
"Authorization": f"Bearer {self.judgment_api_key}",
|
196
|
+
"X-Organization-Id": self.organization_id
|
197
|
+
},
|
185
198
|
json=eval_run_request_body.model_dump())
|
186
199
|
if eval_run.status_code != requests.codes.ok:
|
187
200
|
raise ValueError(f"Error fetching eval results: {eval_run.json()}")
|
@@ -213,6 +226,8 @@ class JudgmentClient:
|
|
213
226
|
json=eval_run_request_body.model_dump(),
|
214
227
|
headers={
|
215
228
|
"Content-Type": "application/json",
|
229
|
+
"Authorization": f"Bearer {self.judgment_api_key}",
|
230
|
+
"X-Organization-Id": self.organization_id
|
216
231
|
})
|
217
232
|
if response.status_code != requests.codes.ok:
|
218
233
|
raise ValueError(f"Error deleting eval results: {response.json()}")
|
@@ -231,10 +246,12 @@ class JudgmentClient:
|
|
231
246
|
response = requests.delete(JUDGMENT_EVAL_DELETE_PROJECT_API_URL,
|
232
247
|
json={
|
233
248
|
"project_name": project_name,
|
234
|
-
"judgment_api_key": self.judgment_api_key
|
249
|
+
"judgment_api_key": self.judgment_api_key,
|
235
250
|
},
|
236
251
|
headers={
|
237
252
|
"Content-Type": "application/json",
|
253
|
+
"Authorization": f"Bearer {self.judgment_api_key}",
|
254
|
+
"X-Organization-Id": self.organization_id
|
238
255
|
})
|
239
256
|
if response.status_code != requests.codes.ok:
|
240
257
|
raise ValueError(f"Error deleting eval results: {response.json()}")
|
@@ -246,7 +263,11 @@ class JudgmentClient:
|
|
246
263
|
"""
|
247
264
|
response = requests.post(
|
248
265
|
f"{ROOT_API}/validate_api_key/",
|
249
|
-
|
266
|
+
headers={
|
267
|
+
"Content-Type": "application/json",
|
268
|
+
"Authorization": f"Bearer {self.judgment_api_key}",
|
269
|
+
},
|
270
|
+
json={} # Empty body now
|
250
271
|
)
|
251
272
|
if response.status_code == 200:
|
252
273
|
return True, response.json()
|
@@ -268,12 +289,16 @@ class JudgmentClient:
|
|
268
289
|
"""
|
269
290
|
request_body = {
|
270
291
|
"slug": slug,
|
271
|
-
"judgment_api_key": self.judgment_api_key
|
272
292
|
}
|
273
293
|
|
274
294
|
response = requests.post(
|
275
295
|
f"{ROOT_API}/fetch_scorer/",
|
276
|
-
json=request_body
|
296
|
+
json=request_body,
|
297
|
+
headers={
|
298
|
+
"Content-Type": "application/json",
|
299
|
+
"Authorization": f"Bearer {self.judgment_api_key}",
|
300
|
+
"X-Organization-Id": self.organization_id
|
301
|
+
}
|
277
302
|
)
|
278
303
|
|
279
304
|
if response.status_code == 500:
|
@@ -306,13 +331,17 @@ class JudgmentClient:
|
|
306
331
|
"name": scorer.name,
|
307
332
|
"conversation": scorer.conversation,
|
308
333
|
"options": scorer.options,
|
309
|
-
"judgment_api_key": self.judgment_api_key,
|
310
334
|
"slug": slug
|
311
335
|
}
|
312
336
|
|
313
337
|
response = requests.post(
|
314
338
|
f"{ROOT_API}/save_scorer/",
|
315
|
-
json=request_body
|
339
|
+
json=request_body,
|
340
|
+
headers={
|
341
|
+
"Content-Type": "application/json",
|
342
|
+
"Authorization": f"Bearer {self.judgment_api_key}",
|
343
|
+
"X-Organization-Id": self.organization_id
|
344
|
+
}
|
316
345
|
)
|
317
346
|
|
318
347
|
if response.status_code == 500:
|
judgeval/run_evaluation.py
CHANGED
@@ -47,7 +47,13 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
|
|
47
47
|
try:
|
48
48
|
# submit API request to execute evals
|
49
49
|
payload = evaluation_run.model_dump(warnings=False)
|
50
|
-
response = requests.post(
|
50
|
+
response = requests.post(
|
51
|
+
JUDGMENT_EVAL_API_URL, headers={
|
52
|
+
"Content-Type": "application/json",
|
53
|
+
"Authorization": f"Bearer {evaluation_run.judgment_api_key}",
|
54
|
+
"X-Organization-Id": evaluation_run.organization_id
|
55
|
+
},
|
56
|
+
json=payload)
|
51
57
|
response_data = response.json()
|
52
58
|
except Exception as e:
|
53
59
|
error(f"Error: {e}")
|
@@ -135,7 +141,7 @@ def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResul
|
|
135
141
|
return results
|
136
142
|
|
137
143
|
|
138
|
-
def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_key: str) -> None:
|
144
|
+
def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str) -> None:
|
139
145
|
"""
|
140
146
|
Checks if an evaluation run name already exists for a given project.
|
141
147
|
|
@@ -151,6 +157,11 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
|
|
151
157
|
try:
|
152
158
|
response = requests.post(
|
153
159
|
f"{ROOT_API}/eval-run-name-exists/",
|
160
|
+
headers={
|
161
|
+
"Content-Type": "application/json",
|
162
|
+
"Authorization": f"Bearer {judgment_api_key}",
|
163
|
+
"X-Organization-Id": organization_id
|
164
|
+
},
|
154
165
|
json={
|
155
166
|
"eval_name": eval_name,
|
156
167
|
"project_name": project_name,
|
@@ -188,9 +199,13 @@ def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run:
|
|
188
199
|
try:
|
189
200
|
res = requests.post(
|
190
201
|
JUDGMENT_EVAL_LOG_API_URL,
|
202
|
+
headers={
|
203
|
+
"Content-Type": "application/json",
|
204
|
+
"Authorization": f"Bearer {evaluation_run.judgment_api_key}",
|
205
|
+
"X-Organization-Id": evaluation_run.organization_id
|
206
|
+
},
|
191
207
|
json={
|
192
208
|
"results": [result.to_dict() for result in merged_results],
|
193
|
-
"judgment_api_key": evaluation_run.judgment_api_key,
|
194
209
|
"project_name": evaluation_run.project_name,
|
195
210
|
"eval_name": evaluation_run.eval_name,
|
196
211
|
}
|
@@ -241,18 +256,17 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
|
|
241
256
|
check_eval_run_name_exists(
|
242
257
|
evaluation_run.eval_name,
|
243
258
|
evaluation_run.project_name,
|
244
|
-
evaluation_run.judgment_api_key
|
259
|
+
evaluation_run.judgment_api_key,
|
260
|
+
evaluation_run.organization_id
|
245
261
|
)
|
246
262
|
|
247
263
|
# Set example IDs if not already set
|
248
264
|
debug("Initializing examples with IDs and timestamps")
|
249
265
|
for idx, example in enumerate(evaluation_run.examples):
|
250
|
-
|
251
|
-
example.example_id = idx
|
252
|
-
debug(f"Set example ID {idx} for input: {example.input[:50]}...")
|
266
|
+
example.example_index = idx # Set numeric index
|
253
267
|
example.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
254
268
|
with example_logging_context(example.timestamp, example.example_id):
|
255
|
-
debug(f"Initialized example {example.example_id}")
|
269
|
+
debug(f"Initialized example {example.example_id} (index: {example.example_index})")
|
256
270
|
debug(f"Input: {example.input}")
|
257
271
|
debug(f"Actual output: {example.actual_output}")
|
258
272
|
if example.expected_output:
|
@@ -301,6 +315,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
|
|
301
315
|
aggregator=evaluation_run.aggregator,
|
302
316
|
metadata=evaluation_run.metadata,
|
303
317
|
judgment_api_key=evaluation_run.judgment_api_key,
|
318
|
+
organization_id=evaluation_run.organization_id,
|
304
319
|
log_results=evaluation_run.log_results
|
305
320
|
)
|
306
321
|
debug("Sending request to Judgment API")
|
@@ -1,22 +1,22 @@
|
|
1
1
|
judgeval/__init__.py,sha256=xiiG4CkeaOtey4fusCd9CBz0BVqzTIbV-K2EFIU0rUM,283
|
2
2
|
judgeval/clients.py,sha256=Ns5ljrgPPXUMo7fSPJxO12H64lcPyKeQPIVG_RMi2cM,1162
|
3
|
-
judgeval/constants.py,sha256=
|
4
|
-
judgeval/evaluation_run.py,sha256=
|
5
|
-
judgeval/judgment_client.py,sha256=
|
6
|
-
judgeval/run_evaluation.py,sha256=
|
3
|
+
judgeval/constants.py,sha256=43hGesvBbX1uzc4KXvjLCVdd6cyZRMSnEJp11oA7h74,2794
|
4
|
+
judgeval/evaluation_run.py,sha256=59lG8AUFTKqbY_JVEEA0I093-Pmiy0ERYDK5BuXuEGg,5965
|
5
|
+
judgeval/judgment_client.py,sha256=ryGT3A9-Him6oco3WvuHbjB-FVvAR3wCiiGz03eO_Q4,15409
|
6
|
+
judgeval/run_evaluation.py,sha256=Cc7BS07WyqsNpQ38HdMdRI782N3DANjM8UcIq9AwaGA,20769
|
7
7
|
judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
|
8
8
|
judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
|
9
9
|
judgeval/common/logger.py,sha256=QXN3UMymmKu2iMEMEgATLBnMDjGr_pE2iOSEFoICgg8,6092
|
10
|
-
judgeval/common/tracer.py,sha256=
|
10
|
+
judgeval/common/tracer.py,sha256=qam2suh-0_Cu_B7AWg3AMfEo2TisRZVY1SnAfqhiFQo,33211
|
11
11
|
judgeval/common/utils.py,sha256=3WRyyX0tvnnj_VAVlEdtZrfzyWj6zfX04xdpCtE1m5Y,33736
|
12
12
|
judgeval/data/__init__.py,sha256=YferxwmUqoBi18hrdgro0BD0h4pt20LAqISeUzGMcVU,474
|
13
13
|
judgeval/data/api_example.py,sha256=vwWFbI6eJr5VgURCRbuSiMtEXLUbTCih_BcaqEBy-pg,4108
|
14
|
-
judgeval/data/example.py,sha256=
|
14
|
+
judgeval/data/example.py,sha256=Rd-eDEM-giYfkfsGh_PBS2wwl15QlQPzbMV-J64Yj5E,2991
|
15
15
|
judgeval/data/result.py,sha256=8FIO-bFKPegZuByKRjA2_sumjb8oGWQ5ZeQ1RVz5z2w,4393
|
16
16
|
judgeval/data/scorer_data.py,sha256=pYljblCPZrlMIv5Eg7R-clnmsqzUBAwokKjZpwa0DXE,3280
|
17
17
|
judgeval/data/datasets/__init__.py,sha256=eO6ayeM_bTGwIt0eDSlTBIIBvXvIWRWWSfYZrZROPiQ,265
|
18
|
-
judgeval/data/datasets/dataset.py,sha256=
|
19
|
-
judgeval/data/datasets/eval_dataset_client.py,sha256=
|
18
|
+
judgeval/data/datasets/dataset.py,sha256=KdAY0KRUB2jxcGmc1XXXheFFcPsGFOIGY-kTwBNQS_Y,12080
|
19
|
+
judgeval/data/datasets/eval_dataset_client.py,sha256=DzxWQIiHlbpg6FpmWY6brcSP_h_rGcztk2A_6tQNFys,11411
|
20
20
|
judgeval/data/datasets/ground_truth.py,sha256=OTBs3VZe-Wp0vEXEsq14GPZHYtpWT16bhGQTycIvkKc,2057
|
21
21
|
judgeval/data/datasets/utils.py,sha256=lQxyl7mevct7JcDSyIrU_8QOzT-EYPWEvoUiAeOdeek,2502
|
22
22
|
judgeval/judges/__init__.py,sha256=tyQ5KY88Kp1Ctfw2IJxnVEpy8DnFCtmy04JdPOpp-As,339
|
@@ -78,7 +78,7 @@ judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarizat
|
|
78
78
|
judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py,sha256=JUB3TMqS1OHr6PqpIGqkyiBNbyfUaw7lZuUATjU3_ek,168
|
79
79
|
judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py,sha256=CYGRJY5EuyICYzHrmFdLykwXakX8AC7G3Bhj7p6szfY,5493
|
80
80
|
judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
|
81
|
-
judgeval-0.0.
|
82
|
-
judgeval-0.0.
|
83
|
-
judgeval-0.0.
|
84
|
-
judgeval-0.0.
|
81
|
+
judgeval-0.0.14.dist-info/METADATA,sha256=ZmCAECDNWwzpuES1slYKWcY_U-SMOsjaOdtSoj6wu0I,1283
|
82
|
+
judgeval-0.0.14.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
83
|
+
judgeval-0.0.14.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
|
84
|
+
judgeval-0.0.14.dist-info/RECORD,,
|
File without changes
|
File without changes
|