judgeval 0.0.11__py3-none-any.whl → 0.0.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +87 -28
- judgeval/constants.py +2 -0
- judgeval/data/datasets/dataset.py +2 -1
- judgeval/data/datasets/eval_dataset_client.py +106 -9
- judgeval/data/example.py +13 -5
- judgeval/judgment_client.py +29 -6
- judgeval/run_evaluation.py +16 -5
- {judgeval-0.0.11.dist-info → judgeval-0.0.13.dist-info}/METADATA +1 -1
- {judgeval-0.0.11.dist-info → judgeval-0.0.13.dist-info}/RECORD +11 -11
- {judgeval-0.0.11.dist-info → judgeval-0.0.13.dist-info}/WHEEL +0 -0
- {judgeval-0.0.11.dist-info → judgeval-0.0.13.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/tracer.py
CHANGED
@@ -199,10 +199,11 @@ class TraceManagerClient:
             JUDGMENT_TRACES_FETCH_API_URL,
             json={
                 "trace_id": trace_id,
-                "judgment_api_key": self.judgment_api_key,
+                # "judgment_api_key": self.judgment_api_key,
             },
             headers={
                 "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.judgment_api_key}"
             }
         )

@@ -225,6 +226,7 @@ class TraceManagerClient:
             json=trace_data,
             headers={
                 "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.judgment_api_key}"
             }
         )

@@ -248,6 +250,7 @@ class TraceManagerClient:
             },
             headers={
                 "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.judgment_api_key}"
             }
         )

@@ -263,11 +266,12 @@ class TraceManagerClient:
         response = requests.delete(
             JUDGMENT_TRACES_DELETE_API_URL,
             json={
-                "judgment_api_key": self.judgment_api_key,
+                # "judgment_api_key": self.judgment_api_key,
                 "trace_ids": trace_ids,
             },
             headers={
                 "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.judgment_api_key}"
             }
         )

@@ -557,7 +561,8 @@ class TraceClient:
             "overwrite": overwrite
         }

-
+        # Execute asynchronous evaluation in the background
+        if not empty_save:  # Only send to RabbitMQ if the trace is not empty
             connection = pika.BlockingConnection(
                 pika.ConnectionParameters(host=RABBITMQ_HOST, port=RABBITMQ_PORT))
             channel = connection.channel()
@@ -575,6 +580,25 @@ class TraceClient:

         self.trace_manager_client.save_trace(trace_data, empty_save)

+
+        # Save trace data by making POST request to API
+        response = requests.post(
+            JUDGMENT_TRACES_SAVE_API_URL,
+            json=trace_data,
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.tracer.api_key}"  # Bearer token format
+            }
+        )
+
+        if response.status_code == HTTPStatus.BAD_REQUEST:
+            raise ValueError(f"Failed to save trace data: Check your Trace name for conflicts, set overwrite=True to overwrite existing traces: {response.text}")
+        elif response.status_code != HTTPStatus.OK:
+            raise ValueError(f"Failed to save trace data: {response.text}")
+
+        if not empty_save and "ui_results_url" in response.json():
+            rprint(f"\n🔍 You can view your trace data here: [rgb(106,0,255)]{response.json()['ui_results_url']}[/]\n")
+
         return self.trace_id, trace_data

     def delete(self):
@@ -588,23 +612,31 @@ class Tracer:
             cls._instance = super(Tracer, cls).__new__(cls)
         return cls._instance

-    def __init__(self, api_key: str = os.getenv("JUDGMENT_API_KEY")):
+    def __init__(self, api_key: str = os.getenv("JUDGMENT_API_KEY"), project_name: str = "default_project"):
         if not hasattr(self, 'initialized'):
-
             if not api_key:
                 raise ValueError("Tracer must be configured with a Judgment API key")

             self.api_key: str = api_key
+            self.project_name: str = project_name
             self.client: JudgmentClient = JudgmentClient(judgment_api_key=api_key)
             self.depth: int = 0
             self._current_trace: Optional[str] = None
             self.initialized: bool = True
+        elif hasattr(self, 'project_name') and self.project_name != project_name:
+            warnings.warn(
+                f"Attempting to initialize Tracer with project_name='{project_name}' but it was already initialized with "
+                f"project_name='{self.project_name}'. Due to the singleton pattern, the original project_name will be used. "
+                "To use a different project name, ensure the first Tracer initialization uses the desired project name.",
+                RuntimeWarning
+            )

     @contextmanager
-    def trace(self, name: str, project_name: str =
+    def trace(self, name: str, project_name: str = None, overwrite: bool = False) -> Generator[TraceClient, None, None]:
         """Start a new trace context using a context manager"""
         trace_id = str(uuid.uuid4())
-
+        project = project_name if project_name is not None else self.project_name
+        trace = TraceClient(self, trace_id, name, project_name=project, overwrite=overwrite)
         prev_trace = self._current_trace
         self._current_trace = trace

@@ -623,28 +655,40 @@ class Tracer:
         """
         return self._current_trace

-    def observe(self, func=None, *, name=None, span_type: SpanType = "span"):
+    def observe(self, func=None, *, name=None, span_type: SpanType = "span", project_name: str = None, overwrite: bool = False):
         """
         Decorator to trace function execution with detailed entry/exit information.

         Args:
-            func: The function to
-            name: Optional custom name for the function
-            span_type:
+            func: The function to decorate
+            name: Optional custom name for the span (defaults to function name)
+            span_type: Type of span (default "span")
+            project_name: Optional project name override
+            overwrite: Whether to overwrite existing traces
         """
         if func is None:
-            return lambda f: self.observe(f, name=name, span_type=span_type)
+            return lambda f: self.observe(f, name=name, span_type=span_type, project_name=project_name, overwrite=overwrite)
+
+        # Use provided name or fall back to function name
+        span_name = name or func.__name__

         if asyncio.iscoroutinefunction(func):
             @functools.wraps(func)
             async def async_wrapper(*args, **kwargs):
+                # If there's already a trace, use it. Otherwise create a new one
                 if self._current_trace:
-
-
-
-
-
-
+                    trace = self._current_trace
+                else:
+                    trace_id = str(uuid.uuid4())
+                    trace_name = str(uuid.uuid4())
+                    project = project_name if project_name is not None else self.project_name
+                    trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite)
+                    self._current_trace = trace
+                    # Only save empty trace for the root call
+                    trace.save(empty_save=True, overwrite=overwrite)
+
+                try:
+                    with trace.span(span_name, span_type=span_type) as span:
                         # Record inputs
                         span.record_input({
                             'args': list(args),
@@ -658,19 +702,30 @@ class Tracer:
                         span.record_output(result)

                         return result
-
-
+                finally:
+                    # Only save and cleanup if this is the root observe call
+                    if self.depth == 0:
+                        trace.save(empty_save=False, overwrite=overwrite)
+                        self._current_trace = None
+
            return async_wrapper
         else:
             @functools.wraps(func)
             def wrapper(*args, **kwargs):
+                # If there's already a trace, use it. Otherwise create a new one
                 if self._current_trace:
-
-
-
-
-
-
+                    trace = self._current_trace
+                else:
+                    trace_id = str(uuid.uuid4())
+                    trace_name = str(uuid.uuid4())
+                    project = project_name if project_name is not None else self.project_name
+                    trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite)
+                    self._current_trace = trace
+                    # Only save empty trace for the root call
+                    trace.save(empty_save=True, overwrite=overwrite)
+
+                try:
+                    with trace.span(span_name, span_type=span_type) as span:
                         # Record inputs
                         span.record_input({
                             'args': list(args),
@@ -684,8 +739,12 @@ class Tracer:
                         span.record_output(result)

                         return result
-
-
+                finally:
+                    # Only save and cleanup if this is the root observe call
+                    if self.depth == 0:
+                        trace.save(empty_save=False, overwrite=overwrite)
+                        self._current_trace = None
+
            return wrapper

 def wrap(client: Any) -> Any:
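Net effect of the tracer changes: the API key moves from the JSON body into an Authorization: Bearer header, the Tracer singleton becomes project-aware, and @observe can now create and save a root trace on its own. A minimal usage sketch under those assumptions — the project name and the answer() function are illustrative, not part of the package:

import os
from judgeval.common.tracer import Tracer

# Singleton: the first initialization wins; a later call with a different
# project_name only emits a RuntimeWarning (see the __init__ diff above).
tracer = Tracer(api_key=os.getenv("JUDGMENT_API_KEY"), project_name="demo_project")

# With no active trace, observe() now creates one itself: empty_save=True up
# front, then a full save in the finally block of the root call.
@tracer.observe(span_type="span")
def answer(question: str) -> str:  # hypothetical traced function
    return f"echo: {question}"

answer("hello")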
judgeval/constants.py
CHANGED
@@ -36,7 +36,9 @@ ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
 JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
 JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
 JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull/"
+JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
 JUDGMENT_DATASETS_PULL_ALL_API_URL = f"{ROOT_API}/datasets/get_all_stats/"
+JUDGMENT_DATASETS_EDIT_API_URL = f"{ROOT_API}/datasets/edit/"
 JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
 JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_eval_results/"
 JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_name/"
judgeval/data/datasets/dataset.py
CHANGED
@@ -162,7 +162,8 @@ class EvalDataset:
                 "additional_metadata": ast.literal_eval(row["additional_metadata"]) if pd.notna(row["additional_metadata"]) else dict(),
                 "tools_called": row["tools_called"].split(";") if pd.notna(row["tools_called"]) else [],
                 "expected_tools": row["expected_tools"].split(";") if pd.notna(row["expected_tools"]) else [],
-                "trace_id": row["trace_id"] if pd.notna(row["trace_id"]) else None
+                "trace_id": row["trace_id"] if pd.notna(row["trace_id"]) else None,
+                "example_id": str(row["example_id"]) if pd.notna(row["example_id"]) else None
             }
             if row["example"]:
                 data["name"] = row["name"] if pd.notna(row["name"]) else None
judgeval/data/datasets/eval_dataset_client.py
CHANGED
@@ -1,5 +1,5 @@

-from typing import Optional
+from typing import Optional, List
 import requests
 from rich.progress import Progress, SpinnerColumn, TextColumn

@@ -7,7 +7,9 @@ from judgeval.common.logger import debug, error, warning, info
 from judgeval.constants import (
     JUDGMENT_DATASETS_PUSH_API_URL,
     JUDGMENT_DATASETS_PULL_API_URL,
-    JUDGMENT_DATASETS_PULL_ALL_API_URL
+    JUDGMENT_DATASETS_PULL_ALL_API_URL,
+    JUDGMENT_DATASETS_EDIT_API_URL,
+    JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
 )
 from judgeval.data import Example
 from judgeval.data.datasets import EvalDataset
@@ -23,7 +25,7 @@ class EvalDatasetClient:
     def create_dataset(self) -> EvalDataset:
         return EvalDataset(judgment_api_key=self.judgment_api_key)

-    def push(self, dataset: EvalDataset, alias: str,overwrite: Optional[bool] = False) -> bool:
+    def push(self, dataset: EvalDataset, alias: str, overwrite: Optional[bool] = False) -> bool:
         debug(f"Pushing dataset with alias '{alias}' (overwrite={overwrite})")
         if overwrite:
             warning(f"Overwrite enabled for alias '{alias}'")
@@ -56,12 +58,16 @@ class EvalDatasetClient:
             "ground_truths": [g.to_dict() for g in dataset.ground_truths],
             "examples": [e.to_dict() for e in dataset.examples],
             "overwrite": overwrite,
-            "judgment_api_key": dataset.judgment_api_key
+            # "judgment_api_key": dataset.judgment_api_key
         }
         try:
             response = requests.post(
                 JUDGMENT_DATASETS_PUSH_API_URL,
-                json=content
+                json=content,
+                headers={
+                    "Content-Type": "application/json",
+                    "Authorization": f"Bearer {self.judgment_api_key}"
+                }
             )
             if response.status_code == 500:
                 error(f"Server error during push: {content.get('message')}")
@@ -115,13 +121,17 @@ class EvalDatasetClient:
         )
         request_body = {
             "alias": alias,
-            "judgment_api_key": self.judgment_api_key
+            # "judgment_api_key": self.judgment_api_key
         }

         try:
             response = requests.post(
                 JUDGMENT_DATASETS_PULL_API_URL,
-                json=request_body
+                json=request_body,
+                headers={
+                    "Content-Type": "application/json",
+                    "Authorization": f"Bearer {self.judgment_api_key}"
+                }
             )
             response.raise_for_status()
         except requests.exceptions.RequestException as e:
@@ -169,13 +179,17 @@ class EvalDatasetClient:
             total=100,
         )
         request_body = {
-            "judgment_api_key": self.judgment_api_key
+            # "judgment_api_key": self.judgment_api_key
         }

         try:
             response = requests.post(
                 JUDGMENT_DATASETS_PULL_ALL_API_URL,
-                json=request_body
+                json=request_body,
+                headers={
+                    "Content-Type": "application/json",
+                    "Authorization": f"Bearer {self.judgment_api_key}"
+                }
             )
             response.raise_for_status()
         except requests.exceptions.RequestException as e:
@@ -191,3 +205,86 @@ class EvalDatasetClient:
         )

         return payload
+
+    def edit_dataset(self, alias: str, examples: List[Example], ground_truths: List[GroundTruthExample]) -> bool:
+        """
+        Edits the dataset on Judgment platform by adding new examples and ground truths
+
+        Mock request:
+        {
+            "alias": alias,
+            "examples": [...],
+            "ground_truths": [...],
+            "judgment_api_key": self.judgment_api_key
+        }
+        """
+        with Progress(
+            SpinnerColumn(style="rgb(106,0,255)"),
+            TextColumn("[progress.description]{task.description}"),
+            transient=False,
+        ) as progress:
+            task_id = progress.add_task(
+                f"Editing dataset [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] on Judgment...",
+                total=100,
+            )
+
+            content = {
+                "alias": alias,
+                "examples": [e.to_dict() for e in examples],
+                "ground_truths": [g.to_dict() for g in ground_truths],
+                "judgment_api_key": self.judgment_api_key
+            }
+
+            try:
+                response = requests.post(
+                    JUDGMENT_DATASETS_EDIT_API_URL,
+                    json=content
+                )
+                response.raise_for_status()
+            except requests.exceptions.RequestException as e:
+                error(f"Error editing dataset: {str(e)}")
+                return False
+
+            info(f"Successfully edited dataset '{alias}'")
+            return True
+
+    def export_jsonl(self, alias: str) -> requests.Response:
+        """Export dataset in JSONL format from Judgment platform"""
+        debug(f"Exporting dataset with alias '{alias}' as JSONL")
+        with Progress(
+            SpinnerColumn(style="rgb(106,0,255)"),
+            TextColumn("[progress.description]{task.description}"),
+            transient=False,
+        ) as progress:
+            task_id = progress.add_task(
+                f"Exporting [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] as JSONL...",
+                total=100,
+            )
+            try:
+                response = requests.post(
+                    JUDGMENT_DATASETS_EXPORT_JSONL_API_URL,
+                    json={"alias": alias},
+                    headers={
+                        "Content-Type": "application/json",
+                        "Authorization": f"Bearer {self.judgment_api_key}"
+                    },
+                    stream=True
+                )
+                response.raise_for_status()
+            except requests.exceptions.HTTPError as err:
+                if err.response.status_code == 404:
+                    error(f"Dataset not found: {alias}")
+                else:
+                    error(f"HTTP error during export: {err}")
+                raise
+            except Exception as e:
+                error(f"Error during export: {str(e)}")
+                raise
+
+            info(f"Successfully exported dataset with alias '{alias}'")
+            progress.update(
+                task_id,
+                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
+            )
+
+            return response
judgeval/data/example.py
CHANGED
@@ -4,9 +4,11 @@ Classes for representing examples in a dataset.


 from typing import TypeVar, Optional, Any, Dict, List
-from
+from uuid import uuid4
+from pydantic import BaseModel, Field
 from enum import Enum
 from datetime import datetime
+import time


 Input = TypeVar('Input')
@@ -33,15 +35,19 @@ class Example(BaseModel):
     tools_called: Optional[List[str]] = None
     expected_tools: Optional[List[str]] = None
     name: Optional[str] = None
-    example_id:
+    example_id: str = Field(default_factory=lambda: str(uuid4()))
+    example_index: Optional[int] = None
     timestamp: Optional[str] = None
     trace_id: Optional[str] = None

     def __init__(self, **data):
-
+        if 'example_id' not in data:
+            data['example_id'] = str(uuid4())
         # Set timestamp if not provided
-        if
-
+        if 'timestamp' not in data:
+            data['timestamp'] = datetime.now().strftime("%Y%m%d_%H%M%S")
+        super().__init__(**data)
+

     def to_dict(self):
         return {
@@ -55,6 +61,7 @@ class Example(BaseModel):
             "expected_tools": self.expected_tools,
             "name": self.name,
             "example_id": self.example_id,
+            "example_index": self.example_index,
             "timestamp": self.timestamp,
             "trace_id": self.trace_id
         }
@@ -71,6 +78,7 @@ class Example(BaseModel):
             f"expected_tools={self.expected_tools}, "
             f"name={self.name}, "
             f"example_id={self.example_id}, "
+            f"example_index={self.example_index}, "
             f"timestamp={self.timestamp}, "
             f"trace_id={self.trace_id})"
         )
judgeval/judgment_client.py
CHANGED
@@ -6,7 +6,7 @@ from typing import Optional, List, Dict, Any, Union
 import requests

 from judgeval.constants import ROOT_API
-from judgeval.data.datasets import EvalDataset, EvalDatasetClient
+from judgeval.data.datasets import EvalDataset, EvalDatasetClient, GroundTruthExample
 from judgeval.data import (
     ScoringResult,
     Example
@@ -164,6 +164,11 @@ class JudgmentClient:
         """
         return self.eval_dataset_client.pull_all_user_dataset_stats()

+    def edit_dataset(self, alias: str, examples: List[Example], ground_truths: List[GroundTruthExample]) -> bool:
+        """
+        Edits the dataset on Judgment platform by adding new examples and ground truths
+        """
+        return self.eval_dataset_client.edit_dataset(alias, examples, ground_truths)

     # Maybe add option where you can pass in the EvaluationRun object and it will pull the eval results from the backend
     def pull_eval(self, project_name: str, eval_run_name: str) -> List[Dict[str, Union[str, List[ScoringResult]]]]:
@@ -182,6 +187,10 @@ class JudgmentClient:
             eval_name=eval_run_name,
             judgment_api_key=self.judgment_api_key)
         eval_run = requests.post(JUDGMENT_EVAL_FETCH_API_URL,
+                                 headers={
+                                     "Content-Type": "application/json",
+                                     "Authorization": f"Bearer {self.judgment_api_key}"
+                                 },
                                  json=eval_run_request_body.model_dump())
         if eval_run.status_code != requests.codes.ok:
             raise ValueError(f"Error fetching eval results: {eval_run.json()}")
@@ -213,6 +222,7 @@ class JudgmentClient:
             json=eval_run_request_body.model_dump(),
             headers={
                 "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.judgment_api_key}"
             })
         if response.status_code != requests.codes.ok:
             raise ValueError(f"Error deleting eval results: {response.json()}")
@@ -235,6 +245,7 @@ class JudgmentClient:
             },
             headers={
                 "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.judgment_api_key}"
             })
         if response.status_code != requests.codes.ok:
             raise ValueError(f"Error deleting eval results: {response.json()}")
@@ -246,7 +257,11 @@ class JudgmentClient:
         """
         response = requests.post(
             f"{ROOT_API}/validate_api_key/",
-
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.judgment_api_key}",
+            },
+            json={}  # Empty body now
         )
         if response.status_code == 200:
             return True, response.json()
@@ -268,12 +283,16 @@ class JudgmentClient:
         """
         request_body = {
             "slug": slug,
-            "judgment_api_key": self.judgment_api_key
+            # "judgment_api_key": self.judgment_api_key
         }

         response = requests.post(
             f"{ROOT_API}/fetch_scorer/",
-            json=request_body
+            json=request_body,
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.judgment_api_key}"
+            }
         )

         if response.status_code == 500:
@@ -306,13 +325,17 @@ class JudgmentClient:
             "name": scorer.name,
             "conversation": scorer.conversation,
             "options": scorer.options,
-            "judgment_api_key": self.judgment_api_key,
+            # "judgment_api_key": self.judgment_api_key,
             "slug": slug
         }

         response = requests.post(
             f"{ROOT_API}/save_scorer/",
-            json=request_body
+            json=request_body,
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.judgment_api_key}"
+            }
         )

         if response.status_code == 500:
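Every request in this file converges on the same pattern: the API key is sent as a Bearer token in the headers instead of a judgment_api_key field in the JSON body (the old field is left commented out for now). A generic sketch of that pattern; post_with_bearer is a hypothetical helper, not part of the package:

import requests

def post_with_bearer(url: str, api_key: str, payload: dict) -> requests.Response:
    # The 0.0.13 convention: authenticate via header, keep the body payload-only.
    return requests.post(
        url,
        json=payload,
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}",
        },
    )

# e.g. post_with_bearer(f"{ROOT_API}/fetch_scorer/", api_key, {"slug": slug})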
judgeval/run_evaluation.py
CHANGED
@@ -47,7 +47,12 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
     try:
         # submit API request to execute evals
         payload = evaluation_run.model_dump(warnings=False)
-        response = requests.post(
+        response = requests.post(
+            JUDGMENT_EVAL_API_URL, headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {evaluation_run.judgment_api_key}"
+            },
+            json=payload)
         response_data = response.json()
     except Exception as e:
         error(f"Error: {e}")
@@ -151,6 +156,10 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
     try:
         response = requests.post(
             f"{ROOT_API}/eval-run-name-exists/",
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {judgment_api_key}"
+            },
             json={
                 "eval_name": eval_name,
                 "project_name": project_name,
@@ -188,6 +197,10 @@ def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run:
     try:
         res = requests.post(
             JUDGMENT_EVAL_LOG_API_URL,
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {evaluation_run.judgment_api_key}"
+            },
             json={
                 "results": [result.to_dict() for result in merged_results],
                 "judgment_api_key": evaluation_run.judgment_api_key,
@@ -247,12 +260,10 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
     # Set example IDs if not already set
     debug("Initializing examples with IDs and timestamps")
     for idx, example in enumerate(evaluation_run.examples):
-
-        example.example_id = idx
-        debug(f"Set example ID {idx} for input: {example.input[:50]}...")
+        example.example_index = idx  # Set numeric index
         example.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
         with example_logging_context(example.timestamp, example.example_id):
-            debug(f"Initialized example {example.example_id}")
+            debug(f"Initialized example {example.example_id} (index: {example.example_index})")
             debug(f"Input: {example.input}")
             debug(f"Actual output: {example.actual_output}")
         if example.expected_output:
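This last hunk is the counterpart to the Example changes: run_eval no longer overwrites the UUID example_id with the loop index; ordering now lives in example_index. A sketch of the resulting invariant, with illustrative examples:

from datetime import datetime
from judgeval.data import Example

examples = [Example(input="q1", actual_output="a1"),
            Example(input="q2", actual_output="a2")]

for idx, ex in enumerate(examples):
    ex.example_index = idx  # ordering, assigned per run
    ex.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    assert ex.example_id != str(idx)  # identity stays a UUID string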
{judgeval-0.0.11.dist-info → judgeval-0.0.13.dist-info}/RECORD
CHANGED
@@ -1,22 +1,22 @@
 judgeval/__init__.py,sha256=xiiG4CkeaOtey4fusCd9CBz0BVqzTIbV-K2EFIU0rUM,283
 judgeval/clients.py,sha256=Ns5ljrgPPXUMo7fSPJxO12H64lcPyKeQPIVG_RMi2cM,1162
-judgeval/constants.py,sha256=
+judgeval/constants.py,sha256=43hGesvBbX1uzc4KXvjLCVdd6cyZRMSnEJp11oA7h74,2794
 judgeval/evaluation_run.py,sha256=ev-IbL34SwRv8lwB4KHfYag1jYo6b049R8mmwNBqmnM,5923
-judgeval/judgment_client.py,sha256=
-judgeval/run_evaluation.py,sha256=
+judgeval/judgment_client.py,sha256=7vaarj6zXQmQ44m0cVCe72S4e92eZ4tK8sqNTnx4FLQ,14957
+judgeval/run_evaluation.py,sha256=vl6TcwJVH2jN60Gja1E1tPI3Jvv6YNeNMTDVTcWkqZY,20520
 judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
 judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
 judgeval/common/logger.py,sha256=QXN3UMymmKu2iMEMEgATLBnMDjGr_pE2iOSEFoICgg8,6092
-judgeval/common/tracer.py,sha256=
+judgeval/common/tracer.py,sha256=szU7mhyMIoG9EvPIb6dtxv7ix83WVuv7TtVX31FWMoQ,33582
 judgeval/common/utils.py,sha256=3WRyyX0tvnnj_VAVlEdtZrfzyWj6zfX04xdpCtE1m5Y,33736
 judgeval/data/__init__.py,sha256=YferxwmUqoBi18hrdgro0BD0h4pt20LAqISeUzGMcVU,474
 judgeval/data/api_example.py,sha256=vwWFbI6eJr5VgURCRbuSiMtEXLUbTCih_BcaqEBy-pg,4108
-judgeval/data/example.py,sha256=
+judgeval/data/example.py,sha256=r_ZA_Fq0k-1xSutSLURwj0-Ug0C_yQl4GQlqtDxbYT0,2771
 judgeval/data/result.py,sha256=8FIO-bFKPegZuByKRjA2_sumjb8oGWQ5ZeQ1RVz5z2w,4393
 judgeval/data/scorer_data.py,sha256=pYljblCPZrlMIv5Eg7R-clnmsqzUBAwokKjZpwa0DXE,3280
 judgeval/data/datasets/__init__.py,sha256=eO6ayeM_bTGwIt0eDSlTBIIBvXvIWRWWSfYZrZROPiQ,265
-judgeval/data/datasets/dataset.py,sha256=
-judgeval/data/datasets/eval_dataset_client.py,sha256=
+judgeval/data/datasets/dataset.py,sha256=6-BhkGiwMmvROxnFbefgzsFZy7wAaLi9kiTQ6p0h_xk,11928
+judgeval/data/datasets/eval_dataset_client.py,sha256=6wybPyt0BjrMQcOl3cTkcY3c9Pbm_K1fnpMiuzh56E4,11006
 judgeval/data/datasets/ground_truth.py,sha256=OTBs3VZe-Wp0vEXEsq14GPZHYtpWT16bhGQTycIvkKc,2057
 judgeval/data/datasets/utils.py,sha256=lQxyl7mevct7JcDSyIrU_8QOzT-EYPWEvoUiAeOdeek,2502
 judgeval/judges/__init__.py,sha256=tyQ5KY88Kp1Ctfw2IJxnVEpy8DnFCtmy04JdPOpp-As,339
@@ -78,7 +78,7 @@ judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarizat
 judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py,sha256=JUB3TMqS1OHr6PqpIGqkyiBNbyfUaw7lZuUATjU3_ek,168
 judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py,sha256=CYGRJY5EuyICYzHrmFdLykwXakX8AC7G3Bhj7p6szfY,5493
 judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
-judgeval-0.0.
-judgeval-0.0.
-judgeval-0.0.
-judgeval-0.0.
+judgeval-0.0.13.dist-info/METADATA,sha256=6BQFdiV0_9Oe119PBqfNnmgX1ZWXjN-_6x0q9lVvnDg,1283
+judgeval-0.0.13.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.0.13.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.0.13.dist-info/RECORD,,
{judgeval-0.0.11.dist-info → judgeval-0.0.13.dist-info}/WHEEL
File without changes
{judgeval-0.0.11.dist-info → judgeval-0.0.13.dist-info}/licenses/LICENSE.md
File without changes