judgeval 0.0.9__py3-none-any.whl → 0.0.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +50 -7
- judgeval/constants.py +1 -0
- judgeval/data/datasets/__init__.py +2 -1
- judgeval/data/datasets/dataset.py +1 -122
- judgeval/data/datasets/eval_dataset_client.py +193 -0
- judgeval/data/result.py +16 -1
- judgeval/judgment_client.py +18 -6
- judgeval/run_evaluation.py +19 -0
- {judgeval-0.0.9.dist-info → judgeval-0.0.10.dist-info}/METADATA +1 -1
- {judgeval-0.0.9.dist-info → judgeval-0.0.10.dist-info}/RECORD +12 -11
- {judgeval-0.0.9.dist-info → judgeval-0.0.10.dist-info}/WHEEL +0 -0
- {judgeval-0.0.9.dist-info → judgeval-0.0.10.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/tracer.py
CHANGED
@@ -2,6 +2,7 @@
 Tracing system for judgeval that allows for function tracing using decorators.
 """
 
+import os
 import time
 import functools
 import requests
@@ -20,6 +21,7 @@ import json
 import warnings
 from pydantic import BaseModel
 from http import HTTPStatus
+from rich import print as rprint
 
 from judgeval.constants import JUDGMENT_TRACES_SAVE_API_URL
 from judgeval.judgment_client import JudgmentClient
@@ -121,8 +123,29 @@ class TraceEntry:
 
         Handles special cases:
         - Pydantic models are converted using model_dump()
+        - We try to serialize into JSON, then string, then the base representation (__repr__)
         - Non-serializable objects return None with a warning
         """
+
+        def safe_stringify(output, function_name):
+            """
+            Safely converts an object to a string or repr, handling serialization issues gracefully.
+            """
+            try:
+                return str(output)
+            except (TypeError, OverflowError, ValueError):
+                pass
+
+            try:
+                return repr(output)
+            except (TypeError, OverflowError, ValueError):
+                pass
+
+            warnings.warn(
+                f"Output for function {function_name} is not JSON serializable and could not be converted to string. Setting to None."
+            )
+            return None
+
         if isinstance(self.output, BaseModel):
            return self.output.model_dump()
 
@@ -131,8 +154,7 @@ class TraceEntry:
            json.dumps(self.output)
            return self.output
        except (TypeError, OverflowError, ValueError):
-
-            return None
+            return safe_stringify(self.output, self.function)
 
 class TraceClient:
    """Client for managing a single trace context"""
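The change above introduces a layered fallback for outputs that are not JSON-serializable. Below is a minimal standalone sketch of that same ordering (JSON, then `str()`, then `repr()`, then `None`); `serialize_output` is an illustrative name, not the library's API.

```python
import json
import warnings

def serialize_output(output, function_name):
    """Illustrative fallback chain: JSON-serializable value, then str(), then repr(), then None."""
    try:
        json.dumps(output)
        return output  # already JSON-serializable, pass through unchanged
    except (TypeError, OverflowError, ValueError):
        pass
    try:
        return str(output)
    except (TypeError, OverflowError, ValueError):
        pass
    try:
        return repr(output)
    except (TypeError, OverflowError, ValueError):
        pass
    warnings.warn(f"Output for function {function_name} could not be serialized. Setting to None.")
    return None

print(serialize_output({"answer": 42}, "my_fn"))  # {'answer': 42} -- passes through as JSON
print(serialize_output({1, 2, 3}, "my_fn"))       # '{1, 2, 3}' -- a set is not JSON-serializable, so str() is used
```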
@@ -361,6 +383,24 @@ class TraceClient:
        raw_entries = [entry.to_dict() for entry in self.entries]
        condensed_entries = self.condense_trace(raw_entries)
 
+        # Calculate total token counts from LLM API calls
+        total_prompt_tokens = 0
+        total_completion_tokens = 0
+        total_tokens = 0
+
+        for entry in condensed_entries:
+            if entry.get("span_type") == "llm" and isinstance(entry.get("output"), dict):
+                usage = entry["output"].get("usage", {})
+                # Handle OpenAI/Together format
+                if "prompt_tokens" in usage:
+                    total_prompt_tokens += usage.get("prompt_tokens", 0)
+                    total_completion_tokens += usage.get("completion_tokens", 0)
+                # Handle Anthropic format
+                elif "input_tokens" in usage:
+                    total_prompt_tokens += usage.get("input_tokens", 0)
+                    total_completion_tokens += usage.get("output_tokens", 0)
+                total_tokens += usage.get("total_tokens", 0)
+
        # Create trace document
        trace_data = {
            "trace_id": self.trace_id,
@@ -370,10 +410,10 @@ class TraceClient:
            "created_at": datetime.fromtimestamp(self.start_time).isoformat(),
            "duration": total_duration,
            "token_counts": {
-                "prompt_tokens":
-                "completion_tokens":
-                "total_tokens":
-            },
+                "prompt_tokens": total_prompt_tokens,
+                "completion_tokens": total_completion_tokens,
+                "total_tokens": total_tokens,
+            },
            "entries": condensed_entries,
            "empty_save": empty_save,
            "overwrite": overwrite
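The new aggregation walks the condensed entries and accepts `usage` blocks in either the OpenAI/Together shape (`prompt_tokens`/`completion_tokens`) or the Anthropic shape (`input_tokens`/`output_tokens`). A standalone sketch with made-up token numbers shows how the totals come out:

```python
# Hypothetical condensed trace entries; the token numbers are made up for illustration.
entries = [
    {"span_type": "llm", "output": {"usage": {"prompt_tokens": 12, "completion_tokens": 30, "total_tokens": 42}}},
    {"span_type": "llm", "output": {"usage": {"input_tokens": 8, "output_tokens": 20, "total_tokens": 28}}},
    {"span_type": "tool", "output": "ignored: not an LLM span"},
]

total_prompt_tokens = total_completion_tokens = total_tokens = 0
for entry in entries:
    if entry.get("span_type") == "llm" and isinstance(entry.get("output"), dict):
        usage = entry["output"].get("usage", {})
        if "prompt_tokens" in usage:        # OpenAI / Together format
            total_prompt_tokens += usage.get("prompt_tokens", 0)
            total_completion_tokens += usage.get("completion_tokens", 0)
        elif "input_tokens" in usage:       # Anthropic format
            total_prompt_tokens += usage.get("input_tokens", 0)
            total_completion_tokens += usage.get("output_tokens", 0)
        total_tokens += usage.get("total_tokens", 0)

print(total_prompt_tokens, total_completion_tokens, total_tokens)  # 20 50 70
```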
@@ -393,6 +433,9 @@ class TraceClient:
        elif response.status_code != HTTPStatus.OK:
            raise ValueError(f"Failed to save trace data: {response.text}")
 
+        if not empty_save and "ui_results_url" in response.json():
+            rprint(f"\n🔍 You can view your trace data here: [rgb(106,0,255)]{response.json()['ui_results_url']}[/]\n")
+
        return self.trace_id, trace_data
 
 class Tracer:
@@ -403,7 +446,7 @@ class Tracer:
            cls._instance = super(Tracer, cls).__new__(cls)
        return cls._instance
 
-    def __init__(self, api_key: str):
+    def __init__(self, api_key: str = os.getenv("JUDGMENT_API_KEY")):
        if not hasattr(self, 'initialized'):
 
            if not api_key:
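With the new default argument, the tracer can be constructed without passing a key as long as `JUDGMENT_API_KEY` is set. A sketch of the intended usage (the key value is a placeholder); note that the default is captured when the module is imported, so the variable has to be set before the import:

```python
import os

# Normally exported in the shell; placeholder value shown here.
os.environ.setdefault("JUDGMENT_API_KEY", "<your-judgment-api-key>")

from judgeval.common.tracer import Tracer

tracer = Tracer()  # api_key now defaults to os.getenv("JUDGMENT_API_KEY")
```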
judgeval/constants.py
CHANGED
@@ -36,6 +36,7 @@ ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
 JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
 JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
 JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull/"
+JUDGMENT_DATASETS_PULL_ALL_API_URL = f"{ROOT_API}/datasets/get_all_stats/"
 JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
 JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_eval_results/"
 JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
judgeval/data/datasets/__init__.py
CHANGED
@@ -1,4 +1,5 @@
 from judgeval.data.datasets.dataset import EvalDataset
 from judgeval.data.datasets.ground_truth import GroundTruthExample
+from judgeval.data.datasets.eval_dataset_client import EvalDatasetClient
 
-__all__ = ["EvalDataset", "GroundTruthExample"]
+__all__ = ["EvalDataset", "EvalDatasetClient", "GroundTruthExample"]
judgeval/data/datasets/dataset.py
CHANGED
@@ -2,16 +2,11 @@ import ast
 import csv
 import datetime
 import json
-from rich.console import Console
-from rich.progress import Progress, SpinnerColumn, TextColumn
-import requests
 from dataclasses import dataclass, field
 import os
 from typing import List, Optional, Union, Literal
 
-from judgeval.constants import JUDGMENT_DATASETS_PUSH_API_URL, JUDGMENT_DATASETS_PULL_API_URL
 from judgeval.data.datasets.ground_truth import GroundTruthExample
-from judgeval.data.datasets.utils import ground_truths_to_examples, examples_to_ground_truths
 from judgeval.data import Example
 from judgeval.common.logger import debug, error, warning, info
 
@@ -37,120 +32,6 @@ class EvalDataset:
        self._id = None
        self.judgment_api_key = judgment_api_key
 
-    def push(self, alias: str, overwrite: Optional[bool] = False) -> bool:
-        debug(f"Pushing dataset with alias '{alias}' (overwrite={overwrite})")
-        if overwrite:
-            warning(f"Overwrite enabled for alias '{alias}'")
-        """
-        Pushes the dataset to Judgment platform
-
-        Mock request:
-        {
-            "alias": alias,
-            "ground_truths": [...],
-            "examples": [...],
-            "overwrite": overwrite
-        } ==>
-        {
-            "_alias": alias,
-            "_id": "..." # ID of the dataset
-        }
-        """
-        with Progress(
-            SpinnerColumn(style="rgb(106,0,255)"),
-            TextColumn("[progress.description]{task.description}"),
-            transient=False,
-        ) as progress:
-            task_id = progress.add_task(
-                f"Pushing [rgb(106,0,255)]'{alias}' to Judgment...",
-                total=100,
-            )
-            content = {
-                "alias": alias,
-                "ground_truths": [g.to_dict() for g in self.ground_truths],
-                "examples": [e.to_dict() for e in self.examples],
-                "overwrite": overwrite,
-                "judgment_api_key": self.judgment_api_key
-            }
-            try:
-                response = requests.post(
-                    JUDGMENT_DATASETS_PUSH_API_URL,
-                    json=content
-                )
-                if response.status_code == 500:
-                    error(f"Server error during push: {content.get('message')}")
-                    return False
-                response.raise_for_status()
-            except requests.exceptions.HTTPError as err:
-                if response.status_code == 422:
-                    error(f"Validation error during push: {err.response.json()}")
-                else:
-                    error(f"HTTP error during push: {err}")
-
-            info(f"Successfully pushed dataset with alias '{alias}'")
-            payload = response.json()
-            self._alias = payload.get("_alias")
-            self._id = payload.get("_id")
-            progress.update(
-                task_id,
-                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
-            )
-            return True
-
-    def pull(self, alias: str):
-        debug(f"Pulling dataset with alias '{alias}'")
-        """
-        Pulls the dataset from Judgment platform
-
-        Mock request:
-        {
-            "alias": alias,
-            "user_id": user_id
-        }
-        ==>
-        {
-            "ground_truths": [...],
-            "examples": [...],
-            "_alias": alias,
-            "_id": "..." # ID of the dataset
-        }
-        """
-        # Make a POST request to the Judgment API to get the dataset
-
-        with Progress(
-            SpinnerColumn(style="rgb(106,0,255)"),
-            TextColumn("[progress.description]{task.description}"),
-            transient=False,
-        ) as progress:
-            task_id = progress.add_task(
-                f"Pulling [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] from Judgment...",
-                total=100,
-            )
-            request_body = {
-                "alias": alias,
-                "judgment_api_key": self.judgment_api_key
-            }
-
-            try:
-                response = requests.post(
-                    JUDGMENT_DATASETS_PULL_API_URL,
-                    json=request_body
-                )
-                response.raise_for_status()
-            except requests.exceptions.RequestException as e:
-                error(f"Error pulling dataset: {str(e)}")
-                raise
-
-            info(f"Successfully pulled dataset with alias '{alias}'")
-            payload = response.json()
-            self.ground_truths = [GroundTruthExample(**g) for g in payload.get("ground_truths", [])]
-            self.examples = [Example(**e) for e in payload.get("examples", [])]
-            self._alias = payload.get("_alias")
-            self._id = payload.get("_id")
-            progress.update(
-                task_id,
-                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
-            )
 
    def add_from_json(self, file_path: str) -> None:
        debug(f"Loading dataset from JSON file: {file_path}")
@@ -402,6 +283,4 @@ class EvalDataset:
            f"_alias={self._alias}, "
            f"_id={self._id}"
            f")"
-        )
-
-
+        )
judgeval/data/datasets/eval_dataset_client.py
ADDED
@@ -0,0 +1,193 @@
+
+from typing import Optional
+import requests
+from rich.progress import Progress, SpinnerColumn, TextColumn
+
+from judgeval.common.logger import debug, error, warning, info
+from judgeval.constants import (
+    JUDGMENT_DATASETS_PUSH_API_URL,
+    JUDGMENT_DATASETS_PULL_API_URL,
+    JUDGMENT_DATASETS_PULL_ALL_API_URL
+)
+from judgeval.data import Example
+from judgeval.data.datasets import EvalDataset
+from judgeval.data.datasets.ground_truth import GroundTruthExample
+
+
+
+
+class EvalDatasetClient:
+    def __init__(self, judgment_api_key: str):
+        self.judgment_api_key = judgment_api_key
+
+    def create_dataset(self) -> EvalDataset:
+        return EvalDataset(judgment_api_key=self.judgment_api_key)
+
+    def push(self, dataset: EvalDataset, alias: str,overwrite: Optional[bool] = False) -> bool:
+        debug(f"Pushing dataset with alias '{alias}' (overwrite={overwrite})")
+        if overwrite:
+            warning(f"Overwrite enabled for alias '{alias}'")
+        """
+        Pushes the dataset to Judgment platform
+
+        Mock request:
+        dataset = {
+            "alias": alias,
+            "ground_truths": [...],
+            "examples": [...],
+            "overwrite": overwrite
+        } ==>
+        {
+            "_alias": alias,
+            "_id": "..." # ID of the dataset
+        }
+        """
+        with Progress(
+            SpinnerColumn(style="rgb(106,0,255)"),
+            TextColumn("[progress.description]{task.description}"),
+            transient=False,
+        ) as progress:
+            task_id = progress.add_task(
+                f"Pushing [rgb(106,0,255)]'{alias}' to Judgment...",
+                total=100,
+            )
+            content = {
+                "alias": alias,
+                "ground_truths": [g.to_dict() for g in dataset.ground_truths],
+                "examples": [e.to_dict() for e in dataset.examples],
+                "overwrite": overwrite,
+                "judgment_api_key": dataset.judgment_api_key
+            }
+            try:
+                response = requests.post(
+                    JUDGMENT_DATASETS_PUSH_API_URL,
+                    json=content
+                )
+                if response.status_code == 500:
+                    error(f"Server error during push: {content.get('message')}")
+                    return False
+                response.raise_for_status()
+            except requests.exceptions.HTTPError as err:
+                if response.status_code == 422:
+                    error(f"Validation error during push: {err.response.json()}")
+                else:
+                    error(f"HTTP error during push: {err}")
+
+            info(f"Successfully pushed dataset with alias '{alias}'")
+            payload = response.json()
+            dataset._alias = payload.get("_alias")
+            dataset._id = payload.get("_id")
+            progress.update(
+                task_id,
+                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
+            )
+            return True
+
+    def pull(self, alias: str) -> EvalDataset:
+        debug(f"Pulling dataset with alias '{alias}'")
+        """
+        Pulls the dataset from Judgment platform
+
+        Mock request:
+        {
+            "alias": alias,
+            "user_id": user_id
+        }
+        ==>
+        {
+            "ground_truths": [...],
+            "examples": [...],
+            "_alias": alias,
+            "_id": "..." # ID of the dataset
+        }
+        """
+        # Make a POST request to the Judgment API to get the dataset
+        dataset = self.create_dataset()
+
+        with Progress(
+            SpinnerColumn(style="rgb(106,0,255)"),
+            TextColumn("[progress.description]{task.description}"),
+            transient=False,
+        ) as progress:
+            task_id = progress.add_task(
+                f"Pulling [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] from Judgment...",
+                total=100,
+            )
+            request_body = {
+                "alias": alias,
+                "judgment_api_key": self.judgment_api_key
+            }
+
+            try:
+                response = requests.post(
+                    JUDGMENT_DATASETS_PULL_API_URL,
+                    json=request_body
+                )
+                response.raise_for_status()
+            except requests.exceptions.RequestException as e:
+                error(f"Error pulling dataset: {str(e)}")
+                raise
+
+            info(f"Successfully pulled dataset with alias '{alias}'")
+            payload = response.json()
+            dataset.ground_truths = [GroundTruthExample(**g) for g in payload.get("ground_truths", [])]
+            dataset.examples = [Example(**e) for e in payload.get("examples", [])]
+            dataset._alias = payload.get("_alias")
+            dataset._id = payload.get("_id")
+            progress.update(
+                task_id,
+                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
+            )
+
+            return dataset
+
+    def pull_all_user_dataset_stats(self) -> dict:
+        debug(f"Pulling user datasets stats for user_id: {self.judgment_api_key}'")
+        """
+        Pulls the user datasets stats from Judgment platform
+
+        Mock request:
+        {
+            "user_id": user_id
+        }
+        ==>
+        {
+            "test_dataset_1": {"examples_count": len(dataset1.examples), "ground_truths_count": len(dataset1.ground_truths)},
+            "test_dataset_2": {"examples_count": len(dataset2.examples), "ground_truths_count": len(dataset2.ground_truths)},
+            ...
+        }
+        """
+        # Make a POST request to the Judgment API to get the dataset
+
+        with Progress(
+            SpinnerColumn(style="rgb(106,0,255)"),
+            TextColumn("[progress.description]{task.description}"),
+            transient=False,
+        ) as progress:
+            task_id = progress.add_task(
+                f"Pulling [rgb(106,0,255)]' datasets'[/rgb(106,0,255)] from Judgment...",
+                total=100,
+            )
+            request_body = {
+                "judgment_api_key": self.judgment_api_key
+            }
+
+            try:
+                response = requests.post(
+                    JUDGMENT_DATASETS_PULL_ALL_API_URL,
+                    json=request_body
+                )
+                response.raise_for_status()
+            except requests.exceptions.RequestException as e:
+                error(f"Error pulling dataset: {str(e)}")
+                raise
+
+            info(f"Successfully pulled datasets for userid: {self.judgment_api_key}'")
+            payload = response.json()
+
+            progress.update(
+                task_id,
+                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
+            )
+
+            return payload
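A short usage sketch of the new `EvalDatasetClient`, assuming a valid Judgment API key; the alias `"my-dataset"` and the key value are placeholders:

```python
from judgeval.data.datasets import EvalDatasetClient

client = EvalDatasetClient(judgment_api_key="<your-judgment-api-key>")

dataset = client.create_dataset()                    # empty EvalDataset bound to this key
client.push(dataset, "my-dataset", overwrite=False)  # upload; returns True on success
pulled = client.pull("my-dataset")                   # EvalDataset populated with examples and ground truths
stats = client.pull_all_user_dataset_stats()         # e.g. {"my-dataset": {"examples_count": 0, "ground_truths_count": 0}, ...}
```

Note that `push`, `pull`, and `pull_all_user_dataset_stats` all POST to the Judgment API, so the sketch assumes connectivity to the configured `ROOT_API`.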
judgeval/data/result.py
CHANGED
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import List, Union, Optional
+from typing import List, Union, Optional, Dict, Any
 
 from judgeval.data import ScorerData, ProcessExample
 
@@ -18,6 +18,9 @@ class ScoringResult:
        expected_output (Optional[str]): The expected output of the example
        context (Optional[List[str]]): The context of the example
        retrieval_context (Optional[List[str]]): The retrieval context of the example
+        additional_metadata (Optional[Dict[str, Any]]): The additional metadata of the example
+        tools_called (Optional[List[str]]): The tools called by the example
+        expected_tools (Optional[List[str]]): The expected tools of the example
        trace_id (Optional[str]): The trace id of the example
 
    """
@@ -31,6 +34,9 @@ class ScoringResult:
    expected_output: Optional[str] = None
    context: Optional[List[str]] = None
    retrieval_context: Optional[List[str]] = None
+    additional_metadata: Optional[Dict[str, Any]] = None
+    tools_called: Optional[List[str]] = None
+    expected_tools: Optional[List[str]] = None
    trace_id: Optional[str] = None
 
    example_id: Optional[str] = None
@@ -46,6 +52,9 @@ class ScoringResult:
            "expected_output": self.expected_output,
            "context": self.context,
            "retrieval_context": self.retrieval_context,
+            "additional_metadata": self.additional_metadata,
+            "tools_called": self.tools_called,
+            "expected_tools": self.expected_tools,
            "trace_id": self.trace_id,
            "example_id": self.example_id
        }
@@ -59,6 +68,9 @@ class ScoringResult:
            expected_output={self.expected_output}, \
            context={self.context}, \
            retrieval_context={self.retrieval_context}, \
+            additional_metadata={self.additional_metadata}, \
+            tools_called={self.tools_called}, \
+            expected_tools={self.expected_tools}, \
            trace_id={self.trace_id})"
 
 
@@ -79,5 +91,8 @@ def generate_scoring_result(
        expected_output=process_example.expected_output,
        context=process_example.context,
        retrieval_context=process_example.retrieval_context,
+        additional_metadata=process_example.additional_metadata,
+        tools_called=process_example.tools_called,
+        expected_tools=process_example.expected_tools,
        trace_id=process_example.trace_id
    )
judgeval/judgment_client.py
CHANGED
@@ -6,7 +6,7 @@ from typing import Optional, List, Dict, Any, Union
 import requests
 
 from judgeval.constants import ROOT_API
-from judgeval.data.datasets import EvalDataset
+from judgeval.data.datasets import EvalDataset, EvalDatasetClient
 from judgeval.data import (
    ScoringResult,
    Example
@@ -36,6 +36,7 @@ class EvalRunRequestBody(BaseModel):
 class JudgmentClient:
    def __init__(self, judgment_api_key: str = os.getenv("JUDGMENT_API_KEY")):
        self.judgment_api_key = judgment_api_key
+        self.eval_dataset_client = EvalDatasetClient(judgment_api_key)
 
        # Verify API key is valid
        result, response = self._validate_api_key()
@@ -121,7 +122,7 @@ class JudgmentClient:
            raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
 
    def create_dataset(self) -> EvalDataset:
-        return
+        return self.eval_dataset_client.create_dataset()
 
    def push_dataset(self, alias: str, dataset: EvalDataset, overwrite: Optional[bool] = False) -> bool:
        """
@@ -137,7 +138,7 @@ class JudgmentClient:
        """
        # Set judgment_api_key just in case it was not set
        dataset.judgment_api_key = self.judgment_api_key
-        return
+        return self.eval_dataset_client.push(dataset, alias, overwrite)
 
    def pull_dataset(self, alias: str) -> EvalDataset:
        """
@@ -149,9 +150,20 @@ class JudgmentClient:
        Returns:
            EvalDataset: The retrieved dataset
        """
-
-
-
+        return self.eval_dataset_client.pull(alias)
+
+    def pull_all_user_dataset_stats(self) -> dict:
+        """
+        Retrieves all dataset stats from the Judgment platform for the user.
+
+        Args:
+            alias (str): The name of the dataset to retrieve
+
+        Returns:
+            EvalDataset: The retrieved dataset
+        """
+        return self.eval_dataset_client.pull_all_user_dataset_stats()
+
 
    # Maybe add option where you can pass in the EvaluationRun object and it will pull the eval results from the backend
    def pull_eval(self, project_name: str, eval_run_name: str) -> List[Dict[str, Union[str, List[ScoringResult]]]]:
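The `JudgmentClient` wrappers now delegate to the dataset client instead of returning `None`; a sketch of the resulting surface, assuming `JUDGMENT_API_KEY` is exported and valid (the client validates the key on construction), with `"my-dataset"` as a placeholder alias:

```python
from judgeval.judgment_client import JudgmentClient

client = JudgmentClient()                     # key defaults to os.getenv("JUDGMENT_API_KEY")

dataset = client.create_dataset()             # delegates to EvalDatasetClient.create_dataset
client.push_dataset("my-dataset", dataset)    # delegates to EvalDatasetClient.push
same = client.pull_dataset("my-dataset")      # delegates to EvalDatasetClient.pull
stats = client.pull_all_user_dataset_stats()  # per-dataset example / ground-truth counts
```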
judgeval/run_evaluation.py
CHANGED
@@ -97,6 +97,13 @@ def merge_results(api_results: List[ScoringResult], local_results: List[ScoringR
            raise ValueError("The API and local results are not aligned.")
        if api_result.retrieval_context != local_result.retrieval_context:
            raise ValueError("The API and local results are not aligned.")
+        if api_result.additional_metadata != local_result.additional_metadata:
+            raise ValueError("The API and local results are not aligned.")
+        if api_result.tools_called != local_result.tools_called:
+            raise ValueError("The API and local results are not aligned.")
+        if api_result.expected_tools != local_result.expected_tools:
+            raise ValueError("The API and local results are not aligned.")
+
 
        # Merge ScorerData from the API and local scorers together
        api_scorer_data = api_result.scorers_data
@@ -254,6 +261,12 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
            debug(f"Context: {example.context}")
        if example.retrieval_context:
            debug(f"Retrieval context: {example.retrieval_context}")
+        if example.additional_metadata:
+            debug(f"Additional metadata: {example.additional_metadata}")
+        if example.tools_called:
+            debug(f"Tools called: {example.tools_called}")
+        if example.expected_tools:
+            debug(f"Expected tools: {example.expected_tools}")
 
    debug(f"Starting evaluation run with {len(evaluation_run.examples)} examples")
 
@@ -379,6 +392,9 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
                'expected_output': result.expected_output,
                'context': result.context,
                'retrieval_context': result.retrieval_context,
+                'additional_metadata': result.additional_metadata,
+                'tools_called': result.tools_called,
+                'expected_tools': result.expected_tools,
                'eval_run_name': result.eval_run_name,
                'failed_scorers': []
            }
@@ -397,6 +413,9 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
            error_msg += f"Expected Output: {fail_case['expected_output']}\n"
            error_msg += f"Context: {fail_case['context']}\n"
            error_msg += f"Retrieval Context: {fail_case['retrieval_context']}\n"
+            error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
+            error_msg += f"Tools Called: {fail_case['tools_called']}\n"
+            error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"
            error_msg += f"Eval Run Name: {fail_case['eval_run_name']}\n"
 
            for fail_scorer in fail_case['failed_scorers']:
{judgeval-0.0.9.dist-info → judgeval-0.0.10.dist-info}/RECORD
CHANGED
@@ -1,21 +1,22 @@
 judgeval/__init__.py,sha256=xiiG4CkeaOtey4fusCd9CBz0BVqzTIbV-K2EFIU0rUM,283
 judgeval/clients.py,sha256=Ns5ljrgPPXUMo7fSPJxO12H64lcPyKeQPIVG_RMi2cM,1162
-judgeval/constants.py,sha256=
+judgeval/constants.py,sha256=qwWc3EOpXSn9SHq5rylkHhnzH5WldedqSMCToa7vgZk,2040
 judgeval/evaluation_run.py,sha256=KcIS7mDR_9XEdqYrJXFcrLz5IDMof34HcD5VtjZgV8w,5884
-judgeval/judgment_client.py,sha256=
-judgeval/run_evaluation.py,sha256=
+judgeval/judgment_client.py,sha256=jMeayUI-Z-GX4mVMVC9t5f7ENKLQ8dOepScYu5Yytf0,11777
+judgeval/run_evaluation.py,sha256=YOQ6s9RuUrXPTgoYexf7r6Hl1QKIMSTdvHl9kw-ZMzw,20103
 judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
 judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
 judgeval/common/logger.py,sha256=QXN3UMymmKu2iMEMEgATLBnMDjGr_pE2iOSEFoICgg8,6092
-judgeval/common/tracer.py,sha256=
+judgeval/common/tracer.py,sha256=k5g9ZLeM-fLdV_q9NpodN8gW4nLTIXsbxeTaXVjm9jk,25658
 judgeval/common/utils.py,sha256=3WRyyX0tvnnj_VAVlEdtZrfzyWj6zfX04xdpCtE1m5Y,33736
 judgeval/data/__init__.py,sha256=YferxwmUqoBi18hrdgro0BD0h4pt20LAqISeUzGMcVU,474
 judgeval/data/api_example.py,sha256=vwWFbI6eJr5VgURCRbuSiMtEXLUbTCih_BcaqEBy-pg,4108
 judgeval/data/example.py,sha256=lymGZ3jG818-r2vyFunt6OLFrhESOyJnbhao_ljTjlA,2471
-judgeval/data/result.py,sha256=
+judgeval/data/result.py,sha256=8FIO-bFKPegZuByKRjA2_sumjb8oGWQ5ZeQ1RVz5z2w,4393
 judgeval/data/scorer_data.py,sha256=pYljblCPZrlMIv5Eg7R-clnmsqzUBAwokKjZpwa0DXE,3280
-judgeval/data/datasets/__init__.py,sha256=
-judgeval/data/datasets/dataset.py,sha256=
+judgeval/data/datasets/__init__.py,sha256=eO6ayeM_bTGwIt0eDSlTBIIBvXvIWRWWSfYZrZROPiQ,265
+judgeval/data/datasets/dataset.py,sha256=AGdU21vZ4iVjqbjQ7JY-u29FzJrdDFTgdvhzvYVJNyo,11833
+judgeval/data/datasets/eval_dataset_client.py,sha256=TaCDzymGFNFjGRrieEdQB8dT8xqNPpsEi2XLGFyrJno,7113
 judgeval/data/datasets/ground_truth.py,sha256=OTBs3VZe-Wp0vEXEsq14GPZHYtpWT16bhGQTycIvkKc,2057
 judgeval/data/datasets/utils.py,sha256=lQxyl7mevct7JcDSyIrU_8QOzT-EYPWEvoUiAeOdeek,2502
 judgeval/judges/__init__.py,sha256=tyQ5KY88Kp1Ctfw2IJxnVEpy8DnFCtmy04JdPOpp-As,339
@@ -76,7 +77,7 @@ judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py
 judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py,sha256=CBuE6oCxMzTdJoXFt_YPWBte88kedEQ9t3g52ZRztGY,21086
 judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py,sha256=JUB3TMqS1OHr6PqpIGqkyiBNbyfUaw7lZuUATjU3_ek,168
 judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py,sha256=CYGRJY5EuyICYzHrmFdLykwXakX8AC7G3Bhj7p6szfY,5493
-judgeval-0.0.
-judgeval-0.0.
-judgeval-0.0.
-judgeval-0.0.
+judgeval-0.0.10.dist-info/METADATA,sha256=i9jeAPs3jY5hAHAdE_rlen4qJdEk0eAqQ0BOzMie97I,1205
+judgeval-0.0.10.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.0.10.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.0.10.dist-info/RECORD,,

{judgeval-0.0.9.dist-info → judgeval-0.0.10.dist-info}/WHEEL
File without changes

{judgeval-0.0.9.dist-info → judgeval-0.0.10.dist-info}/licenses/LICENSE.md
File without changes