judgeval 0.16.9__py3-none-any.whl → 0.22.2__py3-none-any.whl
This diff compares the contents of publicly released versions of the package as they appear in their respective public registries; it is provided for informational purposes only.
Potentially problematic release: this version of judgeval might be problematic.
- judgeval/__init__.py +32 -2
- judgeval/api/__init__.py +108 -0
- judgeval/api/api_types.py +76 -15
- judgeval/cli.py +16 -1
- judgeval/data/judgment_types.py +76 -20
- judgeval/dataset/__init__.py +11 -2
- judgeval/env.py +2 -11
- judgeval/evaluation/__init__.py +4 -0
- judgeval/prompt/__init__.py +330 -0
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +1 -13
- judgeval/tracer/__init__.py +371 -257
- judgeval/tracer/constants.py +1 -1
- judgeval/tracer/exporters/store.py +32 -16
- judgeval/tracer/keys.py +11 -9
- judgeval/tracer/llm/llm_anthropic/messages.py +38 -26
- judgeval/tracer/llm/llm_anthropic/messages_stream.py +14 -14
- judgeval/tracer/llm/llm_google/generate_content.py +9 -7
- judgeval/tracer/llm/llm_openai/beta_chat_completions.py +38 -14
- judgeval/tracer/llm/llm_openai/chat_completions.py +90 -26
- judgeval/tracer/llm/llm_openai/responses.py +88 -26
- judgeval/tracer/llm/llm_openai/utils.py +42 -0
- judgeval/tracer/llm/llm_together/chat_completions.py +26 -18
- judgeval/tracer/managers.py +4 -0
- judgeval/trainer/__init__.py +10 -1
- judgeval/trainer/base_trainer.py +122 -0
- judgeval/trainer/config.py +1 -1
- judgeval/trainer/fireworks_trainer.py +396 -0
- judgeval/trainer/trainer.py +52 -387
- judgeval/utils/guards.py +9 -5
- judgeval/utils/project.py +15 -0
- judgeval/utils/serialize.py +2 -2
- judgeval/version.py +1 -1
- {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/METADATA +2 -3
- {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/RECORD +37 -32
- {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
- {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/entry_points.txt +0 -0
- {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
judgeval/__init__.py
CHANGED

@@ -6,6 +6,7 @@ from judgeval.data.evaluation_run import ExampleEvaluationRun
 from typing import List, Optional, Union, Sequence
+import ast
 from judgeval.scorers import ExampleAPIScorerConfig
 from judgeval.scorers.example_scorer import ExampleScorer
 from judgeval.data.example import Example

@@ -81,6 +82,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         scorer_file_path: str,
         requirements_file_path: Optional[str] = None,
         unique_name: Optional[str] = None,
+        overwrite: bool = False,
     ) -> bool:
         """
         Upload custom ExampleScorer from files to backend.

@@ -89,6 +91,7 @@ class JudgmentClient(metaclass=SingletonMeta):
             scorer_file_path: Path to Python file containing CustomScorer class
             requirements_file_path: Optional path to requirements.txt
             unique_name: Optional unique identifier (auto-detected from scorer.name if not provided)
+            overwrite: Whether to overwrite existing scorer if it already exists

         Returns:
             bool: True if upload successful

@@ -111,6 +114,31 @@ class JudgmentClient(metaclass=SingletonMeta):
         with open(scorer_file_path, "r") as f:
             scorer_code = f.read()

+        try:
+            tree = ast.parse(scorer_code, filename=scorer_file_path)
+        except SyntaxError as e:
+            error_msg = f"Invalid Python syntax in {scorer_file_path}: {e}"
+            judgeval_logger.error(error_msg)
+            raise ValueError(error_msg)
+
+        scorer_classes = []
+        for node in ast.walk(tree):
+            if isinstance(node, ast.ClassDef):
+                for base in node.bases:
+                    if (isinstance(base, ast.Name) and base.id == "ExampleScorer") or (
+                        isinstance(base, ast.Attribute) and base.attr == "ExampleScorer"
+                    ):
+                        scorer_classes.append(node.name)
+
+        if len(scorer_classes) > 1:
+            error_msg = f"Multiple ExampleScorer classes found in {scorer_file_path}: {scorer_classes}. Please only upload one scorer class per file."
+            judgeval_logger.error(error_msg)
+            raise ValueError(error_msg)
+        elif len(scorer_classes) == 0:
+            error_msg = f"No ExampleScorer class was found in {scorer_file_path}. Please ensure the file contains a valid scorer class that inherits from ExampleScorer."
+            judgeval_logger.error(error_msg)
+            raise ValueError(error_msg)
+
         # Read requirements (optional)
         requirements_text = ""
         if requirements_file_path and os.path.exists(requirements_file_path):

@@ -118,6 +146,8 @@ class JudgmentClient(metaclass=SingletonMeta):
                 requirements_text = f.read()

         try:
+            if not self.api_key or not self.organization_id:
+                raise ValueError("Judgment API key and organization ID are required")
             client = JudgmentSyncClient(
                 api_key=self.api_key,
                 organization_id=self.organization_id,

@@ -127,6 +157,7 @@ class JudgmentClient(metaclass=SingletonMeta):
                     "scorer_name": unique_name,
                     "scorer_code": scorer_code,
                     "requirements_text": requirements_text,
+                    "overwrite": overwrite,
                 }
             )

@@ -139,8 +170,7 @@ class JudgmentClient(metaclass=SingletonMeta):
                 judgeval_logger.error(f"Failed to upload custom scorer: {unique_name}")
                 return False

-        except Exception as e:
-            judgeval_logger.error(f"Error uploading custom scorer: {e}")
+        except Exception:
            raise
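The new upload path statically validates the scorer file before sending it: the code above parses the file with ast.parse and requires exactly one class whose bases reference ExampleScorer, either by bare name or as an attribute. The following is a minimal standalone sketch of that check using a made-up source string; it mirrors the hunk above but is not the library's own helper.

import ast

# Hypothetical scorer source; only the base-class name matters to the check.
source = """
from judgeval.scorers.example_scorer import ExampleScorer

class MyScorer(ExampleScorer):
    pass
"""

tree = ast.parse(source, filename="my_scorer.py")
scorer_classes = [
    node.name
    for node in ast.walk(tree)
    if isinstance(node, ast.ClassDef)
    and any(
        (isinstance(base, ast.Name) and base.id == "ExampleScorer")
        or (isinstance(base, ast.Attribute) and base.attr == "ExampleScorer")
        for base in node.bases
    )
]
print(scorer_classes)  # ['MyScorer']; the upload is rejected unless exactly one class matches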
judgeval/api/__init__.py
CHANGED

@@ -189,6 +189,59 @@ class JudgmentSyncClient:
             payload,
         )

+    def prompts_insert(self, payload: PromptInsertRequest) -> PromptInsertResponse:
+        return self._request(
+            "POST",
+            url_for("/prompts/insert/"),
+            payload,
+        )
+
+    def prompts_tag(self, payload: PromptTagRequest) -> PromptTagResponse:
+        return self._request(
+            "POST",
+            url_for("/prompts/tag/"),
+            payload,
+        )
+
+    def prompts_untag(self, payload: PromptUntagRequest) -> PromptUntagResponse:
+        return self._request(
+            "POST",
+            url_for("/prompts/untag/"),
+            payload,
+        )
+
+    def prompts_fetch(
+        self,
+        project_id: str,
+        name: str,
+        commit_id: Optional[str] = None,
+        tag: Optional[str] = None,
+    ) -> PromptFetchResponse:
+        query_params = {}
+        query_params["project_id"] = project_id
+        query_params["name"] = name
+        if commit_id is not None:
+            query_params["commit_id"] = commit_id
+        if tag is not None:
+            query_params["tag"] = tag
+        return self._request(
+            "GET",
+            url_for("/prompts/fetch/"),
+            query_params,
+        )
+
+    def prompts_get_prompt_versions(
+        self, project_id: str, name: str
+    ) -> PromptVersionsResponse:
+        query_params = {}
+        query_params["project_id"] = project_id
+        query_params["name"] = name
+        return self._request(
+            "GET",
+            url_for("/prompts/get_prompt_versions/"),
+            query_params,
+        )
+
     def projects_resolve(
         self, payload: ResolveProjectNameRequest
     ) -> ResolveProjectNameResponse:

@@ -381,6 +434,61 @@ class JudgmentAsyncClient:
             payload,
         )

+    async def prompts_insert(
+        self, payload: PromptInsertRequest
+    ) -> PromptInsertResponse:
+        return await self._request(
+            "POST",
+            url_for("/prompts/insert/"),
+            payload,
+        )
+
+    async def prompts_tag(self, payload: PromptTagRequest) -> PromptTagResponse:
+        return await self._request(
+            "POST",
+            url_for("/prompts/tag/"),
+            payload,
+        )
+
+    async def prompts_untag(self, payload: PromptUntagRequest) -> PromptUntagResponse:
+        return await self._request(
+            "POST",
+            url_for("/prompts/untag/"),
+            payload,
+        )
+
+    async def prompts_fetch(
+        self,
+        project_id: str,
+        name: str,
+        commit_id: Optional[str] = None,
+        tag: Optional[str] = None,
+    ) -> PromptFetchResponse:
+        query_params = {}
+        query_params["project_id"] = project_id
+        query_params["name"] = name
+        if commit_id is not None:
+            query_params["commit_id"] = commit_id
+        if tag is not None:
+            query_params["tag"] = tag
+        return await self._request(
+            "GET",
+            url_for("/prompts/fetch/"),
+            query_params,
+        )
+
+    async def prompts_get_prompt_versions(
+        self, project_id: str, name: str
+    ) -> PromptVersionsResponse:
+        query_params = {}
+        query_params["project_id"] = project_id
+        query_params["name"] = name
+        return await self._request(
+            "GET",
+            url_for("/prompts/get_prompt_versions/"),
+            query_params,
+        )
+
     async def projects_resolve(
         self, payload: ResolveProjectNameRequest
     ) -> ResolveProjectNameResponse:
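A hedged usage sketch of the new prompt endpoints on the sync client, combining the method signatures above with the PromptInsertRequest/PromptInsertResponse shapes defined in api_types.py below. The credentials, project id, prompt text, and tag values are placeholders, not values from the package.

from judgeval.api import JudgmentSyncClient

client = JudgmentSyncClient(api_key="...", organization_id="...")  # placeholder credentials

# Insert a new prompt version; the payload follows PromptInsertRequest.
inserted = client.prompts_insert(
    {
        "project_id": "my-project-id",           # placeholder
        "name": "support-agent-system",          # placeholder prompt name
        "prompt": "You are a helpful support agent.",
        "tags": ["staging"],
    }
)

# Fetch it back by commit id (or pass tag="staging" instead).
fetched = client.prompts_fetch(
    project_id="my-project-id",
    name="support-agent-system",
    commit_id=inserted["commit_id"],
)

# List every stored version of the prompt.
versions = client.prompts_get_prompt_versions(
    project_id="my-project-id", name="support-agent-system"
)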
judgeval/api/api_types.py
CHANGED

@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 # filename: .openapi.json
-# timestamp: 2025-10-
+# timestamp: 2025-10-25T22:30:20+00:00

 from __future__ import annotations
 from typing import Any, Dict, List, Literal, Optional, TypedDict, Union

@@ -24,15 +24,6 @@ class DatasetsFetch(TypedDict):
     project_name: str


-class DatasetsTableRow(TypedDict):
-    dataset_id: str
-    name: str
-    created_at: str
-    kind: Literal["trace", "example"]
-    entries: int
-    creator: str
-
-
 class ProjectAdd(TypedDict):
     project_name: str

@@ -67,19 +58,16 @@ class SavePromptScorerRequest(TypedDict):
     description: NotRequired[Optional[str]]


-class SavePromptScorerResponse(TypedDict):
-    message: str
-    name: str
-
-
 class FetchPromptScorersRequest(TypedDict):
     names: NotRequired[Optional[List[str]]]
+    is_trace: NotRequired[Optional[bool]]


 class CustomScorerUploadPayload(TypedDict):
     scorer_name: str
     scorer_code: str
     requirements_text: str
+    overwrite: NotRequired[bool]


 class CustomScorerTemplateResponse(TypedDict):

@@ -88,6 +76,40 @@ class CustomScorerTemplateResponse(TypedDict):
     message: str


+class PromptInsertRequest(TypedDict):
+    project_id: str
+    name: str
+    prompt: str
+    tags: List[str]
+
+
+class PromptInsertResponse(TypedDict):
+    commit_id: str
+    parent_commit_id: NotRequired[Optional[str]]
+    created_at: str
+
+
+class PromptTagRequest(TypedDict):
+    project_id: str
+    name: str
+    commit_id: str
+    tags: List[str]
+
+
+class PromptTagResponse(TypedDict):
+    commit_id: str
+
+
+class PromptUntagRequest(TypedDict):
+    project_id: str
+    name: str
+    tags: List[str]
+
+
+class PromptUntagResponse(TypedDict):
+    commit_ids: List[str]
+
+
 class ResolveProjectNameRequest(TypedDict):
     project_name: str

@@ -158,6 +180,9 @@ DatasetKind = Literal["trace", "example"]


 class PromptScorer(TypedDict):
+    id: str
+    user_id: str
+    organization_id: str
     name: str
     prompt: str
     threshold: float

@@ -167,6 +192,19 @@ class PromptScorer(TypedDict):
     created_at: NotRequired[Optional[str]]
     updated_at: NotRequired[Optional[str]]
     is_trace: NotRequired[Optional[bool]]
+    is_bucket_rubric: NotRequired[Optional[bool]]
+
+
+class PromptCommitInfo(TypedDict):
+    name: str
+    prompt: str
+    tags: List[str]
+    commit_id: str
+    parent_commit_id: NotRequired[Optional[str]]
+    created_at: str
+    first_name: str
+    last_name: str
+    user_email: str


 class ScorerData(TypedDict):

@@ -245,6 +283,7 @@ class TraceEvaluationRun(TypedDict):
     created_at: NotRequired[str]
     trace_and_span_ids: List[TraceAndSpanId]
     is_offline: NotRequired[bool]
+    is_bucket_run: NotRequired[bool]


 class DatasetInsertExamples(TypedDict):

@@ -253,6 +292,15 @@ class DatasetInsertExamples(TypedDict):
     project_name: str


+class DatasetInfo(TypedDict):
+    dataset_id: str
+    name: str
+    created_at: str
+    kind: DatasetKind
+    entries: int
+    creator: str
+
+
 class DatasetCreate(TypedDict):
     name: str
     dataset_kind: DatasetKind

@@ -261,10 +309,22 @@ class DatasetCreate(TypedDict):
     overwrite: bool


+class SavePromptScorerResponse(TypedDict):
+    scorer_response: PromptScorer
+
+
 class FetchPromptScorersResponse(TypedDict):
     scorers: List[PromptScorer]


+class PromptFetchResponse(TypedDict):
+    commit: NotRequired[Optional[PromptCommitInfo]]
+
+
+class PromptVersionsResponse(TypedDict):
+    versions: List[PromptCommitInfo]
+
+
 class ScoringResult(TypedDict):
     success: bool
     scorers_data: List[ScorerData]

@@ -287,6 +347,7 @@ class OtelTraceListItem(TypedDict):
     llm_cost: NotRequired[Optional[float]]
     error: NotRequired[str]
     scores: NotRequired[List[OtelSpanListItemScores]]
+    rules_invoked: NotRequired[List[str]]
     customer_id: NotRequired[Optional[str]]
     input: NotRequired[Optional[str]]
     output: NotRequired[Optional[str]]
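Since the new request and response types are plain TypedDicts, payloads can be type-checked statically without any runtime wrapper. A small hedged sketch with placeholder values:

from judgeval.api.api_types import PromptTagRequest, PromptUntagRequest

# Point a release label at a specific commit (values are placeholders).
tag_payload: PromptTagRequest = {
    "project_id": "my-project-id",
    "name": "support-agent-system",
    "commit_id": "abc123",
    "tags": ["prod"],
}

# Remove a label from whichever commits currently carry it.
untag_payload: PromptUntagRequest = {
    "project_id": "my-project-id",
    "name": "support-agent-system",
    "tags": ["staging"],
}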
judgeval/cli.py
CHANGED

@@ -6,6 +6,7 @@ from dotenv import load_dotenv
 from judgeval.logger import judgeval_logger
 from judgeval import JudgmentClient
 from judgeval.version import get_version
+from judgeval.exceptions import JudgmentAPIError

 load_dotenv()

@@ -26,6 +27,12 @@ def upload_scorer(
     unique_name: str = typer.Option(
         None, help="Custom name for the scorer (auto-detected if not provided)"
     ),
+    overwrite: bool = typer.Option(
+        False,
+        "--overwrite",
+        "-o",
+        help="Overwrite existing scorer if it already exists",
+    ),
 ):
     # Validate file paths
     if not Path(scorer_file_path).exists():

@@ -43,14 +50,22 @@ def upload_scorer(
             scorer_file_path=scorer_file_path,
             requirements_file_path=requirements_file_path,
             unique_name=unique_name,
+            overwrite=overwrite,
         )

         if not result:
             judgeval_logger.error("Failed to upload custom scorer")
             raise typer.Exit(1)

+        judgeval_logger.info("Custom scorer uploaded successfully!")
         raise typer.Exit(0)
-    except Exception:
+    except Exception as e:
+        if isinstance(e, JudgmentAPIError) and e.status_code == 409:
+            judgeval_logger.error(
+                "Duplicate scorer detected. Use --overwrite flag to replace the existing scorer"
+            )
+            raise typer.Exit(1)
+        # Re-raise other exceptions
         raise
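The CLI change above maps a 409 from the backend (duplicate scorer) to a hint about --overwrite. The same pattern can be applied programmatically; the sketch below is an assumption about how a caller might wrap an upload entry point, and do_upload is a placeholder callable rather than a real judgeval function. Only JudgmentAPIError, its status_code attribute, and the overwrite keyword come from the diffs above.

from judgeval.exceptions import JudgmentAPIError

def upload_with_overwrite_fallback(do_upload) -> bool:
    # do_upload is any callable that performs the upload and accepts overwrite=...
    try:
        return do_upload(overwrite=False)
    except JudgmentAPIError as e:
        if e.status_code == 409:
            # Duplicate scorer on the backend: retry, explicitly replacing it,
            # which is what the CLI's --overwrite / -o flag does.
            return do_upload(overwrite=True)
        raise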
judgeval/data/judgment_types.py
CHANGED

@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 # filename: .openapi.json
-# timestamp: 2025-10-
+# timestamp: 2025-10-25T22:30:19+00:00

 from __future__ import annotations
 from typing import Annotated, Any, Dict, List, Optional, Union

@@ -26,20 +26,6 @@ class DatasetsFetch(BaseModel):
     project_name: Annotated[str, Field(title="Project Name")]


-class Kind(Enum):
-    trace = "trace"
-    example = "example"
-
-
-class DatasetsTableRow(BaseModel):
-    dataset_id: Annotated[str, Field(title="Dataset Id")]
-    name: Annotated[str, Field(title="Name")]
-    created_at: Annotated[str, Field(title="Created At")]
-    kind: Annotated[Kind, Field(title="Kind")]
-    entries: Annotated[int, Field(title="Entries")]
-    creator: Annotated[str, Field(title="Creator")]
-
-
 class ProjectAdd(BaseModel):
     project_name: Annotated[str, Field(title="Project Name")]

@@ -74,19 +60,16 @@ class SavePromptScorerRequest(BaseModel):
     description: Annotated[Optional[str], Field(title="Description")] = None


-class SavePromptScorerResponse(BaseModel):
-    message: Annotated[str, Field(title="Message")]
-    name: Annotated[str, Field(title="Name")]
-
-
 class FetchPromptScorersRequest(BaseModel):
     names: Annotated[Optional[List[str]], Field(title="Names")] = None
+    is_trace: Annotated[Optional[bool], Field(title="Is Trace")] = None


 class CustomScorerUploadPayload(BaseModel):
     scorer_name: Annotated[str, Field(title="Scorer Name")]
     scorer_code: Annotated[str, Field(title="Scorer Code")]
     requirements_text: Annotated[str, Field(title="Requirements Text")]
+    overwrite: Annotated[Optional[bool], Field(title="Overwrite")] = False


 class CustomScorerTemplateResponse(BaseModel):

@@ -95,6 +78,40 @@ class CustomScorerTemplateResponse(BaseModel):
     message: Annotated[str, Field(title="Message")]


+class PromptInsertRequest(BaseModel):
+    project_id: Annotated[str, Field(title="Project Id")]
+    name: Annotated[str, Field(title="Name")]
+    prompt: Annotated[str, Field(title="Prompt")]
+    tags: Annotated[List[str], Field(title="Tags")]
+
+
+class PromptInsertResponse(BaseModel):
+    commit_id: Annotated[str, Field(title="Commit Id")]
+    parent_commit_id: Annotated[Optional[str], Field(title="Parent Commit Id")] = None
+    created_at: Annotated[str, Field(title="Created At")]
+
+
+class PromptTagRequest(BaseModel):
+    project_id: Annotated[str, Field(title="Project Id")]
+    name: Annotated[str, Field(title="Name")]
+    commit_id: Annotated[str, Field(title="Commit Id")]
+    tags: Annotated[List[str], Field(title="Tags")]
+
+
+class PromptTagResponse(BaseModel):
+    commit_id: Annotated[str, Field(title="Commit Id")]
+
+
+class PromptUntagRequest(BaseModel):
+    project_id: Annotated[str, Field(title="Project Id")]
+    name: Annotated[str, Field(title="Name")]
+    tags: Annotated[List[str], Field(title="Tags")]
+
+
+class PromptUntagResponse(BaseModel):
+    commit_ids: Annotated[List[str], Field(title="Commit Ids")]
+
+
 class ResolveProjectNameRequest(BaseModel):
     project_name: Annotated[str, Field(title="Project Name")]

@@ -176,6 +193,9 @@ class DatasetKind(Enum):


 class PromptScorer(BaseModel):
+    id: Annotated[str, Field(title="Id")]
+    user_id: Annotated[str, Field(title="User Id")]
+    organization_id: Annotated[str, Field(title="Organization Id")]
     name: Annotated[str, Field(title="Name")]
     prompt: Annotated[str, Field(title="Prompt")]
     threshold: Annotated[float, Field(title="Threshold")]

@@ -185,6 +205,19 @@ class PromptScorer(BaseModel):
     created_at: Annotated[Optional[AwareDatetime], Field(title="Created At")] = None
     updated_at: Annotated[Optional[AwareDatetime], Field(title="Updated At")] = None
     is_trace: Annotated[Optional[bool], Field(title="Is Trace")] = False
+    is_bucket_rubric: Annotated[Optional[bool], Field(title="Is Bucket Rubric")] = None
+
+
+class PromptCommitInfo(BaseModel):
+    name: Annotated[str, Field(title="Name")]
+    prompt: Annotated[str, Field(title="Prompt")]
+    tags: Annotated[List[str], Field(title="Tags")]
+    commit_id: Annotated[str, Field(title="Commit Id")]
+    parent_commit_id: Annotated[Optional[str], Field(title="Parent Commit Id")] = None
+    created_at: Annotated[str, Field(title="Created At")]
+    first_name: Annotated[str, Field(title="First Name")]
+    last_name: Annotated[str, Field(title="Last Name")]
+    user_email: Annotated[str, Field(title="User Email")]


 class ScorerData(BaseModel):

@@ -279,6 +312,7 @@ class TraceEvaluationRun(BaseModel):
         List[TraceAndSpanId], Field(title="Trace And Span Ids")
     ]
     is_offline: Annotated[Optional[bool], Field(title="Is Offline")] = False
+    is_bucket_run: Annotated[Optional[bool], Field(title="Is Bucket Run")] = False


 class DatasetInsertExamples(BaseModel):

@@ -287,6 +321,15 @@ class DatasetInsertExamples(BaseModel):
     project_name: Annotated[str, Field(title="Project Name")]


+class DatasetInfo(BaseModel):
+    dataset_id: Annotated[str, Field(title="Dataset Id")]
+    name: Annotated[str, Field(title="Name")]
+    created_at: Annotated[str, Field(title="Created At")]
+    kind: DatasetKind
+    entries: Annotated[int, Field(title="Entries")]
+    creator: Annotated[str, Field(title="Creator")]
+
+
 class DatasetCreate(BaseModel):
     name: Annotated[str, Field(title="Name")]
     dataset_kind: DatasetKind

@@ -295,10 +338,22 @@ class DatasetCreate(BaseModel):
     overwrite: Annotated[bool, Field(title="Overwrite")]


+class SavePromptScorerResponse(BaseModel):
+    scorer_response: PromptScorer
+
+
 class FetchPromptScorersResponse(BaseModel):
     scorers: Annotated[List[PromptScorer], Field(title="Scorers")]


+class PromptFetchResponse(BaseModel):
+    commit: Optional[PromptCommitInfo] = None
+
+
+class PromptVersionsResponse(BaseModel):
+    versions: Annotated[List[PromptCommitInfo], Field(title="Versions")]
+
+
 class ScoringResult(BaseModel):
     success: Annotated[bool, Field(title="Success")]
     scorers_data: Annotated[List[ScorerData], Field(title="Scorers Data")]

@@ -325,6 +380,7 @@ class OtelTraceListItem(BaseModel):
     scores: Annotated[
         Optional[List[OtelSpanListItemScores]], Field(title="Scores")
     ] = []
+    rules_invoked: Annotated[Optional[List[str]], Field(title="Rules Invoked")] = []
     customer_id: Annotated[Optional[str], Field(title="Customer Id")] = None
     input: Annotated[Optional[str], Field(title="Input")] = None
     output: Annotated[Optional[str], Field(title="Output")] = None
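These generated Pydantic models give the prompt endpoints typed responses. A hedged sketch of parsing a raw insert response with Pydantic v2's model_validate; the dictionary values are made up:

from judgeval.data.judgment_types import PromptInsertResponse

raw = {
    "commit_id": "abc123",                      # placeholder
    "parent_commit_id": None,
    "created_at": "2025-10-25T22:30:19+00:00",  # placeholder timestamp
}
resp = PromptInsertResponse.model_validate(raw)
print(resp.commit_id, resp.parent_commit_id)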
judgeval/dataset/__init__.py
CHANGED

@@ -32,8 +32,8 @@ class Dataset:
     dataset_kind: DatasetKind = DatasetKind.example
     examples: Optional[List[Example]] = None
     traces: Optional[List[Trace]] = None
-    judgment_api_key: str = JUDGMENT_API_KEY
-    organization_id: str = JUDGMENT_ORG_ID
+    judgment_api_key: str | None = JUDGMENT_API_KEY
+    organization_id: str | None = JUDGMENT_ORG_ID

     @classmethod
     def get(

@@ -41,6 +41,8 @@ class Dataset:
         name: str,
         project_name: str,
     ):
+        if not cls.judgment_api_key or not cls.organization_id:
+            raise ValueError("Judgment API key and organization ID are required")
         client = JudgmentSyncClient(cls.judgment_api_key, cls.organization_id)
         dataset = client.datasets_pull_for_judgeval(
             {

@@ -102,6 +104,8 @@ class Dataset:
         examples: List[Example] = [],
         overwrite: bool = False,
     ):
+        if not cls.judgment_api_key or not cls.organization_id:
+            raise ValueError("Judgment API key and organization ID are required")
         if not examples:
             examples = []

@@ -125,6 +129,8 @@ class Dataset:

     @classmethod
     def list(cls, project_name: str):
+        if not cls.judgment_api_key or not cls.organization_id:
+            raise ValueError("Judgment API key and organization ID are required")
         client = JudgmentSyncClient(cls.judgment_api_key, cls.organization_id)
         datasets = client.datasets_pull_all_for_judgeval({"project_name": project_name})

@@ -173,6 +179,9 @@ class Dataset:
         if not isinstance(examples, list):
             raise TypeError("examples must be a list")

+        if not self.judgment_api_key or not self.organization_id:
+            raise ValueError("Judgment API key and organization ID are required")
+
         client = JudgmentSyncClient(self.judgment_api_key, self.organization_id)
         client.datasets_insert_examples_for_judgeval(
             {
judgeval/env.py
CHANGED

@@ -19,17 +19,8 @@ def optional_env_var(var_name: str, default: str | None = None) -> str | None:
     return os.getenv(var_name, default)


-def required_env_var(var_name: str) -> str:
-    value = os.getenv(var_name)
-    if value is None:
-        raise EnvironmentError(
-            f"Environment variable '{var_name}' is required but not set."
-        )
-    return value
-
-
-JUDGMENT_API_KEY = required_env_var("JUDGMENT_API_KEY")
-JUDGMENT_ORG_ID = required_env_var("JUDGMENT_ORG_ID")
+JUDGMENT_API_KEY = optional_env_var("JUDGMENT_API_KEY")
+JUDGMENT_ORG_ID = optional_env_var("JUDGMENT_ORG_ID")
 JUDGMENT_API_URL = optional_env_var("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")

 JUDGMENT_DEFAULT_GPT_MODEL = optional_env_var("JUDGMENT_DEFAULT_GPT_MODEL", "gpt-5")
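The practical effect of making these variables optional is that importing judgeval no longer fails when JUDGMENT_API_KEY or JUDGMENT_ORG_ID is unset; the constants are simply None, and the explicit guards added in Dataset, JudgmentClient, and the evaluation module raise a ValueError at the point of use instead. A hedged sketch of what calling code should expect, assuming no .env file supplies the values:

import os

# With the credentials unset, judgeval.env can be imported without error.
os.environ.pop("JUDGMENT_API_KEY", None)
os.environ.pop("JUDGMENT_ORG_ID", None)

from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID

print(JUDGMENT_API_KEY, JUDGMENT_ORG_ID)  # expected: None None

# API-touching entry points fail lazily instead; for example Dataset.list(...)
# or run_eval(...) now raise:
#   ValueError: Judgment API key and organization ID are required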
judgeval/evaluation/__init__.py
CHANGED

@@ -112,6 +112,8 @@ def _poll_evaluation_until_complete(

     poll_count = 0
     exception_count = 0
+    if not JUDGMENT_API_KEY or not JUDGMENT_ORG_ID:
+        raise ValueError("Judgment API key and organization ID are required")
     api_client = JudgmentSyncClient(JUDGMENT_API_KEY, JUDGMENT_ORG_ID)
     while poll_count < max_poll_count:
         poll_count += 1

@@ -222,6 +224,8 @@ def run_eval(
     )
     t.start()
     try:
+        if not JUDGMENT_API_KEY or not JUDGMENT_ORG_ID:
+            raise ValueError("Judgment API key and organization ID are required")
         api_client = JudgmentSyncClient(JUDGMENT_API_KEY, JUDGMENT_ORG_ID)
         response = api_client.add_to_run_eval_queue_examples(
             evaluation_run.model_dump(warnings=False)  # type: ignore