judgeval 0.16.9__py3-none-any.whl → 0.22.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of judgeval has been flagged as a potentially problematic release.
Files changed (37)
  1. judgeval/__init__.py +32 -2
  2. judgeval/api/__init__.py +108 -0
  3. judgeval/api/api_types.py +76 -15
  4. judgeval/cli.py +16 -1
  5. judgeval/data/judgment_types.py +76 -20
  6. judgeval/dataset/__init__.py +11 -2
  7. judgeval/env.py +2 -11
  8. judgeval/evaluation/__init__.py +4 -0
  9. judgeval/prompt/__init__.py +330 -0
  10. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +1 -13
  11. judgeval/tracer/__init__.py +371 -257
  12. judgeval/tracer/constants.py +1 -1
  13. judgeval/tracer/exporters/store.py +32 -16
  14. judgeval/tracer/keys.py +11 -9
  15. judgeval/tracer/llm/llm_anthropic/messages.py +38 -26
  16. judgeval/tracer/llm/llm_anthropic/messages_stream.py +14 -14
  17. judgeval/tracer/llm/llm_google/generate_content.py +9 -7
  18. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +38 -14
  19. judgeval/tracer/llm/llm_openai/chat_completions.py +90 -26
  20. judgeval/tracer/llm/llm_openai/responses.py +88 -26
  21. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  22. judgeval/tracer/llm/llm_together/chat_completions.py +26 -18
  23. judgeval/tracer/managers.py +4 -0
  24. judgeval/trainer/__init__.py +10 -1
  25. judgeval/trainer/base_trainer.py +122 -0
  26. judgeval/trainer/config.py +1 -1
  27. judgeval/trainer/fireworks_trainer.py +396 -0
  28. judgeval/trainer/trainer.py +52 -387
  29. judgeval/utils/guards.py +9 -5
  30. judgeval/utils/project.py +15 -0
  31. judgeval/utils/serialize.py +2 -2
  32. judgeval/version.py +1 -1
  33. {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/METADATA +2 -3
  34. {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/RECORD +37 -32
  35. {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
  36. {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/entry_points.txt +0 -0
  37. {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
judgeval/__init__.py CHANGED
@@ -6,6 +6,7 @@ from judgeval.data.evaluation_run import ExampleEvaluationRun
 
 
 from typing import List, Optional, Union, Sequence
+import ast
 from judgeval.scorers import ExampleAPIScorerConfig
 from judgeval.scorers.example_scorer import ExampleScorer
 from judgeval.data.example import Example
@@ -81,6 +82,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         scorer_file_path: str,
         requirements_file_path: Optional[str] = None,
         unique_name: Optional[str] = None,
+        overwrite: bool = False,
     ) -> bool:
         """
         Upload custom ExampleScorer from files to backend.
@@ -89,6 +91,7 @@ class JudgmentClient(metaclass=SingletonMeta):
             scorer_file_path: Path to Python file containing CustomScorer class
             requirements_file_path: Optional path to requirements.txt
             unique_name: Optional unique identifier (auto-detected from scorer.name if not provided)
+            overwrite: Whether to overwrite existing scorer if it already exists
 
         Returns:
             bool: True if upload successful
@@ -111,6 +114,31 @@ class JudgmentClient(metaclass=SingletonMeta):
         with open(scorer_file_path, "r") as f:
             scorer_code = f.read()
 
+        try:
+            tree = ast.parse(scorer_code, filename=scorer_file_path)
+        except SyntaxError as e:
+            error_msg = f"Invalid Python syntax in {scorer_file_path}: {e}"
+            judgeval_logger.error(error_msg)
+            raise ValueError(error_msg)
+
+        scorer_classes = []
+        for node in ast.walk(tree):
+            if isinstance(node, ast.ClassDef):
+                for base in node.bases:
+                    if (isinstance(base, ast.Name) and base.id == "ExampleScorer") or (
+                        isinstance(base, ast.Attribute) and base.attr == "ExampleScorer"
+                    ):
+                        scorer_classes.append(node.name)
+
+        if len(scorer_classes) > 1:
+            error_msg = f"Multiple ExampleScorer classes found in {scorer_file_path}: {scorer_classes}. Please only upload one scorer class per file."
+            judgeval_logger.error(error_msg)
+            raise ValueError(error_msg)
+        elif len(scorer_classes) == 0:
+            error_msg = f"No ExampleScorer class was found in {scorer_file_path}. Please ensure the file contains a valid scorer class that inherits from ExampleScorer."
+            judgeval_logger.error(error_msg)
+            raise ValueError(error_msg)
+
         # Read requirements (optional)
         requirements_text = ""
         if requirements_file_path and os.path.exists(requirements_file_path):
@@ -118,6 +146,8 @@ class JudgmentClient(metaclass=SingletonMeta):
                 requirements_text = f.read()
 
         try:
+            if not self.api_key or not self.organization_id:
+                raise ValueError("Judgment API key and organization ID are required")
            client = JudgmentSyncClient(
                 api_key=self.api_key,
                 organization_id=self.organization_id,
@@ -127,6 +157,7 @@ class JudgmentClient(metaclass=SingletonMeta):
                     "scorer_name": unique_name,
                     "scorer_code": scorer_code,
                     "requirements_text": requirements_text,
+                    "overwrite": overwrite,
                 }
             )
 
@@ -139,8 +170,7 @@ class JudgmentClient(metaclass=SingletonMeta):
             judgeval_logger.error(f"Failed to upload custom scorer: {unique_name}")
             return False
 
-        except Exception as e:
-            judgeval_logger.error(f"Error uploading custom scorer: {e}")
+        except Exception:
             raise
 
 
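For illustration, a scorer file that would pass this new AST check contains exactly one class whose base is named ExampleScorer. A minimal sketch follows; the class name and body are hypothetical, and ExampleScorer's interface is not part of this diff.

# Hypothetical scorer file layout that satisfies the new single-class check.
# Only the class structure matters to the AST validation added above.
from judgeval.scorers.example_scorer import ExampleScorer


class MyCustomScorer(ExampleScorer):
    # Scoring logic intentionally omitted; ExampleScorer's interface
    # is not shown in this diff, so only the class shape is illustrated.
    ...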
judgeval/api/__init__.py CHANGED
@@ -189,6 +189,59 @@ class JudgmentSyncClient:
             payload,
         )
 
+    def prompts_insert(self, payload: PromptInsertRequest) -> PromptInsertResponse:
+        return self._request(
+            "POST",
+            url_for("/prompts/insert/"),
+            payload,
+        )
+
+    def prompts_tag(self, payload: PromptTagRequest) -> PromptTagResponse:
+        return self._request(
+            "POST",
+            url_for("/prompts/tag/"),
+            payload,
+        )
+
+    def prompts_untag(self, payload: PromptUntagRequest) -> PromptUntagResponse:
+        return self._request(
+            "POST",
+            url_for("/prompts/untag/"),
+            payload,
+        )
+
+    def prompts_fetch(
+        self,
+        project_id: str,
+        name: str,
+        commit_id: Optional[str] = None,
+        tag: Optional[str] = None,
+    ) -> PromptFetchResponse:
+        query_params = {}
+        query_params["project_id"] = project_id
+        query_params["name"] = name
+        if commit_id is not None:
+            query_params["commit_id"] = commit_id
+        if tag is not None:
+            query_params["tag"] = tag
+        return self._request(
+            "GET",
+            url_for("/prompts/fetch/"),
+            query_params,
+        )
+
+    def prompts_get_prompt_versions(
+        self, project_id: str, name: str
+    ) -> PromptVersionsResponse:
+        query_params = {}
+        query_params["project_id"] = project_id
+        query_params["name"] = name
+        return self._request(
+            "GET",
+            url_for("/prompts/get_prompt_versions/"),
+            query_params,
+        )
+
     def projects_resolve(
         self, payload: ResolveProjectNameRequest
     ) -> ResolveProjectNameResponse:
@@ -381,6 +434,61 @@ class JudgmentAsyncClient:
             payload,
         )
 
+    async def prompts_insert(
+        self, payload: PromptInsertRequest
+    ) -> PromptInsertResponse:
+        return await self._request(
+            "POST",
+            url_for("/prompts/insert/"),
+            payload,
+        )
+
+    async def prompts_tag(self, payload: PromptTagRequest) -> PromptTagResponse:
+        return await self._request(
+            "POST",
+            url_for("/prompts/tag/"),
+            payload,
+        )
+
+    async def prompts_untag(self, payload: PromptUntagRequest) -> PromptUntagResponse:
+        return await self._request(
+            "POST",
+            url_for("/prompts/untag/"),
+            payload,
+        )
+
+    async def prompts_fetch(
+        self,
+        project_id: str,
+        name: str,
+        commit_id: Optional[str] = None,
+        tag: Optional[str] = None,
+    ) -> PromptFetchResponse:
+        query_params = {}
+        query_params["project_id"] = project_id
+        query_params["name"] = name
+        if commit_id is not None:
+            query_params["commit_id"] = commit_id
+        if tag is not None:
+            query_params["tag"] = tag
+        return await self._request(
+            "GET",
+            url_for("/prompts/fetch/"),
+            query_params,
+        )
+
+    async def prompts_get_prompt_versions(
+        self, project_id: str, name: str
+    ) -> PromptVersionsResponse:
+        query_params = {}
+        query_params["project_id"] = project_id
+        query_params["name"] = name
+        return await self._request(
+            "GET",
+            url_for("/prompts/get_prompt_versions/"),
+            query_params,
+        )
+
     async def projects_resolve(
         self, payload: ResolveProjectNameRequest
     ) -> ResolveProjectNameResponse:
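A rough usage sketch of the new prompt endpoints on JudgmentSyncClient follows. The keyword-argument constructor matches the usage visible in judgeval/__init__.py above; the credential, project ID, prompt name, and tag values are placeholders.

# Sketch only: placeholder credentials and identifiers, not real values.
from judgeval.api import JudgmentSyncClient

client = JudgmentSyncClient(
    api_key="<JUDGMENT_API_KEY>",
    organization_id="<JUDGMENT_ORG_ID>",
)

# Insert a prompt version; the payload mirrors PromptInsertRequest in api_types.py below.
inserted = client.prompts_insert(
    {
        "project_id": "<project-id>",
        "name": "support-agent-system",
        "prompt": "You are a helpful assistant.",
        "tags": ["staging"],
    }
)

# Fetch the commit currently tagged "staging" for the same prompt.
fetched = client.prompts_fetch(
    project_id="<project-id>",
    name="support-agent-system",
    tag="staging",
)

# List every stored version of the prompt.
versions = client.prompts_get_prompt_versions(
    project_id="<project-id>",
    name="support-agent-system",
)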
judgeval/api/api_types.py CHANGED
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 # filename: .openapi.json
-# timestamp: 2025-10-15T19:25:00+00:00
+# timestamp: 2025-10-25T22:30:20+00:00
 
 from __future__ import annotations
 from typing import Any, Dict, List, Literal, Optional, TypedDict, Union
@@ -24,15 +24,6 @@ class DatasetsFetch(TypedDict):
     project_name: str
 
 
-class DatasetsTableRow(TypedDict):
-    dataset_id: str
-    name: str
-    created_at: str
-    kind: Literal["trace", "example"]
-    entries: int
-    creator: str
-
-
 class ProjectAdd(TypedDict):
     project_name: str
 
@@ -67,19 +58,16 @@ class SavePromptScorerRequest(TypedDict):
     description: NotRequired[Optional[str]]
 
 
-class SavePromptScorerResponse(TypedDict):
-    message: str
-    name: str
-
-
 class FetchPromptScorersRequest(TypedDict):
     names: NotRequired[Optional[List[str]]]
+    is_trace: NotRequired[Optional[bool]]
 
 
 class CustomScorerUploadPayload(TypedDict):
     scorer_name: str
     scorer_code: str
     requirements_text: str
+    overwrite: NotRequired[bool]
 
 
 class CustomScorerTemplateResponse(TypedDict):
@@ -88,6 +76,40 @@ class CustomScorerTemplateResponse(TypedDict):
     message: str
 
 
+class PromptInsertRequest(TypedDict):
+    project_id: str
+    name: str
+    prompt: str
+    tags: List[str]
+
+
+class PromptInsertResponse(TypedDict):
+    commit_id: str
+    parent_commit_id: NotRequired[Optional[str]]
+    created_at: str
+
+
+class PromptTagRequest(TypedDict):
+    project_id: str
+    name: str
+    commit_id: str
+    tags: List[str]
+
+
+class PromptTagResponse(TypedDict):
+    commit_id: str
+
+
+class PromptUntagRequest(TypedDict):
+    project_id: str
+    name: str
+    tags: List[str]
+
+
+class PromptUntagResponse(TypedDict):
+    commit_ids: List[str]
+
+
 class ResolveProjectNameRequest(TypedDict):
     project_name: str
 
@@ -158,6 +180,9 @@ DatasetKind = Literal["trace", "example"]
 
 
 class PromptScorer(TypedDict):
+    id: str
+    user_id: str
+    organization_id: str
     name: str
     prompt: str
     threshold: float
@@ -167,6 +192,19 @@
     created_at: NotRequired[Optional[str]]
     updated_at: NotRequired[Optional[str]]
     is_trace: NotRequired[Optional[bool]]
+    is_bucket_rubric: NotRequired[Optional[bool]]
+
+
+class PromptCommitInfo(TypedDict):
+    name: str
+    prompt: str
+    tags: List[str]
+    commit_id: str
+    parent_commit_id: NotRequired[Optional[str]]
+    created_at: str
+    first_name: str
+    last_name: str
+    user_email: str
 
 
 class ScorerData(TypedDict):
@@ -245,6 +283,7 @@ class TraceEvaluationRun(TypedDict):
     created_at: NotRequired[str]
     trace_and_span_ids: List[TraceAndSpanId]
     is_offline: NotRequired[bool]
+    is_bucket_run: NotRequired[bool]
 
 
 class DatasetInsertExamples(TypedDict):
@@ -253,6 +292,15 @@ class DatasetInsertExamples(TypedDict):
     project_name: str
 
 
+class DatasetInfo(TypedDict):
+    dataset_id: str
+    name: str
+    created_at: str
+    kind: DatasetKind
+    entries: int
+    creator: str
+
+
 class DatasetCreate(TypedDict):
     name: str
     dataset_kind: DatasetKind
@@ -261,10 +309,22 @@ class DatasetCreate(TypedDict):
     overwrite: bool
 
 
+class SavePromptScorerResponse(TypedDict):
+    scorer_response: PromptScorer
+
+
 class FetchPromptScorersResponse(TypedDict):
     scorers: List[PromptScorer]
 
 
+class PromptFetchResponse(TypedDict):
+    commit: NotRequired[Optional[PromptCommitInfo]]
+
+
+class PromptVersionsResponse(TypedDict):
+    versions: List[PromptCommitInfo]
+
+
 class ScoringResult(TypedDict):
     success: bool
     scorers_data: List[ScorerData]
@@ -287,6 +347,7 @@ class OtelTraceListItem(TypedDict):
     llm_cost: NotRequired[Optional[float]]
     error: NotRequired[str]
     scores: NotRequired[List[OtelSpanListItemScores]]
+    rules_invoked: NotRequired[List[str]]
     customer_id: NotRequired[Optional[str]]
     input: NotRequired[Optional[str]]
     output: NotRequired[Optional[str]]
judgeval/cli.py CHANGED
@@ -6,6 +6,7 @@ from dotenv import load_dotenv
 from judgeval.logger import judgeval_logger
 from judgeval import JudgmentClient
 from judgeval.version import get_version
+from judgeval.exceptions import JudgmentAPIError
 
 load_dotenv()
 
@@ -26,6 +27,12 @@ def upload_scorer(
     unique_name: str = typer.Option(
         None, help="Custom name for the scorer (auto-detected if not provided)"
     ),
+    overwrite: bool = typer.Option(
+        False,
+        "--overwrite",
+        "-o",
+        help="Overwrite existing scorer if it already exists",
+    ),
 ):
     # Validate file paths
     if not Path(scorer_file_path).exists():
@@ -43,14 +50,22 @@
             scorer_file_path=scorer_file_path,
             requirements_file_path=requirements_file_path,
             unique_name=unique_name,
+            overwrite=overwrite,
         )
 
         if not result:
            judgeval_logger.error("Failed to upload custom scorer")
            raise typer.Exit(1)
 
+        judgeval_logger.info("Custom scorer uploaded successfully!")
         raise typer.Exit(0)
-    except Exception:
+    except Exception as e:
+        if isinstance(e, JudgmentAPIError) and e.status_code == 409:
+            judgeval_logger.error(
+                "Duplicate scorer detected. Use --overwrite flag to replace the existing scorer"
+            )
+            raise typer.Exit(1)
+        # Re-raise other exceptions
         raise
 
 
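The same conflict handling can be applied by callers of the SDK directly. A sketch follows, under the assumption (taken from cli.py above) that JudgmentAPIError exposes a status_code attribute; the helper name is hypothetical.

# Sketch of the 409-conflict pattern the CLI adopts above.
from judgeval.exceptions import JudgmentAPIError


def upload_or_hint(upload_fn) -> bool:
    try:
        return upload_fn()
    except JudgmentAPIError as e:
        if e.status_code == 409:
            # 409 Conflict: a scorer with this name already exists on the backend.
            print("Duplicate scorer detected; re-run with overwrite enabled to replace it.")
            return False
        raise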
judgeval/data/judgment_types.py CHANGED
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 # filename: .openapi.json
-# timestamp: 2025-10-15T19:24:59+00:00
+# timestamp: 2025-10-25T22:30:19+00:00
 
 from __future__ import annotations
 from typing import Annotated, Any, Dict, List, Optional, Union
@@ -26,20 +26,6 @@ class DatasetsFetch(BaseModel):
     project_name: Annotated[str, Field(title="Project Name")]
 
 
-class Kind(Enum):
-    trace = "trace"
-    example = "example"
-
-
-class DatasetsTableRow(BaseModel):
-    dataset_id: Annotated[str, Field(title="Dataset Id")]
-    name: Annotated[str, Field(title="Name")]
-    created_at: Annotated[str, Field(title="Created At")]
-    kind: Annotated[Kind, Field(title="Kind")]
-    entries: Annotated[int, Field(title="Entries")]
-    creator: Annotated[str, Field(title="Creator")]
-
-
 class ProjectAdd(BaseModel):
     project_name: Annotated[str, Field(title="Project Name")]
 
@@ -74,19 +60,16 @@ class SavePromptScorerRequest(BaseModel):
     description: Annotated[Optional[str], Field(title="Description")] = None
 
 
-class SavePromptScorerResponse(BaseModel):
-    message: Annotated[str, Field(title="Message")]
-    name: Annotated[str, Field(title="Name")]
-
-
 class FetchPromptScorersRequest(BaseModel):
     names: Annotated[Optional[List[str]], Field(title="Names")] = None
+    is_trace: Annotated[Optional[bool], Field(title="Is Trace")] = None
 
 
 class CustomScorerUploadPayload(BaseModel):
     scorer_name: Annotated[str, Field(title="Scorer Name")]
     scorer_code: Annotated[str, Field(title="Scorer Code")]
     requirements_text: Annotated[str, Field(title="Requirements Text")]
+    overwrite: Annotated[Optional[bool], Field(title="Overwrite")] = False
 
 
 class CustomScorerTemplateResponse(BaseModel):
@@ -95,6 +78,40 @@ class CustomScorerTemplateResponse(BaseModel):
     message: Annotated[str, Field(title="Message")]
 
 
+class PromptInsertRequest(BaseModel):
+    project_id: Annotated[str, Field(title="Project Id")]
+    name: Annotated[str, Field(title="Name")]
+    prompt: Annotated[str, Field(title="Prompt")]
+    tags: Annotated[List[str], Field(title="Tags")]
+
+
+class PromptInsertResponse(BaseModel):
+    commit_id: Annotated[str, Field(title="Commit Id")]
+    parent_commit_id: Annotated[Optional[str], Field(title="Parent Commit Id")] = None
+    created_at: Annotated[str, Field(title="Created At")]
+
+
+class PromptTagRequest(BaseModel):
+    project_id: Annotated[str, Field(title="Project Id")]
+    name: Annotated[str, Field(title="Name")]
+    commit_id: Annotated[str, Field(title="Commit Id")]
+    tags: Annotated[List[str], Field(title="Tags")]
+
+
+class PromptTagResponse(BaseModel):
+    commit_id: Annotated[str, Field(title="Commit Id")]
+
+
+class PromptUntagRequest(BaseModel):
+    project_id: Annotated[str, Field(title="Project Id")]
+    name: Annotated[str, Field(title="Name")]
+    tags: Annotated[List[str], Field(title="Tags")]
+
+
+class PromptUntagResponse(BaseModel):
+    commit_ids: Annotated[List[str], Field(title="Commit Ids")]
+
+
 class ResolveProjectNameRequest(BaseModel):
     project_name: Annotated[str, Field(title="Project Name")]
 
@@ -176,6 +193,9 @@ class DatasetKind(Enum):
 
 
 class PromptScorer(BaseModel):
+    id: Annotated[str, Field(title="Id")]
+    user_id: Annotated[str, Field(title="User Id")]
+    organization_id: Annotated[str, Field(title="Organization Id")]
     name: Annotated[str, Field(title="Name")]
     prompt: Annotated[str, Field(title="Prompt")]
     threshold: Annotated[float, Field(title="Threshold")]
@@ -185,6 +205,19 @@
     created_at: Annotated[Optional[AwareDatetime], Field(title="Created At")] = None
     updated_at: Annotated[Optional[AwareDatetime], Field(title="Updated At")] = None
     is_trace: Annotated[Optional[bool], Field(title="Is Trace")] = False
+    is_bucket_rubric: Annotated[Optional[bool], Field(title="Is Bucket Rubric")] = None
+
+
+class PromptCommitInfo(BaseModel):
+    name: Annotated[str, Field(title="Name")]
+    prompt: Annotated[str, Field(title="Prompt")]
+    tags: Annotated[List[str], Field(title="Tags")]
+    commit_id: Annotated[str, Field(title="Commit Id")]
+    parent_commit_id: Annotated[Optional[str], Field(title="Parent Commit Id")] = None
+    created_at: Annotated[str, Field(title="Created At")]
+    first_name: Annotated[str, Field(title="First Name")]
+    last_name: Annotated[str, Field(title="Last Name")]
+    user_email: Annotated[str, Field(title="User Email")]
 
 
 class ScorerData(BaseModel):
@@ -279,6 +312,7 @@ class TraceEvaluationRun(BaseModel):
         List[TraceAndSpanId], Field(title="Trace And Span Ids")
     ]
     is_offline: Annotated[Optional[bool], Field(title="Is Offline")] = False
+    is_bucket_run: Annotated[Optional[bool], Field(title="Is Bucket Run")] = False
 
 
 class DatasetInsertExamples(BaseModel):
@@ -287,6 +321,15 @@ class DatasetInsertExamples(BaseModel):
     project_name: Annotated[str, Field(title="Project Name")]
 
 
+class DatasetInfo(BaseModel):
+    dataset_id: Annotated[str, Field(title="Dataset Id")]
+    name: Annotated[str, Field(title="Name")]
+    created_at: Annotated[str, Field(title="Created At")]
+    kind: DatasetKind
+    entries: Annotated[int, Field(title="Entries")]
+    creator: Annotated[str, Field(title="Creator")]
+
+
 class DatasetCreate(BaseModel):
     name: Annotated[str, Field(title="Name")]
     dataset_kind: DatasetKind
@@ -295,10 +338,22 @@ class DatasetCreate(BaseModel):
     overwrite: Annotated[bool, Field(title="Overwrite")]
 
 
+class SavePromptScorerResponse(BaseModel):
+    scorer_response: PromptScorer
+
+
 class FetchPromptScorersResponse(BaseModel):
     scorers: Annotated[List[PromptScorer], Field(title="Scorers")]
 
 
+class PromptFetchResponse(BaseModel):
+    commit: Optional[PromptCommitInfo] = None
+
+
+class PromptVersionsResponse(BaseModel):
+    versions: Annotated[List[PromptCommitInfo], Field(title="Versions")]
+
+
 class ScoringResult(BaseModel):
     success: Annotated[bool, Field(title="Success")]
     scorers_data: Annotated[List[ScorerData], Field(title="Scorers Data")]
@@ -325,6 +380,7 @@ class OtelTraceListItem(BaseModel):
     scores: Annotated[
         Optional[List[OtelSpanListItemScores]], Field(title="Scores")
     ] = []
+    rules_invoked: Annotated[Optional[List[str]], Field(title="Rules Invoked")] = []
     customer_id: Annotated[Optional[str], Field(title="Customer Id")] = None
     input: Annotated[Optional[str], Field(title="Input")] = None
     output: Annotated[Optional[str], Field(title="Output")] = None
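As a quick illustration, the new pydantic models can be instantiated directly; the field values below are placeholders.

# Placeholder instantiation of one of the new models in judgeval/data/judgment_types.py,
# shown only to illustrate the field shapes.
from judgeval.data.judgment_types import PromptInsertResponse

resp = PromptInsertResponse(
    commit_id="abc123",                      # placeholder commit id
    created_at="2025-10-25T22:30:19+00:00",  # created_at is a plain string field
)
assert resp.parent_commit_id is None  # optional parent commit defaults to None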
judgeval/dataset/__init__.py CHANGED
@@ -32,8 +32,8 @@ class Dataset:
     dataset_kind: DatasetKind = DatasetKind.example
     examples: Optional[List[Example]] = None
     traces: Optional[List[Trace]] = None
-    judgment_api_key: str = JUDGMENT_API_KEY or ""
-    organization_id: str = JUDGMENT_ORG_ID or ""
+    judgment_api_key: str | None = JUDGMENT_API_KEY
+    organization_id: str | None = JUDGMENT_ORG_ID
 
     @classmethod
     def get(
@@ -41,6 +41,8 @@ class Dataset:
         name: str,
         project_name: str,
     ):
+        if not cls.judgment_api_key or not cls.organization_id:
+            raise ValueError("Judgment API key and organization ID are required")
         client = JudgmentSyncClient(cls.judgment_api_key, cls.organization_id)
         dataset = client.datasets_pull_for_judgeval(
             {
@@ -102,6 +104,8 @@ class Dataset:
         examples: List[Example] = [],
         overwrite: bool = False,
     ):
+        if not cls.judgment_api_key or not cls.organization_id:
+            raise ValueError("Judgment API key and organization ID are required")
         if not examples:
             examples = []
 
@@ -125,6 +129,8 @@ class Dataset:
 
     @classmethod
     def list(cls, project_name: str):
+        if not cls.judgment_api_key or not cls.organization_id:
+            raise ValueError("Judgment API key and organization ID are required")
         client = JudgmentSyncClient(cls.judgment_api_key, cls.organization_id)
         datasets = client.datasets_pull_all_for_judgeval({"project_name": project_name})
 
@@ -173,6 +179,9 @@ class Dataset:
         if not isinstance(examples, list):
             raise TypeError("examples must be a list")
 
+        if not self.judgment_api_key or not self.organization_id:
+            raise ValueError("Judgment API key and organization ID are required")
+
         client = JudgmentSyncClient(self.judgment_api_key, self.organization_id)
         client.datasets_insert_examples_for_judgeval(
             {
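Taken together with the env.py change that follows, credentials are no longer required at import time; the check moves to call time. A small sketch, with a placeholder project name:

# With JUDGMENT_API_KEY / JUDGMENT_ORG_ID unset, importing judgeval no longer
# fails at import time; Dataset methods raise ValueError when actually called.
from judgeval.dataset import Dataset

try:
    Dataset.list(project_name="my-project")  # placeholder project name
except ValueError as err:
    print(err)  # "Judgment API key and organization ID are required"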
judgeval/env.py CHANGED
@@ -19,17 +19,8 @@ def optional_env_var(var_name: str, default: str | None = None) -> str | None:
     return os.getenv(var_name, default)
 
 
-def required_env_var(var_name: str) -> str:
-    value = os.getenv(var_name)
-    if value is None:
-        raise EnvironmentError(
-            f"Environment variable '{var_name}' is required but not set."
-        )
-    return value
-
-
-JUDGMENT_API_KEY = required_env_var("JUDGMENT_API_KEY")
-JUDGMENT_ORG_ID = required_env_var("JUDGMENT_ORG_ID")
+JUDGMENT_API_KEY = optional_env_var("JUDGMENT_API_KEY")
+JUDGMENT_ORG_ID = optional_env_var("JUDGMENT_ORG_ID")
 JUDGMENT_API_URL = optional_env_var("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
 
 JUDGMENT_DEFAULT_GPT_MODEL = optional_env_var("JUDGMENT_DEFAULT_GPT_MODEL", "gpt-5")
judgeval/evaluation/__init__.py CHANGED
@@ -112,6 +112,8 @@ def _poll_evaluation_until_complete(
 
     poll_count = 0
    exception_count = 0
+    if not JUDGMENT_API_KEY or not JUDGMENT_ORG_ID:
+        raise ValueError("Judgment API key and organization ID are required")
     api_client = JudgmentSyncClient(JUDGMENT_API_KEY, JUDGMENT_ORG_ID)
     while poll_count < max_poll_count:
         poll_count += 1
@@ -222,6 +224,8 @@ def run_eval(
     )
     t.start()
     try:
+        if not JUDGMENT_API_KEY or not JUDGMENT_ORG_ID:
+            raise ValueError("Judgment API key and organization ID are required")
         api_client = JudgmentSyncClient(JUDGMENT_API_KEY, JUDGMENT_ORG_ID)
         response = api_client.add_to_run_eval_queue_examples(
             evaluation_run.model_dump(warnings=False)  # type: ignore