judgeval 0.10.0__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +4 -4
- judgeval/api/__init__.py +22 -8
- judgeval/api/api_types.py +30 -17
- judgeval/data/evaluation_run.py +10 -11
- judgeval/data/judgment_types.py +25 -14
- judgeval/data/result.py +1 -0
- judgeval/data/scorer_data.py +1 -26
- judgeval/dataset/__init__.py +32 -8
- judgeval/env.py +11 -2
- judgeval/evaluation/__init__.py +20 -63
- judgeval/integrations/langgraph/__init__.py +2 -1
- judgeval/scorers/__init__.py +0 -4
- judgeval/scorers/agent_scorer.py +15 -15
- judgeval/scorers/api_scorer.py +0 -8
- judgeval/scorers/base_scorer.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +3 -5
- judgeval/scorers/score.py +1 -1
- judgeval/tracer/__init__.py +7 -10
- judgeval/tracer/local_eval_queue.py +11 -7
- judgeval/tracer/utils.py +2 -2
- judgeval/trainer/config.py +1 -1
- judgeval/trainer/trainable_model.py +1 -1
- judgeval/trainer/trainer.py +8 -6
- judgeval/utils/async_utils.py +7 -3
- judgeval/utils/testing.py +0 -4
- {judgeval-0.10.0.dist-info → judgeval-0.11.0.dist-info}/METADATA +1 -1
- {judgeval-0.10.0.dist-info → judgeval-0.11.0.dist-info}/RECORD +34 -35
- judgeval/data/tool.py +0 -5
- {judgeval-0.10.0.dist-info → judgeval-0.11.0.dist-info}/WHEEL +0 -0
- {judgeval-0.10.0.dist-info → judgeval-0.11.0.dist-info}/entry_points.txt +0 -0
- {judgeval-0.10.0.dist-info → judgeval-0.11.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/__init__.py
CHANGED
@@ -6,7 +6,8 @@ from judgeval.data.evaluation_run import ExampleEvaluationRun


 from typing import List, Optional, Union
-from judgeval.scorers import
+from judgeval.scorers import APIScorerConfig
+from judgeval.scorers.example_scorer import ExampleScorer
 from judgeval.data.example import Example
 from judgeval.logger import judgeval_logger
 from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_DEFAULT_GPT_MODEL, JUDGMENT_ORG_ID
@@ -38,7 +39,7 @@ class JudgmentClient(metaclass=SingletonMeta):
     def run_evaluation(
         self,
         examples: List[Example],
-        scorers: List[Union[
+        scorers: List[Union[APIScorerConfig, ExampleScorer]],
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
         model: str = JUDGMENT_DEFAULT_GPT_MODEL,
@@ -51,10 +52,9 @@ class JudgmentClient(metaclass=SingletonMeta):
             examples=examples,
             scorers=scorers,
             model=model,
-            organization_id=self.organization_id,
         )

-        results = run_eval(eval
+        results = run_eval(eval)
         if assert_test:
             assert_test_results(results)

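The net effect of this change is that JudgmentClient.run_evaluation is now typed against APIScorerConfig and ExampleScorer rather than the old scorer union, and no longer passes organization_id through to the run. A minimal usage sketch under the new signature; the FaithfulnessScorer import and the Example field names are assumptions based on the rest of the package, not part of this diff:

# Sketch: calling the re-typed run_evaluation in 0.11.0.
# Assumes JUDGMENT_API_KEY and JUDGMENT_ORG_ID are exported (required at import
# time as of this release, see the judgeval/env.py hunk at the end of this diff),
# and that FaithfulnessScorer and the Example fields below exist as named.
from judgeval import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import FaithfulnessScorer  # API scorer (APIScorerConfig-based)

client = JudgmentClient()

example = Example(
    input="What is the capital of France?",
    actual_output="Paris is the capital of France.",
)

# scorers may mix APIScorerConfig instances and custom ExampleScorer subclasses
results = client.run_evaluation(
    examples=[example],
    scorers=[FaithfulnessScorer(threshold=0.8)],
    project_name="default_project",
    eval_run_name="smoke_test",
)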
judgeval/api/__init__.py
CHANGED
@@ -137,6 +137,13 @@ class JudgmentSyncClient:
             payload,
         )

+    def datasets_pull_all_for_judgeval(self, payload: DatasetsFetch) -> Any:
+        return self._request(
+            "POST",
+            url_for("/datasets/pull_all_for_judgeval/"),
+            payload,
+        )
+
     def datasets_create_for_judgeval(self, payload: DatasetCreate) -> Any:
         return self._request(
             "POST",
@@ -174,12 +181,12 @@
             payload,
         )

-    def
-        self, payload:
-    ) ->
+    def fetch_scorers(
+        self, payload: FetchPromptScorersRequest
+    ) -> FetchPromptScorersResponse:
         return self._request(
             "POST",
-            url_for("/
+            url_for("/fetch_scorers/"),
             payload,
         )

@@ -339,6 +346,13 @@ class JudgmentAsyncClient:
             payload,
         )

+    async def datasets_pull_all_for_judgeval(self, payload: DatasetsFetch) -> Any:
+        return await self._request(
+            "POST",
+            url_for("/datasets/pull_all_for_judgeval/"),
+            payload,
+        )
+
     async def datasets_create_for_judgeval(self, payload: DatasetCreate) -> Any:
         return await self._request(
             "POST",
@@ -378,12 +392,12 @@
             payload,
         )

-    async def
-        self, payload:
-    ) ->
+    async def fetch_scorers(
+        self, payload: FetchPromptScorersRequest
+    ) -> FetchPromptScorersResponse:
         return await self._request(
             "POST",
-            url_for("/
+            url_for("/fetch_scorers/"),
             payload,
         )

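Both clients gain a datasets_pull_all_for_judgeval method that POSTs a DatasetsFetch payload to /datasets/pull_all_for_judgeval/, and the prompt-scorer fetch method is now fetch_scorers with typed request/response models. A sketch of the new dataset call; the positional (api_key, organization_id) constructor arguments are inferred from how Dataset.list builds the client elsewhere in this diff:

# Sketch: hitting the new datasets/pull_all_for_judgeval endpoint directly.
import os

from judgeval.api import JudgmentSyncClient

client = JudgmentSyncClient(
    os.environ["JUDGMENT_API_KEY"],  # api key (positional, inferred)
    os.environ["JUDGMENT_ORG_ID"],   # organization id (positional, inferred)
)

# DatasetsFetch is a TypedDict, so the payload is a plain dict
datasets = client.datasets_pull_all_for_judgeval({"project_name": "default_project"})
print(datasets)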
judgeval/api/api_types.py
CHANGED
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename: .openapi.json
-#   timestamp: 2025-09-
+#   timestamp: 2025-09-12T16:54:35+00:00

 from __future__ import annotations
 from typing import Any, Dict, List, Literal, Optional, TypedDict, Union
@@ -20,6 +20,10 @@ class DatasetFetch(TypedDict):
     project_name: str


+class DatasetsFetch(TypedDict):
+    project_name: str
+
+
 class ProjectAdd(TypedDict):
     project_name: str

@@ -57,8 +61,8 @@ class SavePromptScorerResponse(TypedDict):
     name: str


-class
-
+class FetchPromptScorersRequest(TypedDict):
+    names: NotRequired[Optional[List[str]]]


 class CustomScorerUploadPayload(TypedDict):
@@ -151,7 +155,7 @@ class ScorerData(TypedDict):
     score: NotRequired[Optional[float]]
     reason: NotRequired[Optional[str]]
     strict_mode: NotRequired[Optional[bool]]
-    evaluation_model: NotRequired[str]
+    evaluation_model: NotRequired[Optional[str]]
     error: NotRequired[Optional[str]]
     additional_metadata: NotRequired[Optional[Dict[str, Any]]]

@@ -186,13 +190,13 @@ class OtelTraceSpan(TypedDict):


 class ExampleEvaluationRun(TypedDict):
-    id: NotRequired[
-    project_name:
-    eval_name:
+    id: NotRequired[str]
+    project_name: str
+    eval_name: str
     custom_scorers: NotRequired[List[BaseScorer]]
     judgment_scorers: NotRequired[List[ScorerConfig]]
     model: str
-    created_at: NotRequired[
+    created_at: NotRequired[str]
     examples: List[Example]
     trace_span_id: NotRequired[Optional[str]]
     trace_id: NotRequired[Optional[str]]
@@ -203,13 +207,13 @@ class HTTPValidationError(TypedDict):


 class TraceEvaluationRun(TypedDict):
-    id: NotRequired[
-    project_name:
-    eval_name:
+    id: NotRequired[str]
+    project_name: str
+    eval_name: str
     custom_scorers: NotRequired[List[BaseScorer]]
     judgment_scorers: NotRequired[List[ScorerConfig]]
     model: str
-    created_at: NotRequired[
+    created_at: NotRequired[str]
     trace_and_span_ids: List[TraceAndSpanId]
     is_offline: NotRequired[bool]

@@ -226,21 +230,30 @@ class DatasetReturn(TypedDict):
     examples: NotRequired[Optional[List[Example]]]


+class DatasetInfo(TypedDict):
+    dataset_id: str
+    name: str
+    created_at: str
+    dataset_kind: DatasetKind
+    entries: int
+    creator: str
+
+
 class DatasetCreate(TypedDict):
     name: str
     dataset_kind: DatasetKind
     project_name: str
-    examples:
-    overwrite:
+    examples: List[Example]
+    overwrite: bool


-class
-
+class FetchPromptScorersResponse(TypedDict):
+    scorers: List[PromptScorer]


 class ScoringResult(TypedDict):
     success: bool
-    scorers_data:
+    scorers_data: List[ScorerData]
     name: NotRequired[Optional[str]]
     data_object: NotRequired[Optional[Union[OtelTraceSpan, Example]]]
     trace_id: NotRequired[Optional[str]]
judgeval/data/evaluation_run.py
CHANGED
@@ -1,11 +1,11 @@
 from typing import List, Optional, Union, Tuple
-from
-from pydantic import field_validator, model_validator, Field
+from pydantic import field_validator, model_validator, Field, BaseModel
 from datetime import datetime, timezone
 import uuid

 from judgeval.data import Example
-from judgeval.scorers import
+from judgeval.scorers import APIScorerConfig
+from judgeval.scorers.example_scorer import ExampleScorer
 from judgeval.constants import ACCEPTABLE_MODELS
 from judgeval.data.judgment_types import (
     ExampleEvaluationRun as ExampleEvaluationRunJudgmentType,
@@ -14,19 +14,18 @@ from judgeval.data.judgment_types import (


 class EvaluationRun(BaseModel):
-    id:
-    created_at:
+    id: str = Field(default_factory=lambda: str(uuid.uuid4()))
+    created_at: str = Field(
         default_factory=lambda: datetime.now(timezone.utc).isoformat()
     )
-
-
-
-    scorers: Optional[List[Union[BaseScorer, APIScorerConfig]]] = None
+    custom_scorers: List[ExampleScorer] = Field(default_factory=list)
+    judgment_scorers: List[APIScorerConfig] = Field(default_factory=list)
+    scorers: List[Union[ExampleScorer, APIScorerConfig]] = Field(default_factory=list)
     model: str

     def __init__(
         self,
-        scorers: Optional[List[Union[
+        scorers: Optional[List[Union[ExampleScorer, APIScorerConfig]]] = None,
         **kwargs,
     ):
         """
@@ -38,7 +37,7 @@ class EvaluationRun(BaseModel):
         """
         if scorers is not None:
             # Automatically sort scorers into appropriate fields
-            custom_scorers = [s for s in scorers if isinstance(s,
+            custom_scorers = [s for s in scorers if isinstance(s, ExampleScorer)]
             judgment_scorers = [s for s in scorers if isinstance(s, APIScorerConfig)]

             # Always set both fields as lists (even if empty) to satisfy validation
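The run now keeps custom_scorers, judgment_scorers, and scorers as plain lists and partitions a mixed scorers argument by isinstance checks against ExampleScorer and APIScorerConfig. A standalone re-statement of that sorting rule, for illustration only (it is not the library code itself):

from typing import List, Tuple, Union

from judgeval.scorers import APIScorerConfig
from judgeval.scorers.example_scorer import ExampleScorer


def partition_scorers(
    scorers: List[Union[ExampleScorer, APIScorerConfig]],
) -> Tuple[List[ExampleScorer], List[APIScorerConfig]]:
    # Mirrors EvaluationRun.__init__: ExampleScorer instances become
    # custom_scorers, APIScorerConfig instances become judgment_scorers.
    custom_scorers = [s for s in scorers if isinstance(s, ExampleScorer)]
    judgment_scorers = [s for s in scorers if isinstance(s, APIScorerConfig)]
    return custom_scorers, judgment_scorers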
judgeval/data/judgment_types.py
CHANGED
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename: .openapi.json
-#   timestamp: 2025-09-
+#   timestamp: 2025-09-12T16:54:34+00:00

 from __future__ import annotations
 from typing import Annotated, Any, Dict, List, Optional, Union
@@ -22,6 +22,10 @@ class DatasetFetch(BaseModel):
     project_name: Annotated[str, Field(title="Project Name")]


+class DatasetsFetch(BaseModel):
+    project_name: Annotated[str, Field(title="Project Name")]
+
+
 class ProjectAdd(BaseModel):
     project_name: Annotated[str, Field(title="Project Name")]

@@ -59,8 +63,8 @@ class SavePromptScorerResponse(BaseModel):
     name: Annotated[str, Field(title="Name")]


-class
-
+class FetchPromptScorersRequest(BaseModel):
+    names: Annotated[Optional[List[str]], Field(title="Names")] = None


 class CustomScorerUploadPayload(BaseModel):
@@ -210,8 +214,8 @@ class OtelTraceSpan(BaseModel):

 class ExampleEvaluationRun(BaseModel):
     id: Annotated[Optional[str], Field(title="Id")] = None
-    project_name: Annotated[
-    eval_name: Annotated[
+    project_name: Annotated[str, Field(title="Project Name")]
+    eval_name: Annotated[str, Field(title="Eval Name")]
     custom_scorers: Annotated[
         Optional[List[BaseScorer]], Field(title="Custom Scorers")
     ] = []
@@ -231,8 +235,8 @@ class HTTPValidationError(BaseModel):

 class TraceEvaluationRun(BaseModel):
     id: Annotated[Optional[str], Field(title="Id")] = None
-    project_name: Annotated[
-    eval_name: Annotated[
+    project_name: Annotated[str, Field(title="Project Name")]
+    eval_name: Annotated[str, Field(title="Eval Name")]
     custom_scorers: Annotated[
         Optional[List[BaseScorer]], Field(title="Custom Scorers")
     ] = []
@@ -259,23 +263,30 @@ class DatasetReturn(BaseModel):
     examples: Annotated[Optional[List[Example]], Field(title="Examples")] = None


+class DatasetInfo(BaseModel):
+    dataset_id: Annotated[str, Field(title="Dataset Id")]
+    name: Annotated[str, Field(title="Name")]
+    created_at: Annotated[str, Field(title="Created At")]
+    dataset_kind: DatasetKind
+    entries: Annotated[int, Field(title="Entries")]
+    creator: Annotated[str, Field(title="Creator")]
+
+
 class DatasetCreate(BaseModel):
     name: Annotated[str, Field(title="Name")]
     dataset_kind: DatasetKind
     project_name: Annotated[str, Field(title="Project Name")]
-    examples: Annotated[
-    overwrite: Annotated[
+    examples: Annotated[List[Example], Field(title="Examples")]
+    overwrite: Annotated[bool, Field(title="Overwrite")]


-class
-
+class FetchPromptScorersResponse(BaseModel):
+    scorers: Annotated[List[PromptScorer], Field(title="Scorers")]


 class ScoringResult(BaseModel):
     success: Annotated[bool, Field(title="Success")]
-    scorers_data: Annotated[
-        None
-    )
+    scorers_data: Annotated[List[ScorerData], Field(title="Scorers Data")]
     name: Annotated[Optional[str], Field(title="Name")] = None
     data_object: Annotated[
         Optional[Union[OtelTraceSpan, Example]], Field(title="Data Object")
judgeval/data/result.py
CHANGED
judgeval/data/scorer_data.py
CHANGED
@@ -6,36 +6,11 @@ ScorerData holds the information related to a single, completed Scorer evaluatio

 from __future__ import annotations

-from judgeval.data.judgment_types import ScorerData
+from judgeval.data.judgment_types import ScorerData
 from judgeval.scorers import BaseScorer
 from typing import List


-class ScorerData(JudgmentScorerData):
-    """
-    ScorerData holds the information related to a single, completed Scorer evaluation run.
-
-    For example, if running the Judgment Faithfulness scorer on an example, the ScorerData
-    object will contain whether the example passed its threshold expectation, as well as more detailed
-    information surrounding the evaluation run such as the claims and verdicts generated by the
-    judge model(s).
-    """
-
-    def to_dict(self) -> dict:
-        """Convert the ScorerData instance to a JSON-serializable dictionary."""
-        return {
-            "name": self.name,
-            "threshold": self.threshold,
-            "success": self.success,
-            "score": self.score,
-            "reason": self.reason,
-            "strict_mode": self.strict_mode,
-            "evaluation_model": self.evaluation_model,
-            "error": self.error,
-            "additional_metadata": self.additional_metadata,
-        }
-
-
 def create_scorer_data(scorer: BaseScorer) -> List[ScorerData]:
     """
     After a `scorer` is run, it contains information about the example that was evaluated
judgeval/dataset/__init__.py
CHANGED
@@ -3,7 +3,7 @@ import orjson
 import os
 import yaml
 from dataclasses import dataclass
-from typing import List, Literal
+from typing import List, Literal

 from judgeval.data import Example
 from judgeval.utils.file_utils import get_examples_from_yaml, get_examples_from_json
@@ -11,6 +11,18 @@ from judgeval.api import JudgmentSyncClient
 from judgeval.logger import judgeval_logger
 from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID

+from judgeval.api.api_types import DatasetKind
+
+
+@dataclass
+class DatasetInfo:
+    dataset_id: str
+    name: str
+    created_at: str
+    dataset_kind: DatasetKind
+    entries: int
+    creator: str
+

 @dataclass
 class Dataset:
@@ -36,13 +48,16 @@ class Dataset:
         if not dataset:
             raise ValueError(f"Dataset {name} not found in project {project_name}")
         examples = dataset.get("examples", [])
+        if examples is None:
+            examples = []
+
         for e in examples:
-            if isinstance(e, dict) and isinstance(e.get("data"), dict):
-                e.update(e.pop("data"))
+            if isinstance(e, dict) and isinstance(e.get("data", {}), dict):
+                e.update(e.pop("data"))  # type: ignore
                 e.pop(
                     "example_id"
                 )  # TODO: remove once scorer data migraiton is complete
-        judgeval_logger.info(f"
+        judgeval_logger.info(f"Successfully retrieved dataset {name}!")
         return cls(
             name=name,
             project_name=project_name,
@@ -54,7 +69,7 @@ class Dataset:
         cls,
         name: str,
         project_name: str,
-        examples:
+        examples: List[Example] = [],
         overwrite: bool = False,
     ):
         if not examples:
@@ -65,19 +80,28 @@ class Dataset:
             {
                 "name": name,
                 "project_name": project_name,
-                "examples":
+                "examples": examples,  # type: ignore
                 "dataset_kind": "example",
                 "overwrite": overwrite,
             }
         )

-        judgeval_logger.info(f"
+        judgeval_logger.info(f"Successfully created dataset {name}!")
         return cls(
             name=name,
             project_name=project_name,
             examples=examples,
         )

+    @classmethod
+    def list(cls, project_name: str):
+        client = JudgmentSyncClient(cls.judgment_api_key, cls.organization_id)
+        datasets = client.datasets_pull_all_for_judgeval({"project_name": project_name})
+
+        judgeval_logger.info(f"Fetched all datasets for project {project_name}!")
+
+        return [DatasetInfo(**dataset_info) for dataset_info in datasets]
+
     def add_from_json(self, file_path: str) -> None:
         """
         Adds examples from a JSON file.
@@ -124,7 +148,7 @@ class Dataset:
             {
                 "dataset_name": self.name,
                 "project_name": self.project_name,
-                "examples":
+                "examples": examples,  # type: ignore
             }
         )

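Besides the stricter typing on create, the dataset module gains a DatasetInfo dataclass and a Dataset.list classmethod backed by the new pull-all endpoint. A usage sketch, assuming the required environment variables are set:

# Sketch: listing every dataset in a project with the new classmethod.
from judgeval.dataset import Dataset

for info in Dataset.list(project_name="default_project"):
    # DatasetInfo fields: dataset_id, name, created_at, dataset_kind, entries, creator
    print(info.name, info.entries, info.created_at)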
judgeval/env.py
CHANGED
@@ -19,8 +19,17 @@ def optional_env_var(var_name: str, default: str | None = None) -> str | None:
     return os.getenv(var_name, default)


-
-
+def required_env_var(var_name: str) -> str:
+    value = os.getenv(var_name)
+    if value is None:
+        raise EnvironmentError(
+            f"Environment variable '{var_name}' is required but not set."
+        )
+    return value
+
+
+JUDGMENT_API_KEY = required_env_var("JUDGMENT_API_KEY")
+JUDGMENT_ORG_ID = required_env_var("JUDGMENT_ORG_ID")
 JUDGMENT_API_URL = optional_env_var("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")

 JUDGMENT_DEFAULT_GPT_MODEL = optional_env_var("JUDGMENT_DEFAULT_GPT_MODEL", "gpt-4.1")
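JUDGMENT_API_KEY and JUDGMENT_ORG_ID are now resolved through required_env_var, so importing judgeval.env (and any module that pulls it in) fails fast when they are missing instead of yielding None. A small check of that behavior, assuming nothing beyond the hunk above:

# Sketch: the new fail-fast import behavior of judgeval.env.
import os

os.environ.pop("JUDGMENT_API_KEY", None)
os.environ.pop("JUDGMENT_ORG_ID", None)

try:
    import judgeval.env  # noqa: F401
except EnvironmentError as exc:  # EnvironmentError is an alias of OSError
    print(f"expected failure: {exc}")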