judgeval 0.10.1__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +5 -5
- judgeval/api/__init__.py +17 -9
- judgeval/api/api_types.py +20 -18
- judgeval/data/evaluation_run.py +13 -12
- judgeval/data/judgment_types.py +25 -14
- judgeval/data/result.py +1 -0
- judgeval/data/scorer_data.py +1 -26
- judgeval/dataset/__init__.py +17 -16
- judgeval/env.py +11 -2
- judgeval/evaluation/__init__.py +20 -63
- judgeval/integrations/langgraph/__init__.py +2 -1
- judgeval/scorers/__init__.py +2 -0
- judgeval/scorers/agent_scorer.py +15 -15
- judgeval/scorers/base_scorer.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +1 -1
- judgeval/scorers/score.py +1 -1
- judgeval/tracer/__init__.py +6 -9
- judgeval/tracer/local_eval_queue.py +11 -7
- judgeval/trainer/config.py +1 -1
- judgeval/trainer/trainable_model.py +1 -1
- judgeval/trainer/trainer.py +8 -6
- judgeval/utils/async_utils.py +7 -3
- judgeval/utils/testing.py +0 -4
- judgeval/version.py +1 -1
- {judgeval-0.10.1.dist-info → judgeval-0.12.0.dist-info}/METADATA +1 -1
- {judgeval-0.10.1.dist-info → judgeval-0.12.0.dist-info}/RECORD +29 -30
- judgeval/data/tool.py +0 -5
- {judgeval-0.10.1.dist-info → judgeval-0.12.0.dist-info}/WHEEL +0 -0
- {judgeval-0.10.1.dist-info → judgeval-0.12.0.dist-info}/entry_points.txt +0 -0
- {judgeval-0.10.1.dist-info → judgeval-0.12.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/__init__.py
CHANGED
@@ -5,8 +5,9 @@ from judgeval.evaluation import run_eval
  from judgeval.data.evaluation_run import ExampleEvaluationRun
- from typing import List, Optional, Union
- from judgeval.scorers import …
+ from typing import List, Optional, Union, Sequence
+ from judgeval.scorers import ExampleAPIScorerConfig
+ from judgeval.scorers.example_scorer import ExampleScorer
  from judgeval.data.example import Example
  from judgeval.logger import judgeval_logger
  from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_DEFAULT_GPT_MODEL, JUDGMENT_ORG_ID
@@ -38,7 +39,7 @@ class JudgmentClient(metaclass=SingletonMeta):
      def run_evaluation(
          self,
          examples: List[Example],
-         scorers: …
+         scorers: Sequence[Union[ExampleAPIScorerConfig, ExampleScorer]],
          project_name: str = "default_project",
          eval_run_name: str = "default_eval_run",
          model: str = JUDGMENT_DEFAULT_GPT_MODEL,
@@ -51,10 +52,9 @@
              examples=examples,
              scorers=scorers,
              model=model,
-             organization_id=self.organization_id,
          )

-         results = run_eval(eval…
+         results = run_eval(eval)
          if assert_test:
              assert_test_results(results)
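With this change, `run_evaluation` accepts any `Sequence` mixing hosted `ExampleAPIScorerConfig` entries and local `ExampleScorer` subclasses, and `run_eval` no longer takes the API key as a separate argument. A rough sketch of a call against the new signature; the `FaithfulnessScorer()` construction and the `Example` fields are assumptions for illustration, not taken from this diff:

    from judgeval import JudgmentClient
    from judgeval.data.example import Example
    from judgeval.scorers import FaithfulnessScorer

    client = JudgmentClient()
    results = client.run_evaluation(
        examples=[Example(input="What is 2+2?", actual_output="4")],  # assumed Example fields
        scorers=[FaithfulnessScorer()],  # Sequence[Union[ExampleAPIScorerConfig, ExampleScorer]]
        project_name="default_project",
        eval_run_name="default_eval_run",
    )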
judgeval/api/__init__.py
CHANGED
@@ -137,12 +137,13 @@ class JudgmentSyncClient:
              payload,
          )

-     def datasets_pull_all_for_judgeval(self, payload: DatasetsFetch) -> …
+     def datasets_pull_all_for_judgeval(self, payload: DatasetsFetch) -> Any:
          return self._request(
              "POST",
              url_for("/datasets/pull_all_for_judgeval/"),
              payload,
          )
+
      def datasets_create_for_judgeval(self, payload: DatasetCreate) -> Any:
          return self._request(
              "POST",
@@ -180,12 +181,12 @@
              payload,
          )

-     def …
-         self, payload: …
-     ) -> …
+     def fetch_scorers(
+         self, payload: FetchPromptScorersRequest
+     ) -> FetchPromptScorersResponse:
          return self._request(
              "POST",
-             url_for("/…
+             url_for("/fetch_scorers/"),
              payload,
          )

@@ -345,6 +346,13 @@ class JudgmentAsyncClient:
              payload,
          )

+     async def datasets_pull_all_for_judgeval(self, payload: DatasetsFetch) -> Any:
+         return await self._request(
+             "POST",
+             url_for("/datasets/pull_all_for_judgeval/"),
+             payload,
+         )
+
      async def datasets_create_for_judgeval(self, payload: DatasetCreate) -> Any:
          return await self._request(
              "POST",
@@ -384,12 +392,12 @@
              payload,
          )

-     async def …
-         self, payload: …
-     ) -> …
+     async def fetch_scorers(
+         self, payload: FetchPromptScorersRequest
+     ) -> FetchPromptScorersResponse:
          return await self._request(
              "POST",
-             url_for("/…
+             url_for("/fetch_scorers/"),
              payload,
          )
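A short sketch of the new and renamed sync-client endpoints shown above; the credential strings and the scorer name are placeholders:

    from judgeval.api import JudgmentSyncClient

    client = JudgmentSyncClient("<api-key>", "<org-id>")  # placeholder credentials
    # Added in this release: pull every dataset for a project.
    datasets = client.datasets_pull_all_for_judgeval({"project_name": "default_project"})
    # Renamed endpoint: fetch prompt scorer configs by name.
    scorers = client.fetch_scorers({"names": ["my_prompt_scorer"]})["scorers"]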
judgeval/api/api_types.py
CHANGED
@@ -1,6 +1,6 @@
  # generated by datamodel-codegen:
  # filename: .openapi.json
- # timestamp: 2025-09-…
+ # timestamp: 2025-09-12T16:54:35+00:00

  from __future__ import annotations
  from typing import Any, Dict, List, Literal, Optional, TypedDict, Union
@@ -19,6 +19,7 @@ class DatasetFetch(TypedDict):
      dataset_name: str
      project_name: str

+
  class DatasetsFetch(TypedDict):
      project_name: str

@@ -60,8 +61,8 @@ class SavePromptScorerResponse(TypedDict):
      name: str


- class …
- …
+ class FetchPromptScorersRequest(TypedDict):
+     names: NotRequired[Optional[List[str]]]


  class CustomScorerUploadPayload(TypedDict):
@@ -154,7 +155,7 @@ class ScorerData(TypedDict):
      score: NotRequired[Optional[float]]
      reason: NotRequired[Optional[str]]
      strict_mode: NotRequired[Optional[bool]]
-     evaluation_model: NotRequired[str]
+     evaluation_model: NotRequired[Optional[str]]
      error: NotRequired[Optional[str]]
      additional_metadata: NotRequired[Optional[Dict[str, Any]]]

@@ -189,13 +190,13 @@ class OtelTraceSpan(TypedDict):


  class ExampleEvaluationRun(TypedDict):
-     id: NotRequired[…
-     project_name: …
-     eval_name: …
+     id: NotRequired[str]
+     project_name: str
+     eval_name: str
      custom_scorers: NotRequired[List[BaseScorer]]
      judgment_scorers: NotRequired[List[ScorerConfig]]
      model: str
-     created_at: NotRequired[…
+     created_at: NotRequired[str]
      examples: List[Example]
      trace_span_id: NotRequired[Optional[str]]
      trace_id: NotRequired[Optional[str]]
@@ -206,13 +207,13 @@ class HTTPValidationError(TypedDict):


  class TraceEvaluationRun(TypedDict):
-     id: NotRequired[…
-     project_name: …
-     eval_name: …
+     id: NotRequired[str]
+     project_name: str
+     eval_name: str
      custom_scorers: NotRequired[List[BaseScorer]]
      judgment_scorers: NotRequired[List[ScorerConfig]]
      model: str
-     created_at: NotRequired[…
+     created_at: NotRequired[str]
      trace_and_span_ids: List[TraceAndSpanId]
      is_offline: NotRequired[bool]

@@ -228,30 +229,31 @@ class DatasetReturn(TypedDict):
      project_name: str
      examples: NotRequired[Optional[List[Example]]]

+
  class DatasetInfo(TypedDict):
      dataset_id: str
      name: str
      created_at: str
      dataset_kind: DatasetKind
      entries: int
-     creator: str
+     creator: str


  class DatasetCreate(TypedDict):
      name: str
      dataset_kind: DatasetKind
      project_name: str
-     examples: …
-     overwrite: …
+     examples: List[Example]
+     overwrite: bool


- class …
- …
+ class FetchPromptScorersResponse(TypedDict):
+     scorers: List[PromptScorer]


  class ScoringResult(TypedDict):
      success: bool
-     scorers_data: …
+     scorers_data: List[ScorerData]
      name: NotRequired[Optional[str]]
      data_object: NotRequired[Optional[Union[OtelTraceSpan, Example]]]
      trace_id: NotRequired[Optional[str]]
judgeval/data/evaluation_run.py
CHANGED
@@ -1,11 +1,11 @@
- from typing import List, Optional, Union, Tuple
- from …
- from pydantic import field_validator, model_validator, Field
+ from typing import List, Optional, Union, Tuple, Sequence
+ from pydantic import field_validator, model_validator, Field, BaseModel
  from datetime import datetime, timezone
  import uuid

  from judgeval.data import Example
- from judgeval.scorers import …
+ from judgeval.scorers import APIScorerConfig
+ from judgeval.scorers.example_scorer import ExampleScorer
  from judgeval.constants import ACCEPTABLE_MODELS
  from judgeval.data.judgment_types import (
      ExampleEvaluationRun as ExampleEvaluationRunJudgmentType,
@@ -14,19 +14,20 @@ from judgeval.data.judgment_types import (


  class EvaluationRun(BaseModel):
-     id: …
-     created_at: …
+     id: str = Field(default_factory=lambda: str(uuid.uuid4()))
+     created_at: str = Field(
          default_factory=lambda: datetime.now(timezone.utc).isoformat()
      )
-     …
-     …
-     …
-     …
+     custom_scorers: List[ExampleScorer] = Field(default_factory=list)
+     judgment_scorers: Sequence[APIScorerConfig] = Field(default_factory=list)
+     scorers: Sequence[Union[ExampleScorer, APIScorerConfig]] = Field(
+         default_factory=list
+     )
      model: str

      def __init__(
          self,
-         scorers: Optional[List[Union[…
+         scorers: Optional[List[Union[ExampleScorer, APIScorerConfig]]] = None,
          **kwargs,
      ):
          """
@@ -38,7 +39,7 @@ class EvaluationRun(BaseModel):
          """
          if scorers is not None:
              # Automatically sort scorers into appropriate fields
-             custom_scorers = [s for s in scorers if isinstance(s, …
+             custom_scorers = [s for s in scorers if isinstance(s, ExampleScorer)]
              judgment_scorers = [s for s in scorers if isinstance(s, APIScorerConfig)]

              # Always set both fields as lists (even if empty) to satisfy validation
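The scorer-sorting rule added to `EvaluationRun.__init__` boils down to two isinstance filters; a minimal standalone sketch of that rule, using only the imports shown in this diff:

    from judgeval.scorers import APIScorerConfig
    from judgeval.scorers.example_scorer import ExampleScorer

    def split_scorers(scorers):
        # Mirrors the sorting in EvaluationRun.__init__: local scorers vs. hosted configs.
        custom_scorers = [s for s in scorers if isinstance(s, ExampleScorer)]
        judgment_scorers = [s for s in scorers if isinstance(s, APIScorerConfig)]
        return custom_scorers, judgment_scorers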
judgeval/data/judgment_types.py
CHANGED
@@ -1,6 +1,6 @@
  # generated by datamodel-codegen:
  # filename: .openapi.json
- # timestamp: 2025-09-…
+ # timestamp: 2025-09-12T16:54:34+00:00

  from __future__ import annotations
  from typing import Annotated, Any, Dict, List, Optional, Union
@@ -22,6 +22,10 @@ class DatasetFetch(BaseModel):
      project_name: Annotated[str, Field(title="Project Name")]


+ class DatasetsFetch(BaseModel):
+     project_name: Annotated[str, Field(title="Project Name")]
+
+
  class ProjectAdd(BaseModel):
      project_name: Annotated[str, Field(title="Project Name")]

@@ -59,8 +63,8 @@ class SavePromptScorerResponse(BaseModel):
      name: Annotated[str, Field(title="Name")]


- class …
- …
+ class FetchPromptScorersRequest(BaseModel):
+     names: Annotated[Optional[List[str]], Field(title="Names")] = None


  class CustomScorerUploadPayload(BaseModel):
@@ -210,8 +214,8 @@ class OtelTraceSpan(BaseModel):

  class ExampleEvaluationRun(BaseModel):
      id: Annotated[Optional[str], Field(title="Id")] = None
-     project_name: Annotated[…
-     eval_name: Annotated[…
+     project_name: Annotated[str, Field(title="Project Name")]
+     eval_name: Annotated[str, Field(title="Eval Name")]
      custom_scorers: Annotated[
          Optional[List[BaseScorer]], Field(title="Custom Scorers")
      ] = []
@@ -231,8 +235,8 @@ class HTTPValidationError(BaseModel):

  class TraceEvaluationRun(BaseModel):
      id: Annotated[Optional[str], Field(title="Id")] = None
-     project_name: Annotated[…
-     eval_name: Annotated[…
+     project_name: Annotated[str, Field(title="Project Name")]
+     eval_name: Annotated[str, Field(title="Eval Name")]
      custom_scorers: Annotated[
          Optional[List[BaseScorer]], Field(title="Custom Scorers")
      ] = []
@@ -259,23 +263,30 @@ class DatasetReturn(BaseModel):
      examples: Annotated[Optional[List[Example]], Field(title="Examples")] = None


+ class DatasetInfo(BaseModel):
+     dataset_id: Annotated[str, Field(title="Dataset Id")]
+     name: Annotated[str, Field(title="Name")]
+     created_at: Annotated[str, Field(title="Created At")]
+     dataset_kind: DatasetKind
+     entries: Annotated[int, Field(title="Entries")]
+     creator: Annotated[str, Field(title="Creator")]
+
+
  class DatasetCreate(BaseModel):
      name: Annotated[str, Field(title="Name")]
      dataset_kind: DatasetKind
      project_name: Annotated[str, Field(title="Project Name")]
-     examples: Annotated[…
-     overwrite: Annotated[…
+     examples: Annotated[List[Example], Field(title="Examples")]
+     overwrite: Annotated[bool, Field(title="Overwrite")]


- class …
- …
+ class FetchPromptScorersResponse(BaseModel):
+     scorers: Annotated[List[PromptScorer], Field(title="Scorers")]


  class ScoringResult(BaseModel):
      success: Annotated[bool, Field(title="Success")]
-     scorers_data: Annotated[…
-         None…
-     )
+     scorers_data: Annotated[List[ScorerData], Field(title="Scorers Data")]
      name: Annotated[Optional[str], Field(title="Name")] = None
      data_object: Annotated[
          Optional[Union[OtelTraceSpan, Example]], Field(title="Data Object")
judgeval/data/result.py
CHANGED
judgeval/data/scorer_data.py
CHANGED
@@ -6,36 +6,11 @@ ScorerData holds the information related to a single, completed Scorer evaluation

  from __future__ import annotations

- from judgeval.data.judgment_types import ScorerData
+ from judgeval.data.judgment_types import ScorerData
  from judgeval.scorers import BaseScorer
  from typing import List


- class ScorerData(JudgmentScorerData):
-     """
-     ScorerData holds the information related to a single, completed Scorer evaluation run.
-
-     For example, if running the Judgment Faithfulness scorer on an example, the ScorerData
-     object will contain whether the example passed its threshold expectation, as well as more detailed
-     information surrounding the evaluation run such as the claims and verdicts generated by the
-     judge model(s).
-     """
-
-     def to_dict(self) -> dict:
-         """Convert the ScorerData instance to a JSON-serializable dictionary."""
-         return {
-             "name": self.name,
-             "threshold": self.threshold,
-             "success": self.success,
-             "score": self.score,
-             "reason": self.reason,
-             "strict_mode": self.strict_mode,
-             "evaluation_model": self.evaluation_model,
-             "error": self.error,
-             "additional_metadata": self.additional_metadata,
-         }
-
-
  def create_scorer_data(scorer: BaseScorer) -> List[ScorerData]:
      """
      After a `scorer` is run, it contains information about the example that was evaluated
judgeval/dataset/__init__.py
CHANGED
@@ -3,7 +3,7 @@ import orjson
  import os
  import yaml
  from dataclasses import dataclass
- from typing import List, Literal
+ from typing import List, Literal

  from judgeval.data import Example
  from judgeval.utils.file_utils import get_examples_from_yaml, get_examples_from_json
@@ -13,15 +13,17 @@ from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID

  from judgeval.api.api_types import DatasetKind

+
  @dataclass
  class DatasetInfo:
      dataset_id: str
-     name: str
+     name: str
      created_at: str
      dataset_kind: DatasetKind
      entries: int
      creator: str

+
  @dataclass
  class Dataset:
      examples: List[Example]
@@ -46,9 +48,12 @@ class Dataset:
          if not dataset:
              raise ValueError(f"Dataset {name} not found in project {project_name}")
          examples = dataset.get("examples", [])
+         if examples is None:
+             examples = []
+
          for e in examples:
-             if isinstance(e, dict) and isinstance(e.get("data"), dict):
-                 e.update(e.pop("data"))
+             if isinstance(e, dict) and isinstance(e.get("data", {}), dict):
+                 e.update(e.pop("data"))  # type: ignore
              e.pop(
                  "example_id"
              )  # TODO: remove once scorer data migraiton is complete
@@ -64,7 +69,7 @@ class Dataset:
          cls,
          name: str,
          project_name: str,
-         examples: …
+         examples: List[Example] = [],
          overwrite: bool = False,
      ):
          if not examples:
@@ -75,7 +80,7 @@ class Dataset:
              {
                  "name": name,
                  "project_name": project_name,
-                 "examples": …
+                 "examples": examples,  # type: ignore
                  "dataset_kind": "example",
                  "overwrite": overwrite,
              }
@@ -87,18 +92,14 @@ class Dataset:
              project_name=project_name,
              examples=examples,
          )
+
      @classmethod
-     def list(
-         cls,
-         project_name: str
-     ):
+     def list(cls, project_name: str):
          client = JudgmentSyncClient(cls.judgment_api_key, cls.organization_id)
-         datasets = client.datasets_pull_all_for_judgeval(
-             …
-         )
-
+         datasets = client.datasets_pull_all_for_judgeval({"project_name": project_name})
+
          judgeval_logger.info(f"Fetched all datasets for project {project_name}!")
-
+
          return [DatasetInfo(**dataset_info) for dataset_info in datasets]

      def add_from_json(self, file_path: str) -> None:
@@ -147,7 +148,7 @@ class Dataset:
              {
                  "dataset_name": self.name,
                  "project_name": self.project_name,
-                 "examples": …
+                 "examples": examples,  # type: ignore
              }
          )
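`Dataset.list` now takes a bare project name and wraps each returned record in the `DatasetInfo` dataclass; a brief sketch (the project name is a placeholder):

    from judgeval.dataset import Dataset

    for info in Dataset.list(project_name="default_project"):
        print(info.name, info.entries, info.created_at)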
judgeval/env.py
CHANGED
@@ -19,8 +19,17 @@ def optional_env_var(var_name: str, default: str | None = None) -> str | None:
      return os.getenv(var_name, default)


- …
- …
+ def required_env_var(var_name: str) -> str:
+     value = os.getenv(var_name)
+     if value is None:
+         raise EnvironmentError(
+             f"Environment variable '{var_name}' is required but not set."
+         )
+     return value
+
+
+ JUDGMENT_API_KEY = required_env_var("JUDGMENT_API_KEY")
+ JUDGMENT_ORG_ID = required_env_var("JUDGMENT_ORG_ID")
  JUDGMENT_API_URL = optional_env_var("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")

  JUDGMENT_DEFAULT_GPT_MODEL = optional_env_var("JUDGMENT_DEFAULT_GPT_MODEL", "gpt-4.1")
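Because `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` are now resolved through `required_env_var` at import time, importing judgeval fails fast when either is missing; a small sketch of setting them up front (the placeholder values are assumptions):

    import os

    # Both must be set before any judgeval import, or judgeval.env raises EnvironmentError.
    os.environ.setdefault("JUDGMENT_API_KEY", "<your-api-key>")  # placeholder
    os.environ.setdefault("JUDGMENT_ORG_ID", "<your-org-id>")    # placeholder

    from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID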
judgeval/evaluation/__init__.py
CHANGED
@@ -3,14 +3,11 @@ from __future__ import annotations
  import asyncio
  import concurrent.futures
  import time
- import orjson
- import sys
  import threading
- from typing import List, …
+ from typing import List, Tuple, TYPE_CHECKING
  from rich import print as rprint

- from judgeval.data import ScorerData, ScoringResult
- from judgeval.scorers import BaseScorer, ExampleAPIScorerConfig
+ from judgeval.data import ScorerData, ScoringResult
  from judgeval.scorers.score import a_execute_scoring
  from judgeval.api import JudgmentSyncClient
  from judgeval.env import (
@@ -19,9 +16,10 @@ from judgeval.env import (
  from judgeval.exceptions import JudgmentAPIError, JudgmentRuntimeError
  from judgeval.logger import judgeval_logger

+ from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID

  if TYPE_CHECKING:
-     from judgeval.data.evaluation_run import …
+     from judgeval.data.evaluation_run import ExampleEvaluationRun


  def safe_run_async(coro):
@@ -49,8 +47,7 @@ def safe_run_async(coro):

  def log_evaluation_results(
      scoring_results: List[ScoringResult],
-     run: …
-     judgment_api_key: str,
+     run: ExampleEvaluationRun,
  ) -> str:
      """
      Logs evaluation results to the Judgment API database.
@@ -65,10 +62,10 @@
      ValueError: If there's a validation error with the results
      """
      try:
-         if not …
+         if not JUDGMENT_API_KEY or not JUDGMENT_ORG_ID:
              raise ValueError("API key and organization ID are required")

-         api_client = JudgmentSyncClient(…
+         api_client = JudgmentSyncClient(JUDGMENT_API_KEY, JUDGMENT_ORG_ID)
          response = api_client.log_eval_results(
              {
                  "results": scoring_results,  # type: ignore
@@ -85,41 +82,8 @@
          )


- def check_examples(
-     examples: List[Example], scorers: List[Union[ExampleAPIScorerConfig, BaseScorer]]
- ) -> None:
-     """
-     Checks if the example contains the necessary parameters for the scorer.
-     """
-     prompt_user = False
-     for scorer in scorers:
-         for example in examples:
-             missing_params = []
-             for param in scorer.required_params:
-                 if getattr(example, param.value) is None:
-                     missing_params.append(f"{param.value}")
-             if missing_params:
-                 rprint(
-                     f"[yellow]⚠️ WARNING:[/yellow] Example is missing required parameters for scorer [bold]{scorer.score_type.value}[/bold]"
-                 )
-                 rprint(f"Missing parameters: {', '.join(missing_params)}")
-                 rprint(
-                     f"Example: {orjson.dumps(example.model_dump(), option=orjson.OPT_INDENT_2).decode('utf-8')}"
-                 )
-                 rprint("-" * 40)
-                 prompt_user = True
-
-     if prompt_user:
-         user_input = input("Do you want to continue? (y/n)")
-         if user_input.lower() != "y":
-             sys.exit(0)
-         else:
-             rprint("[green]Continuing...[/green]")
-
-
  def _poll_evaluation_until_complete(
-     evaluation_run: …
-     judgment_api_key: str,
+     evaluation_run: ExampleEvaluationRun,
      expected_scorer_data_count: int,
      poll_interval_seconds: float = 5,
      max_failures: int = 5,
@@ -140,13 +104,15 @@
      Returns:
          List[ScoringResult]: The evaluation results
      """
-     organization_id = evaluation_run.organization_id
      project_name = evaluation_run.project_name
      experiment_run_id = evaluation_run.id

+     if not project_name or not experiment_run_id:
+         raise ValueError("Project name and experiment run ID are required")
+
      poll_count = 0
      exception_count = 0
-     api_client = JudgmentSyncClient(…
+     api_client = JudgmentSyncClient(JUDGMENT_API_KEY, JUDGMENT_ORG_ID)
      while poll_count < max_poll_count:
          poll_count += 1
          try:
@@ -213,14 +179,13 @@ def progress_logger(stop_event, msg="Working...", interval=5):


  def run_eval(
-     evaluation_run: …
-     judgment_api_key: str,
+     evaluation_run: ExampleEvaluationRun,
  ) -> List[ScoringResult]:
      """
      Executes an evaluation of `Example`s using one or more `Scorer`s

      Args:
-         evaluation_run (…
+         evaluation_run (ExampleEvaluationRun): Stores example and evaluation together for running

      Returns:
          List[ScoringResult]: A list of ScoringResult objects
@@ -258,16 +223,13 @@
          judgeval_logger.error(error_msg)
          raise ValueError(error_msg)

-     check_examples(evaluation_run.examples, evaluation_run.judgment_scorers)
      stop_event = threading.Event()
      t = threading.Thread(
          target=progress_logger, args=(stop_event, "Running evaluation...")
      )
      t.start()
      try:
-         api_client = JudgmentSyncClient(
-             judgment_api_key, evaluation_run.organization_id
-         )
+         api_client = JudgmentSyncClient(JUDGMENT_API_KEY, JUDGMENT_ORG_ID)
          response = api_client.add_to_run_eval_queue_examples(
              evaluation_run.model_dump(warnings=False)  # type: ignore
          )
@@ -286,7 +248,6 @@
          )
          results, url = _poll_evaluation_until_complete(
              evaluation_run=evaluation_run,
-             judgment_api_key=judgment_api_key,
              expected_scorer_data_count=(num_scorers * len(evaluation_run.examples)),
          )
      finally:
@@ -306,7 +267,7 @@
      send_results = [
          scoring_result.model_dump(warnings=False) for scoring_result in results
      ]
-     url = log_evaluation_results(send_results, evaluation_run…
+     url = log_evaluation_results(send_results, evaluation_run)
      rprint(
          f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
      )
@@ -323,27 +284,23 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
      Returns:
          None. Raises exceptions for any failed test cases.
      """
-     failed_cases: List[ScorerData] = []
+     failed_cases: List[List[ScorerData]] = []

      for result in scoring_results:
          if not result.success:
              # Create a test case context with all relevant fields
-             test_case: …
+             test_case: List[ScorerData] = []
              if result.scorers_data:
                  # If the result was not successful, check each scorer_data
                  for scorer_data in result.scorers_data:
                      if not scorer_data.success:
-
-                         # Remove threshold, evaluation model for Tool Order scorer
-                         scorer_data.threshold = None
-                         scorer_data.evaluation_model = None
-                         test_case["failed_scorers"].append(scorer_data)
+                         test_case.append(scorer_data)
              failed_cases.append(test_case)

      if failed_cases:
          error_msg = "The following test cases failed: \n"
          for fail_case in failed_cases:
-             for fail_scorer in fail_case…
+             for fail_scorer in fail_case:
                  error_msg += (
                      f"\nScorer Name: {fail_scorer.name}\n"
                      f"Threshold: {fail_scorer.threshold}\n"
judgeval/integrations/langgraph/__init__.py
CHANGED
@@ -507,6 +507,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
          )

          # Extract response content
+         output: Any
          if response.generations:
              last_generation = response.generations[-1][-1]
              if (
@@ -547,7 +548,7 @@
              for key, value in usage_attrs.items():
                  span.set_attribute(key, value)

-             self._end_span(run_id=run_id, outputs=output, **usage_attrs)
+             self._end_span(run_id=run_id, outputs=output, **usage_attrs)  # type: ignore

          except Exception as e:
              judgeval_logger.exception(f"Error in on_llm_end: {e}")
judgeval/scorers/__init__.py
CHANGED
@@ -4,6 +4,7 @@ from judgeval.scorers.api_scorer import (
      TraceAPIScorerConfig,
  )
  from judgeval.scorers.base_scorer import BaseScorer
+ from judgeval.scorers.example_scorer import ExampleScorer
  from judgeval.scorers.judgeval_scorers.api_scorers import (
      FaithfulnessScorer,
      AnswerRelevancyScorer,
@@ -18,6 +19,7 @@ __all__ = [
      "ExampleAPIScorerConfig",
      "TraceAPIScorerConfig",
      "BaseScorer",
+     "ExampleScorer",
      "TracePromptScorer",
      "PromptScorer",
      "FaithfulnessScorer",
judgeval/scorers/agent_scorer.py
CHANGED
@@ -1,17 +1,17 @@
- from judgeval.scorers.base_scorer import BaseScorer
- from judgeval.data.judgment_types import Trace as JudgmentTrace
- from typing import List, Optional
- from abc import abstractmethod
+ # from judgeval.scorers.base_scorer import BaseScorer
+ # from judgeval.data.judgment_types import Trace as JudgmentTrace
+ # from typing import List, Optional
+ # from abc import abstractmethod


- class TraceScorer(BaseScorer):
-     @abstractmethod
-     async def a_score_trace(
-         self, trace: JudgmentTrace, tools: Optional[List] = None, *args, **kwargs
-     ) -> float:
-         """
-         Asynchronously measures the score on a trace
-         """
-         raise NotImplementedError(
-             "You must implement the `a_score_trace` method in your custom scorer"
-         )
+ # class TraceScorer(BaseScorer):
+ #     @abstractmethod
+ #     async def a_score_trace(
+ #         self, trace: JudgmentTrace, tools: Optional[List] = None, *args, **kwargs
+ #     ) -> float:
+ #         """
+ #         Asynchronously measures the score on a trace
+ #         """
+ #         raise NotImplementedError(
+ #             "You must implement the `a_score_trace` method in your custom scorer"
+ #         )
judgeval/scorers/base_scorer.py
CHANGED
@@ -27,7 +27,7 @@ class BaseScorer(BaseModel):
      threshold: float = 0.5

      # name of your scorer (Faithfulness, PromptScorer-randomslug)
-     name: …
+     name: str = ""

      # The name of the class of the scorer
      class_name: Optional[str] = None
@@ -42,7 +42,7 @@
      using_native_model: Optional[bool] = None

      # Whether the test case passed or failed
-     success: …
+     success: bool = False

      # The name of the model used to evaluate the test case
      model: Optional[str] = None
@@ -55,7 +55,7 @@ def fetch_prompt_scorer(
  ):
      client = JudgmentSyncClient(judgment_api_key, organization_id)
      try:
-         scorer_config = client.…
+         scorer_config = client.fetch_scorers({"names": [name]})["scorers"][0]
          scorer_config.pop("created_at")
          scorer_config.pop("updated_at")
          return scorer_config
judgeval/scorers/score.py
CHANGED
judgeval/tracer/__init__.py
CHANGED
@@ -43,8 +43,8 @@ from judgeval.env (
      JUDGMENT_ORG_ID,
  )
  from judgeval.logger import judgeval_logger
- from judgeval.scorers.api_scorer import …
- from judgeval.scorers.…
+ from judgeval.scorers.api_scorer import TraceAPIScorerConfig, ExampleAPIScorerConfig
+ from judgeval.scorers.example_scorer import ExampleScorer
  from judgeval.tracer.constants import JUDGEVAL_TRACER_INSTRUMENTING_MODULE_NAME
  from judgeval.tracer.managers import (
      sync_span_context,
@@ -358,7 +358,6 @@ class Tracer:
          eval_run_name = f"async_trace_evaluate_{span_id}"

          eval_run = TraceEvaluationRun(
-             organization_id=self.organization_id,
              project_name=self.project_name,
              eval_name=eval_run_name,
              scorers=[scorer],
@@ -862,7 +861,7 @@
          self,
          /,
          *,
-         scorer: Union[ExampleAPIScorerConfig, …
+         scorer: Union[ExampleAPIScorerConfig, ExampleScorer],
          example: Example,
          model: str = JUDGMENT_DEFAULT_GPT_MODEL,
          sampling_rate: float = 1.0,
@@ -871,9 +870,9 @@
              judgeval_logger.info("Evaluation is not enabled, skipping evaluation")
              return

-         if not isinstance(scorer, (ExampleAPIScorerConfig, …
+         if not isinstance(scorer, (ExampleAPIScorerConfig, ExampleScorer)):
              judgeval_logger.error(
-                 "Scorer must be an instance of ExampleAPIScorerConfig or …
+                 "Scorer must be an instance of ExampleAPIScorerConfig or ExampleScorer, got %s, skipping evaluation."
                  % type(scorer)
              )
              return
@@ -903,12 +902,11 @@
          trace_id = format(span_context.trace_id, "032x")
          span_id = format(span_context.span_id, "016x")
          hosted_scoring = isinstance(scorer, ExampleAPIScorerConfig) or (
-             isinstance(scorer, …
+             isinstance(scorer, ExampleScorer) and scorer.server_hosted
          )
          eval_run_name = f"async_evaluate_{span_id}"  # note this name doesnt matter because we don't save the experiment only the example and scorer_data
          if hosted_scoring:
              eval_run = ExampleEvaluationRun(
-                 organization_id=self.organization_id,
                  project_name=self.project_name,
                  eval_name=eval_run_name,
                  examples=[example],
@@ -923,7 +921,6 @@
          else:
              # Handle custom scorers using local evaluation queue
              eval_run = ExampleEvaluationRun(
-                 organization_id=self.organization_id,
                  project_name=self.project_name,
                  eval_name=eval_run_name,
                  examples=[example],
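The keyword-only evaluation helper on `Tracer` now accepts `ExampleScorer` alongside `ExampleAPIScorerConfig` and no longer passes `organization_id` into the run. A sketch of calling it; the method name `async_evaluate` is inferred from the eval-run name in this hunk, and the scorer/example objects are assumed to exist already:

    from judgeval.tracer import Tracer

    def score_current_span(tracer: Tracer, scorer, example) -> None:
        # scorer: ExampleAPIScorerConfig | ExampleScorer; example: judgeval.data.Example
        tracer.async_evaluate(        # assumed method name
            scorer=scorer,
            example=example,
            sampling_rate=1.0,        # keyword-only parameters shown in the diff
        )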
judgeval/tracer/local_eval_queue.py
CHANGED
@@ -13,7 +13,7 @@ import time
  from judgeval.logger import judgeval_logger
  from judgeval.env import JUDGMENT_MAX_CONCURRENT_EVALUATIONS
  from judgeval.data import ScoringResult
- from judgeval.data.evaluation_run import …
+ from judgeval.data.evaluation_run import ExampleEvaluationRun
  from judgeval.utils.async_utils import safe_run_async
  from judgeval.scorers.score import a_execute_scoring
  from judgeval.api import JudgmentSyncClient
@@ -34,7 +34,7 @@ class LocalEvaluationQueue:
      ):
          if num_workers <= 0:
              raise ValueError("num_workers must be a positive integer.")
-         self._queue: queue.Queue[Optional[…
+         self._queue: queue.Queue[Optional[ExampleEvaluationRun]] = queue.Queue()
          self._max_concurrent = max_concurrent
          self._num_workers = num_workers  # Number of worker threads
          self._worker_threads: List[threading.Thread] = []
@@ -44,11 +44,11 @@ class LocalEvaluationQueue:
              organization_id=JUDGMENT_ORG_ID,
          )

-     def enqueue(self, evaluation_run: …
+     def enqueue(self, evaluation_run: ExampleEvaluationRun) -> None:
          """Add evaluation run to the queue."""
          self._queue.put(evaluation_run)

-     def _process_run(self, evaluation_run: …
+     def _process_run(self, evaluation_run: ExampleEvaluationRun) -> List[ScoringResult]:
          """Execute evaluation run locally and return results."""

          if not evaluation_run.custom_scorers:
@@ -70,7 +70,9 @@ class LocalEvaluationQueue:

      def run_all(
          self,
-         callback: Optional[…
+         callback: Optional[
+             Callable[[ExampleEvaluationRun, List[ScoringResult]], None]
+         ] = None,
      ) -> None:
          """Process all queued runs synchronously.

@@ -134,7 +136,9 @@ class LocalEvaluationQueue:

      def start_worker(
          self,
-         callback: Optional[…
+         callback: Optional[
+             Callable[[ExampleEvaluationRun, List[ScoringResult]], None]
+         ] = None,
      ) -> Optional[threading.Thread]:
          """Start a single background thread to process runs (backward compatibility).

@@ -144,7 +148,7 @@ class LocalEvaluationQueue:
          Returns:
              The started thread, or None if no threads were started.
          """
-         threads = self.start_workers(…
+         threads = self.start_workers()
          return threads[0] if threads else None

      def wait_for_completion(self, timeout: Optional[float] = None) -> bool:
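The `callback` parameter on `run_all` and `start_worker` is now fully typed; a sketch of a matching callback, assuming `LocalEvaluationQueue()` can be built with its defaults:

    from typing import List
    from judgeval.data import ScoringResult
    from judgeval.data.evaluation_run import ExampleEvaluationRun
    from judgeval.tracer.local_eval_queue import LocalEvaluationQueue

    def on_run_finished(run: ExampleEvaluationRun, results: List[ScoringResult]) -> None:
        # Matches Callable[[ExampleEvaluationRun, List[ScoringResult]], None]
        print(run.eval_name, len(results))

    queue = LocalEvaluationQueue()               # assumed default constructor
    queue.start_worker(callback=on_run_finished)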
judgeval/trainer/config.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional, Dict, Any, TYPE_CHECKING
  import json

  if TYPE_CHECKING:
-     from fireworks.llm.llm_reinforcement_step import ReinforcementAcceleratorTypeLiteral
+     from fireworks.llm.llm_reinforcement_step import ReinforcementAcceleratorTypeLiteral  # type: ignore[import-not-found]


  @dataclass
judgeval/trainer/trainer.py
CHANGED
@@ -2,7 +2,7 @@ import asyncio
  import json
  import time
  from typing import Optional, Callable, Any, List, Union, Dict
- from fireworks import Dataset
+ from fireworks import Dataset  # type: ignore[import-not-found]
  from .config import TrainerConfig, ModelConfig
  from .trainable_model import TrainableModel
  from judgeval.tracer import Tracer
@@ -10,7 +10,7 @@ from judgeval.tracer.exporters.store import SpanStore
  from judgeval.tracer.exporters import InMemorySpanExporter
  from judgeval.tracer.keys import AttributeKeys
  from judgeval import JudgmentClient
- from judgeval.scorers import …
+ from judgeval.scorers import ExampleScorer, ExampleAPIScorerConfig
  from judgeval.data import Example
  from .console import _spinner_progress, _print_progress, _print_progress_update
  from judgeval.exceptions import JudgmentRuntimeError
@@ -85,7 +85,9 @@ class JudgmentTrainer:
                  if not first_found and span_attributes.get(
                      AttributeKeys.JUDGMENT_INPUT
                  ):
-                     input_data = span_attributes.get(…
+                     input_data: Any = span_attributes.get(
+                         AttributeKeys.JUDGMENT_INPUT, {}
+                     )
                      if isinstance(input_data, dict) and "messages" in input_data:
                          input_messages = input_data["messages"]
                          if input_messages:
@@ -154,7 +156,7 @@
      async def generate_rollouts_and_rewards(
          self,
          agent_function: Callable[[Any], Any],
-         scorers: List[Union[ExampleAPIScorerConfig, …
+         scorers: List[Union[ExampleAPIScorerConfig, ExampleScorer]],
          prompts: List[Any],
          num_prompts_per_step: Optional[int] = None,
          num_generations_per_prompt: Optional[int] = None,
@@ -264,7 +266,7 @@
      async def run_reinforcement_learning(
          self,
          agent_function: Callable[[Any], Any],
-         scorers: List[Union[ExampleAPIScorerConfig, …
+         scorers: List[Union[ExampleAPIScorerConfig, ExampleScorer]],
          prompts: List[Any],
      ) -> ModelConfig:
          """
@@ -370,7 +372,7 @@
      async def train(
          self,
          agent_function: Callable[[Any], Any],
-         scorers: List[Union[ExampleAPIScorerConfig, …
+         scorers: List[Union[ExampleAPIScorerConfig, ExampleScorer]],
          prompts: List[Any],
          rft_provider: Optional[str] = None,
      ) -> ModelConfig:
judgeval/utils/async_utils.py
CHANGED
@@ -2,13 +2,13 @@

  import asyncio
  import concurrent.futures
- from typing import Awaitable, TypeVar
+ from typing import Awaitable, TypeVar, Coroutine


  T = TypeVar("T")


- def safe_run_async(coro: Awaitable[T]) -> T:
+ def safe_run_async(coro: Awaitable[T]) -> T:
      """Safely execute an async *coro* from synchronous code.

      This helper handles two common situations:
@@ -24,6 +24,8 @@ def safe_run_async(coro: Awaitable[T]) -> T:  # type: ignore[type-var]
      Returns:
          The result returned by *coro*.
      """
+     if not isinstance(coro, Coroutine):
+         raise TypeError("The provided awaitable must be a coroutine.")

      try:
          asyncio.get_running_loop()
@@ -31,5 +33,7 @@
          return asyncio.run(coro)

      with concurrent.futures.ThreadPoolExecutor() as executor:
-         future = executor.submit(…
+         future: concurrent.futures.Future[T] = executor.submit(
+             lambda: asyncio.run(coro)
+         )
          return future.result()
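`safe_run_async` now rejects anything that is not a real coroutine object; a quick sketch (note that importing judgeval requires the JUDGMENT_API_KEY / JUDGMENT_ORG_ID environment variables introduced above):

    import asyncio
    from judgeval.utils.async_utils import safe_run_async

    async def add(a: int, b: int) -> int:
        await asyncio.sleep(0)
        return a + b

    print(safe_run_async(add(1, 2)))   # works with or without a running event loop
    # safe_run_async(asyncio.sleep)    # would now raise TypeError: not a coroutine object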
judgeval/utils/testing.py
CHANGED
@@ -26,10 +26,6 @@ def assert_test_results(scoring_results: List[ScoringResult]) -> None:
              # If the result was not successful, check each scorer_data
              for scorer_data in result.scorers_data:
                  if not scorer_data.success:
-                     if scorer_data.name == "Tool Order":
-                         # Remove threshold, evaluation model for Tool Order scorer
-                         scorer_data.threshold = None
-                         scorer_data.evaluation_model = None
                      test_case.append(scorer_data)
              failed_cases.append(test_case)

judgeval/version.py
CHANGED
{judgeval-0.10.1.dist-info → judgeval-0.12.0.dist-info}/RECORD
CHANGED
@@ -1,38 +1,37 @@
- judgeval/__init__.py,sha256=…
+ judgeval/__init__.py,sha256=LDL_vOvI6LmMwbVt6NMPwponDeEOaGHV-nd_0wSCLHM,4957
  judgeval/cli.py,sha256=R5IiIQmSVg21kQHX2kL3sOeXCxvvAMSqyva3Z9AoSXc,1560
  judgeval/constants.py,sha256=h7Cuf_2uvNzHZi8nqRFoMpvsQUZMS3mlNB3s2uduse8,3557
- judgeval/env.py,sha256=…
+ judgeval/env.py,sha256=QO_77E2oX5LLf29XgqLdUoYUIqEaGxd9mcCco6rzS-w,2445
  judgeval/exceptions.py,sha256=tTbfe4yoOtPXmn22UQz9-6a-5PT9uOko85xaRRwr0Sw,621
  judgeval/logger.py,sha256=ZWbp0QfT1CJnQIjV-Zle4n489nFCKEmD2-ukx--iiow,1553
- judgeval/version.py,sha256=…
+ judgeval/version.py,sha256=necdb4jxf2rIhW5LPI_UhDC8zSb9h-dNqtKbwoLv6z8,74
  judgeval/warnings.py,sha256=LbGte14ppiFjrkp-JJYueZ40NWFvMkWRvPXr6r-fUWw,73
- judgeval/api/__init__.py,sha256=…
- judgeval/api/api_types.py,sha256=…
+ judgeval/api/__init__.py,sha256=3Pm0qQ4ZQj76jUsJVrnuazRnYcqF3pzM_Wv_Z6lOv0w,13216
+ judgeval/api/api_types.py,sha256=AEh_9WpL0wTDUKZ0CwphkiGV3IeysBgTE9FzX4VYPic,6528
  judgeval/data/__init__.py,sha256=1tU0EN0ThIfQ1fad5I3dKxAfTcZ5U8cvTLcQ6qLVLU0,407
- judgeval/data/evaluation_run.py,sha256=…
+ judgeval/data/evaluation_run.py,sha256=N47waxScMFKvGBxADX2FrfjW4wT5Zqd8n1PZKWb7JMA,4766
  judgeval/data/example.py,sha256=eGJpF-lyUH734Cg90B7WtU9f8iKoS3VFGeV6R-GVCCc,1039
- judgeval/data/judgment_types.py,sha256=…
- judgeval/data/result.py,sha256=…
- judgeval/data/scorer_data.py,sha256=…
- judgeval/data/tool.py,sha256=bj_WxFg22mypUUVR5KqQRxMDHWvKwiE1MMPjLnTCoDU,99
+ judgeval/data/judgment_types.py,sha256=8cGuj6VAHjYPfmHZL_Bb4D0D2bLP0V9-_Wec2WZhjKA,12130
+ judgeval/data/result.py,sha256=XufFGSAkBDfevPUmzSgsR9HEqytISkM0U5HkhJmsjpY,2102
+ judgeval/data/scorer_data.py,sha256=HeP15ZgftFTJCF8JmDJCLWXRnZJIaGDJCzl7Hg6gWwE,2006
  judgeval/data/trace.py,sha256=R9RF1kv1JHeOpjXLjErJcxV2RrNrJUSqWcWe73l3f9k,503
  judgeval/data/scripts/fix_default_factory.py,sha256=lvp2JwYZqz-XpD9LZNa3mANZVP-jJSZoNzolI6JWERM,591
  judgeval/data/scripts/openapi_transform.py,sha256=Sm04JClzyP1ga8KA3gkIdsae8Hlx-XU7-x0gHCQYOhg,3877
- judgeval/dataset/__init__.py,sha256=…
- judgeval/evaluation/__init__.py,sha256=…
- judgeval/integrations/langgraph/__init__.py,sha256=…
+ judgeval/dataset/__init__.py,sha256=2B3ifWP_gn_4l0GgZaY2tB9UuV8m7dI1BEWwMgckDOc,6348
+ judgeval/evaluation/__init__.py,sha256=6bSC1Sw-fpJN6OkZTv4UtAoYZqkjUy7OG17lxiRX5qE,13321
+ judgeval/integrations/langgraph/__init__.py,sha256=Ow2rl21SmRQNVVR_WfejCsxFPcLvFFlpvKVgG0_igEQ,27580
  judgeval/judges/__init__.py,sha256=e7JnTc1TG_SwqydDHTXHIP0EBazQxt-ydMQG7ghSU5A,228
  judgeval/judges/base_judge.py,sha256=_dz0qWsKRxzXxpRY9l6mrxTRYPSF2FE4ZXkrzhZ4gbY,986
  judgeval/judges/litellm_judge.py,sha256=5vEF0IUo7HVWnOF2ww-DMke8Xkarnz32B_qbgKjc0-I,4182
  judgeval/judges/together_judge.py,sha256=GzwlXZJzle8hT-vWKmq39JyIeanJqJfHDOkrksUbzk0,4398
  judgeval/judges/utils.py,sha256=ITbYwvjU3o9-FIAReFvxh24yJrx9LV3l9BnSBgKUpxg,2068
- judgeval/scorers/__init__.py,sha256=…
- judgeval/scorers/agent_scorer.py,sha256=…
+ judgeval/scorers/__init__.py,sha256=pomKzEy4YNFyygYp8vbS3co8iB5CMstRkQwdUgi1u4g,744
+ judgeval/scorers/agent_scorer.py,sha256=-qcNSkY6i7ur2LXkM7H1jTKuuFbDuXbjTq42o3vjeQ8,595
  judgeval/scorers/api_scorer.py,sha256=8TUJut9r74v-qMACiSKAUbDI1v3ZItPXrTz8s4_Lrgk,2287
- judgeval/scorers/base_scorer.py,sha256=…
+ judgeval/scorers/base_scorer.py,sha256=hsMuqdW8QtW5n9JzruXyaZC7im2K2sSmz1RDkbMisJ4,2702
  judgeval/scorers/example_scorer.py,sha256=o_BGUztJXjnKnuOqIa9T4PXe0wPoWg63FyH518N1LxA,561
  judgeval/scorers/exceptions.py,sha256=ACDHK5-TWiF3NTk-wycaedpbrdobm-CvvC1JA_iP-Mk,179
- judgeval/scorers/score.py,sha256=…
+ judgeval/scorers/score.py,sha256=95tnNRnihrEVvG0yH-RDTQ8KoiBakDijjukclqxH5KE,7183
  judgeval/scorers/utils.py,sha256=iSZONwK0HecxUPz-cMCyra_87DSCag1E8BdpF2a4_44,377
  judgeval/scorers/judgeval_scorers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=wrq7y9I30GZbwDXIrSh81KRO_-j7i-1DjwX5Hc3PScI,728
@@ -40,11 +39,11 @@ judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=_qa1s…
  judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=ciiFBQQC4UDsk9qou9OiKbAR31s82eRUY1ZTt1gdM-0,407
  judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py,sha256=lIJ3GgOI9tfbrC7voZMvlxXdK3X1bhdj2zNxqdaGIkM,545
  judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py,sha256=bSwbpVNhpkpEeX3GtCJuyz5vFyY1gbyqYEfaBF2KTVY,697
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py,sha256=…
- judgeval/tracer/__init__.py,sha256=…
+ judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py,sha256=djPfHC8NP9srwTAgp075kK_zz6Tbn2WFIh6jOZjqppQ,9688
+ judgeval/tracer/__init__.py,sha256=YLJklv1YfNDV61GiJw3PflLp_cajxAnXHojJVKitbz4,35074
  judgeval/tracer/constants.py,sha256=ae8tivAW97awJQxdRB9OMqX50wOLX3zqChT_AGkPBu0,85
  judgeval/tracer/keys.py,sha256=qXPoZSkEhVF-YYfQ9-zeDMVdr4GtpPf2W7MPJaN2AQo,2889
- judgeval/tracer/local_eval_queue.py,sha256=…
+ judgeval/tracer/local_eval_queue.py,sha256=KZKvSSli7B-EVzdHa4-CmXUpv0uOjGLLRa2KTPg8lRc,7320
  judgeval/tracer/managers.py,sha256=h2ZHJ61_vf3cS-HlEUiodFzKDUuQWIhYC6n7pMVyM9c,6113
  judgeval/tracer/utils.py,sha256=3_8ZjjF4XgNyAu9LpThq5dVOcwdwI-E3vb-HRl_Px8c,594
  judgeval/tracer/exporters/__init__.py,sha256=lnZXfPGaQH844HAIuZCQqjqhnmZGA98kHY8Xp-Oi4Ws,1220
@@ -55,21 +54,21 @@ judgeval/tracer/llm/__init__.py,sha256=p9uwWPg9k-NcWjj9TbwQj55sHhBOqRYx2-Ld6YHaF…
  judgeval/tracer/llm/providers.py,sha256=QQLJlSNnDjXRAc2Wqw78o254COJUSXX39D7D_mx3NVA,2651
  judgeval/tracer/processors/__init__.py,sha256=tXbQaXGMQeutgM_7d5Y2EFTeSjbVEBky685Dst_v3rg,8672
  judgeval/trainer/__init__.py,sha256=h_DDVV7HFF7HUPAJFpt2d9wjqgnmEVcHxqZyB1k7pPQ,257
- judgeval/trainer/config.py,sha256=…
+ judgeval/trainer/config.py,sha256=sAAVBgeoFDJWYjGIgOvoQoiO0gtqNAOI6MHncwdN_mk,4292
  judgeval/trainer/console.py,sha256=PJ0rCnDwC7aoW-VsLDS96ZyMyagh-l9EOJKff1ATIpo,4342
- judgeval/trainer/trainable_model.py,sha256=…
- judgeval/trainer/trainer.py,sha256=…
- judgeval/utils/async_utils.py,sha256=…
+ judgeval/trainer/trainable_model.py,sha256=T-Sioi_sXtfYlcu3lE0cd60PHs8DrYaZ-Kxb4h1nU04,8993
+ judgeval/trainer/trainer.py,sha256=FBhHq2YPooKADDCC_IEKex81L6a5quCmAMyl9mn3QLk,16675
+ judgeval/utils/async_utils.py,sha256=AF1xdu8Ao5GyhFvfaLOaKJHn1RISyXZ4U70UZe9zfBA,1083
  judgeval/utils/decorators.py,sha256=rdqY1w0zNL6O6GU6Wdeo0-x5EgpFTEhU2vkgiWsRYdc,525
  judgeval/utils/file_utils.py,sha256=3LI1YCZwO5ogTgJreyOgRgDksey3natO2Td1PQqaPyY,3252
  judgeval/utils/guards.py,sha256=QBb6m6KElxdvt2bskLZCKh_zGHbBcqV-VfGzT63o3hY,807
  judgeval/utils/meta.py,sha256=wQFCLJTNKF9yUdXcw37AT6mC-wqzZpAvjn5gP_6flD8,349
  judgeval/utils/serialize.py,sha256=QXR-8Nj5rqOrI9zLx0oRLdk6DW6Bc7j8eyF4zQ7PLxA,6256
- judgeval/utils/testing.py,sha256=…
+ judgeval/utils/testing.py,sha256=4HO4UCZQgeB7wi-LQoKPjiAYMbj4PpeApAnxZdmI_8w,3392
  judgeval/utils/url.py,sha256=Shf0v3XcbaWpL0m1eGJEEO_z4TsQCnDB2Rl25OTUmiI,195
  judgeval/utils/version_check.py,sha256=kcF6SvB6GbVKI0Gv9QRVm-kvBn9_z-c3jmPORsXO3h0,1015
- judgeval-0.…
- judgeval-0.…
- judgeval-0.…
- judgeval-0.…
- judgeval-0.…
+ judgeval-0.12.0.dist-info/METADATA,sha256=RVS9bm8KrWk-ifawDz1s9oDx_NY3zjGPkbknKKzpjeM,8870
+ judgeval-0.12.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ judgeval-0.12.0.dist-info/entry_points.txt,sha256=-eoeD-oDLn4A7MSgeBS9Akwanf3_0r0cgEleBcIOjg0,46
+ judgeval-0.12.0.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+ judgeval-0.12.0.dist-info/RECORD,,
judgeval/data/tool.py
DELETED
{judgeval-0.10.1.dist-info → judgeval-0.12.0.dist-info}/WHEEL
File without changes
{judgeval-0.10.1.dist-info → judgeval-0.12.0.dist-info}/entry_points.txt
File without changes
{judgeval-0.10.1.dist-info → judgeval-0.12.0.dist-info}/licenses/LICENSE.md
File without changes