judgeval 0.11.0__py3-none-any.whl → 0.13.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registries.
- judgeval/__init__.py +5 -5
- judgeval/api/api_types.py +81 -12
- judgeval/cli.py +2 -1
- judgeval/constants.py +0 -6
- judgeval/data/evaluation_run.py +7 -8
- judgeval/data/judgment_types.py +97 -12
- judgeval/data/trace.py +108 -1
- judgeval/dataset/__init__.py +72 -23
- judgeval/env.py +5 -20
- judgeval/integrations/langgraph/__init__.py +9 -785
- judgeval/scorers/__init__.py +6 -0
- judgeval/scorers/api_scorer.py +15 -12
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +26 -35
- judgeval/scorers/score.py +1 -1
- judgeval/scorers/utils.py +1 -4
- judgeval/tracer/__init__.py +181 -162
- judgeval/tracer/exporters/__init__.py +4 -1
- judgeval/tracer/keys.py +15 -25
- judgeval/tracer/llm/__init__.py +0 -1
- judgeval/tracer/llm/anthropic/__init__.py +20 -0
- judgeval/tracer/llm/google/__init__.py +21 -0
- judgeval/tracer/llm/groq/__init__.py +20 -0
- judgeval/tracer/llm/openai/__init__.py +32 -0
- judgeval/tracer/llm/providers.py +28 -79
- judgeval/tracer/llm/together/__init__.py +20 -0
- judgeval/tracer/managers.py +23 -48
- judgeval/tracer/processors/__init__.py +36 -75
- judgeval/tracer/utils.py +3 -4
- judgeval/trainer/trainer.py +4 -4
- judgeval/utils/file_utils.py +0 -2
- judgeval/utils/meta.py +18 -5
- judgeval/utils/testing.py +0 -14
- judgeval/utils/version_check.py +2 -0
- judgeval/version.py +1 -1
- {judgeval-0.11.0.dist-info → judgeval-0.13.0.dist-info}/METADATA +1 -7
- {judgeval-0.11.0.dist-info → judgeval-0.13.0.dist-info}/RECORD +43 -38
- {judgeval-0.11.0.dist-info → judgeval-0.13.0.dist-info}/WHEEL +0 -0
- {judgeval-0.11.0.dist-info → judgeval-0.13.0.dist-info}/entry_points.txt +0 -0
- {judgeval-0.11.0.dist-info → judgeval-0.13.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/scorers/__init__.py
CHANGED
```diff
@@ -1,7 +1,10 @@
 from judgeval.scorers.api_scorer import (
     APIScorerConfig,
+    ExampleAPIScorerConfig,
+    TraceAPIScorerConfig,
 )
 from judgeval.scorers.base_scorer import BaseScorer
+from judgeval.scorers.example_scorer import ExampleScorer
 from judgeval.scorers.judgeval_scorers.api_scorers import (
     FaithfulnessScorer,
     AnswerRelevancyScorer,
@@ -13,7 +16,10 @@ from judgeval.scorers.judgeval_scorers.api_scorers import (
 
 __all__ = [
     "APIScorerConfig",
+    "ExampleAPIScorerConfig",
+    "TraceAPIScorerConfig",
     "BaseScorer",
+    "ExampleScorer",
     "TracePromptScorer",
     "PromptScorer",
     "FaithfulnessScorer",
```
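For downstream code, the practical effect is that the two new config base classes and `ExampleScorer` are now importable from `judgeval.scorers`. A minimal sketch of the new surface (the constructor arguments are illustrative; only the exported names are confirmed by this diff):

```python
# New exports in judgeval.scorers as of 0.13.0 (see the diff above).
from judgeval.scorers import (
    ExampleAPIScorerConfig,  # base for example-level API scorers
    TraceAPIScorerConfig,    # base for trace-level API scorers
    ExampleScorer,           # re-exported custom-scorer base
    FaithfulnessScorer,
)

# API scorer configs are pydantic models; threshold defaults to 0.5.
scorer = FaithfulnessScorer(threshold=0.8)
print(scorer)  # JudgmentScorer(score_type=..., threshold=0.8)
```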
judgeval/scorers/api_scorer.py
CHANGED
```diff
@@ -8,8 +8,9 @@ from __future__ import annotations
 
 from pydantic import BaseModel, field_validator
 from typing import List
-from judgeval.constants import
+from judgeval.constants import APIScorerType
 from judgeval.data.example import ExampleParams
+from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
 
 
 class APIScorerConfig(BaseModel):
@@ -29,8 +30,8 @@ class APIScorerConfig(BaseModel):
     name: str = ""
     threshold: float = 0.5
     strict_mode: bool = False
+    model: str = JUDGMENT_DEFAULT_GPT_MODEL
 
-    # This is used to check if the example has the required parameters before running the scorer
     required_params: List[ExampleParams] = []
 
     kwargs: dict = {}
@@ -42,16 +43,10 @@ class APIScorerConfig(BaseModel):
         Validates that the threshold is between 0 and 1 inclusive.
         """
         score_type = info.data.get("score_type")
-        if
-
-
-
-            )
-        else:
-            if not 0 <= v <= 1:
-                raise ValueError(
-                    f"Threshold for {score_type} must be between 0 and 1, got: {v}"
-                )
+        if not 0 <= v <= 1:
+            raise ValueError(
+                f"Threshold for {score_type} must be between 0 and 1, got: {v}"
+            )
         return v
 
     @field_validator("name", mode="after")
@@ -63,3 +58,11 @@ class APIScorerConfig(BaseModel):
 
     def __str__(self):
         return f"JudgmentScorer(score_type={self.score_type.value}, threshold={self.threshold})"
+
+
+class ExampleAPIScorerConfig(APIScorerConfig):
+    pass
+
+
+class TraceAPIScorerConfig(APIScorerConfig):
+    pass
```
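With the removed special-case branch gone, the validator applies the same 0-to-1 bound to every score type, and every config now carries a `model` field defaulting to `JUDGMENT_DEFAULT_GPT_MODEL` from `judgeval.env`. A quick sketch of the resulting behavior, assuming only what the validator body above shows:

```python
from pydantic import ValidationError
from judgeval.scorers import FaithfulnessScorer

FaithfulnessScorer(threshold=0.5)  # accepted: 0 <= 0.5 <= 1

try:
    FaithfulnessScorer(threshold=1.5)  # now out of range for every score type
except ValidationError as err:
    # pydantic surfaces the validator's ValueError:
    # "Threshold for ... must be between 0 and 1, got: 1.5"
    print(err)
```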
judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py
CHANGED
```diff
@@ -1,18 +1,10 @@
-"""
-`judgeval` answer relevancy scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# Internal imports
-from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
 from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
 from typing import List
 
 
-class AnswerCorrectnessScorer(APIScorerConfig):
+class AnswerCorrectnessScorer(ExampleAPIScorerConfig):
     score_type: APIScorerType = APIScorerType.ANSWER_CORRECTNESS
     required_params: List[ExampleParams] = [
         ExampleParams.INPUT,
```
judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py
CHANGED
```diff
@@ -1,10 +1,10 @@
-from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
 from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
 from typing import List
 
 
-class AnswerRelevancyScorer(APIScorerConfig):
+class AnswerRelevancyScorer(ExampleAPIScorerConfig):
     score_type: APIScorerType = APIScorerType.ANSWER_RELEVANCY
     required_params: List[ExampleParams] = [
         ExampleParams.INPUT,
```
judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py
CHANGED
```diff
@@ -1,18 +1,10 @@
-"""
-`judgeval` faithfulness scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# Internal imports
-from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
 from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
 from typing import List
 
 
-class FaithfulnessScorer(APIScorerConfig):
+class FaithfulnessScorer(ExampleAPIScorerConfig):
     score_type: APIScorerType = APIScorerType.FAITHFULNESS
     required_params: List[ExampleParams] = [
         ExampleParams.INPUT,
```
judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py
CHANGED
```diff
@@ -1,17 +1,9 @@
-"""
-`judgeval` instruction adherence scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# Internal imports
-from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
 from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
 
 
-class InstructionAdherenceScorer(APIScorerConfig):
+class InstructionAdherenceScorer(ExampleAPIScorerConfig):
     def __init__(self, threshold: float):
         super().__init__(
             threshold=threshold,
@@ -21,7 +13,3 @@ class InstructionAdherenceScorer(APIScorerConfig):
                 ExampleParams.ACTUAL_OUTPUT,
             ],
         )
-
-    @property
-    def __name__(self):
-        return "Instruction Adherence"
```
judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py
CHANGED
```diff
@@ -1,24 +1,26 @@
 from judgeval.scorers.api_scorer import (
     APIScorerConfig,
+    ExampleAPIScorerConfig,
+    TraceAPIScorerConfig,
 )
 from judgeval.constants import APIScorerType
-from typing import Dict, Any, Optional
+from typing import Dict, Any
 from judgeval.api import JudgmentSyncClient
 from judgeval.exceptions import JudgmentAPIError
 import os
-from copy import copy
 from judgeval.logger import judgeval_logger
 from abc import ABC
+from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
 
 
 def push_prompt_scorer(
     name: str,
     prompt: str,
     threshold: float,
-
+    model: str = JUDGMENT_DEFAULT_GPT_MODEL,
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
-    is_trace:
+    is_trace: bool = False,
 ) -> str:
     client = JudgmentSyncClient(judgment_api_key, organization_id)
     try:
@@ -27,7 +29,7 @@ def push_prompt_scorer(
             "name": name,
             "prompt": prompt,
             "threshold": threshold,
-            "
+            "model": model,
             "is_trace": is_trace,
         }
     )
@@ -94,17 +96,8 @@ def scorer_exists(
 
 
 class BasePromptScorer(ABC, APIScorerConfig):
-    """
-    In the Judgment backend, this scorer is implemented as a PromptScorer that takes
-    1. a system role that may involve the Example object
-    2. options for scores on the example
-
-    and uses a judge to execute the evaluation from the system role and classify into one of the options
-    """
-
     score_type: APIScorerType
     prompt: str
-    options: Optional[Dict[str, float]] = None
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or ""
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or ""
 
@@ -131,7 +124,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
             name=name,
             prompt=scorer_config["prompt"],
             threshold=scorer_config["threshold"],
-
+            model=scorer_config.get("model"),
             judgment_api_key=judgment_api_key,
             organization_id=organization_id,
         )
@@ -142,7 +135,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
         name: str,
         prompt: str,
         threshold: float = 0.5,
-
+        model: str = JUDGMENT_DEFAULT_GPT_MODEL,
         judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
         organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
     ):
@@ -157,7 +150,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
             name,
             prompt,
             threshold,
-
+            model,
             judgment_api_key,
             organization_id,
             is_trace,
@@ -168,7 +161,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
             name=name,
             prompt=prompt,
             threshold=threshold,
-
+            model=model,
             judgment_api_key=judgment_api_key,
             organization_id=organization_id,
         )
@@ -198,16 +191,13 @@ class BasePromptScorer(ABC, APIScorerConfig):
         self.push_prompt_scorer()
         judgeval_logger.info(f"Successfully updated prompt for {self.name}")
 
-    def
+    def set_model(self, model: str):
         """
-        Updates the
-
-        Sample options:
-        {"yes": 1, "no": 0}
+        Updates the model of the scorer.
         """
-        self.
+        self.model = model
         self.push_prompt_scorer()
-        judgeval_logger.info(f"Successfully updated
+        judgeval_logger.info(f"Successfully updated model for {self.name}")
 
     def append_to_prompt(self, prompt_addition: str):
         """
@@ -218,23 +208,23 @@ class BasePromptScorer(ABC, APIScorerConfig):
         judgeval_logger.info(f"Successfully appended to prompt for {self.name}")
 
     # Getters
-    def get_threshold(self) -> float
+    def get_threshold(self) -> float:
         """
        Returns the threshold of the scorer.
         """
         return self.threshold
 
-    def get_prompt(self) -> str
+    def get_prompt(self) -> str:
         """
         Returns the prompt of the scorer.
         """
         return self.prompt
 
-    def
+    def get_model(self) -> str:
         """
-        Returns the
+        Returns the model of the scorer.
         """
-        return
+        return self.model
 
     def get_name(self) -> str | None:
         """
@@ -248,9 +238,9 @@ class BasePromptScorer(ABC, APIScorerConfig):
         """
         return {
             "name": self.name,
+            "model": self.model,
             "prompt": self.prompt,
             "threshold": self.threshold,
-            "options": self.options,
         }
 
     def push_prompt_scorer(self):
@@ -261,13 +251,14 @@ class BasePromptScorer(ABC, APIScorerConfig):
             self.name,
             self.prompt,
             self.threshold,
-            self.
+            self.model,
             self.judgment_api_key,
             self.organization_id,
+            isinstance(self, TracePromptScorer),
         )
 
     def __str__(self):
-        return f"PromptScorer(name={self.name},
+        return f"PromptScorer(name={self.name}, model={self.model}, prompt={self.prompt}, threshold={self.threshold})"
 
     def model_dump(self, *args, **kwargs) -> Dict[str, Any]:
         base = super().model_dump(*args, **kwargs)
@@ -282,9 +273,9 @@ class BasePromptScorer(ABC, APIScorerConfig):
         return base
 
 
-class PromptScorer(BasePromptScorer,
+class PromptScorer(BasePromptScorer, ExampleAPIScorerConfig):
     pass
 
 
-class TracePromptScorer(BasePromptScorer,
+class TracePromptScorer(BasePromptScorer, TraceAPIScorerConfig):
     pass
```
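Net effect of the prompt-scorer changes: the `options` map (e.g. `{"yes": 1, "no": 0}`) is gone from the config, the push payload, and the getters, replaced by a judge `model` string, and the trace flag is now derived from the class rather than stored. A sketch against the module-level helper, whose new signature the diff shows in full (the name and prompt values are hypothetical):

```python
from judgeval.scorers.judgeval_scorers.api_scorers.prompt_scorer import (
    push_prompt_scorer,
)

# Registers/updates the scorer on the Judgment backend and returns a str.
# `model` defaults to JUDGMENT_DEFAULT_GPT_MODEL from judgeval.env;
# credentials default to the JUDGMENT_API_KEY / JUDGMENT_ORG_ID env vars.
result = push_prompt_scorer(
    name="helpfulness-judge",           # hypothetical scorer name
    prompt="Is the response helpful?",  # hypothetical judge prompt
    threshold=0.5,
    is_trace=False,  # True registers it as a trace-level scorer
)
```

On instances, `set_model()` and `get_model()` replace the removed options mutators, and `BasePromptScorer.push_prompt_scorer()` now passes `isinstance(self, TracePromptScorer)` as the trace flag instead of a stored field.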
judgeval/scorers/score.py
CHANGED
judgeval/scorers/utils.py
CHANGED
```diff
@@ -11,7 +11,4 @@ def clone_scorers(scorers: List[BaseScorer]) -> List[BaseScorer]:
     """
     Creates duplicates of the scorers passed as argument.
     """
-    cloned_scorers = []
-    for s in scorers:
-        cloned_scorers.append(s.model_copy(deep=True))
-    return cloned_scorers
+    return [s.model_copy(deep=True) for s in scorers]
```
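The rewrite keeps the deep-copy semantics: each clone is an independent `model_copy(deep=True)` of its source. A small sketch (using an API scorer config purely to illustrate; any pydantic-based scorer behaves the same way):

```python
from judgeval.scorers.utils import clone_scorers
from judgeval.scorers import FaithfulnessScorer

originals = [FaithfulnessScorer(threshold=0.8)]
clones = clone_scorers(originals)

# Deep copies: mutating a clone leaves the original untouched.
clones[0].threshold = 0.3
assert originals[0].threshold == 0.8
```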