judgeval 0.12.0__py3-none-any.whl → 0.13.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +2 -2
- judgeval/api/api_types.py +81 -12
- judgeval/cli.py +2 -1
- judgeval/constants.py +0 -6
- judgeval/data/evaluation_run.py +2 -5
- judgeval/data/judgment_types.py +97 -12
- judgeval/data/trace.py +108 -1
- judgeval/dataset/__init__.py +72 -23
- judgeval/env.py +5 -20
- judgeval/integrations/langgraph/__init__.py +9 -785
- judgeval/scorers/api_scorer.py +7 -12
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -8
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -8
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +0 -12
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +22 -33
- judgeval/scorers/score.py +1 -1
- judgeval/scorers/utils.py +1 -4
- judgeval/tracer/__init__.py +175 -156
- judgeval/tracer/exporters/__init__.py +4 -1
- judgeval/tracer/keys.py +15 -25
- judgeval/tracer/llm/__init__.py +0 -1
- judgeval/tracer/llm/anthropic/__init__.py +20 -0
- judgeval/tracer/llm/google/__init__.py +21 -0
- judgeval/tracer/llm/groq/__init__.py +20 -0
- judgeval/tracer/llm/openai/__init__.py +32 -0
- judgeval/tracer/llm/providers.py +28 -79
- judgeval/tracer/llm/together/__init__.py +20 -0
- judgeval/tracer/managers.py +23 -48
- judgeval/tracer/processors/__init__.py +36 -75
- judgeval/tracer/utils.py +1 -2
- judgeval/utils/file_utils.py +0 -2
- judgeval/utils/meta.py +18 -5
- judgeval/utils/testing.py +0 -14
- judgeval/utils/version_check.py +2 -0
- judgeval/version.py +1 -1
- {judgeval-0.12.0.dist-info → judgeval-0.13.1.dist-info}/METADATA +1 -7
- {judgeval-0.12.0.dist-info → judgeval-0.13.1.dist-info}/RECORD +40 -35
- {judgeval-0.12.0.dist-info → judgeval-0.13.1.dist-info}/WHEEL +0 -0
- {judgeval-0.12.0.dist-info → judgeval-0.13.1.dist-info}/entry_points.txt +0 -0
- {judgeval-0.12.0.dist-info → judgeval-0.13.1.dist-info}/licenses/LICENSE.md +0 -0
judgeval/scorers/api_scorer.py
CHANGED
@@ -8,8 +8,9 @@ from __future__ import annotations

 from pydantic import BaseModel, field_validator
 from typing import List
-from judgeval.constants import
+from judgeval.constants import APIScorerType
 from judgeval.data.example import ExampleParams
+from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL


 class APIScorerConfig(BaseModel):
@@ -29,8 +30,8 @@ class APIScorerConfig(BaseModel):
     name: str = ""
     threshold: float = 0.5
     strict_mode: bool = False
+    model: str = JUDGMENT_DEFAULT_GPT_MODEL

-    # This is used to check if the example has the required parameters before running the scorer
     required_params: List[ExampleParams] = []

     kwargs: dict = {}
@@ -42,16 +43,10 @@
         Validates that the threshold is between 0 and 1 inclusive.
         """
         score_type = info.data.get("score_type")
-        if
-
-
-
-        )
-        else:
-            if not 0 <= v <= 1:
-                raise ValueError(
-                    f"Threshold for {score_type} must be between 0 and 1, got: {v}"
-                )
+        if not 0 <= v <= 1:
+            raise ValueError(
+                f"Threshold for {score_type} must be between 0 and 1, got: {v}"
+            )
         return v

     @field_validator("name", mode="after")
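Net effect for `APIScorerConfig`: a `model` field (defaulting to `JUDGMENT_DEFAULT_GPT_MODEL`) is added, and the threshold validator collapses to a single range check with no special cases. A minimal sketch of the new validation behavior, assuming pydantic v2; the default model string is a placeholder, since the real value lives in `judgeval/env.py` and is not shown in this diff:

```python
from pydantic import BaseModel, ValidationInfo, field_validator

JUDGMENT_DEFAULT_GPT_MODEL = "gpt-4.1"  # placeholder; the actual default is defined in judgeval/env.py

class APIScorerConfigSketch(BaseModel):
    # Fields mirror the 0.13.1 hunks above; score_type and other fields are omitted.
    name: str = ""
    threshold: float = 0.5
    strict_mode: bool = False
    model: str = JUDGMENT_DEFAULT_GPT_MODEL

    @field_validator("threshold")
    @classmethod
    def validate_threshold(cls, v: float, info: ValidationInfo) -> float:
        # Single code path after the change: every threshold must be in [0, 1].
        # score_type is omitted in this sketch, so the lookup yields None here.
        score_type = info.data.get("score_type")
        if not 0 <= v <= 1:
            raise ValueError(
                f"Threshold for {score_type} must be between 0 and 1, got: {v}"
            )
        return v

APIScorerConfigSketch(threshold=0.7)    # ok
# APIScorerConfigSketch(threshold=1.5)  # now always raises, for every scorer type
```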
judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py
CHANGED
@@ -1,11 +1,3 @@
-"""
-`judgeval` answer relevancy scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# Internal imports
 from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
 from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py
CHANGED
@@ -1,11 +1,3 @@
-"""
-`judgeval` faithfulness scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# Internal imports
 from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
 from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py
CHANGED
@@ -1,11 +1,3 @@
-"""
-`judgeval` instruction adherence scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# Internal imports
 from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
 from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
@@ -21,7 +13,3 @@ class InstructionAdherenceScorer(ExampleAPIScorerConfig):
             ExampleParams.ACTUAL_OUTPUT,
         ],
     )
-
-    @property
-    def __name__(self):
-        return "Instruction Adherence"
judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py
CHANGED
@@ -4,23 +4,23 @@ from judgeval.scorers.api_scorer import (
     TraceAPIScorerConfig,
 )
 from judgeval.constants import APIScorerType
-from typing import Dict, Any
+from typing import Dict, Any
 from judgeval.api import JudgmentSyncClient
 from judgeval.exceptions import JudgmentAPIError
 import os
-from copy import copy
 from judgeval.logger import judgeval_logger
 from abc import ABC
+from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL


 def push_prompt_scorer(
     name: str,
     prompt: str,
     threshold: float,
-
+    model: str = JUDGMENT_DEFAULT_GPT_MODEL,
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
-    is_trace:
+    is_trace: bool = False,
 ) -> str:
     client = JudgmentSyncClient(judgment_api_key, organization_id)
     try:
@@ -29,7 +29,7 @@ def push_prompt_scorer(
             "name": name,
             "prompt": prompt,
             "threshold": threshold,
-            "
+            "model": model,
             "is_trace": is_trace,
         }
     )
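With the old `options` argument gone, `push_prompt_scorer` now forwards a `model` key in the payload instead. A hedged usage sketch; the import path is inferred from this file's location in the wheel, and the name/prompt values are purely illustrative:

```python
# Assumes JUDGMENT_API_KEY and JUDGMENT_ORG_ID are set in the environment,
# matching the signature's defaults shown above.
from judgeval.scorers.judgeval_scorers.api_scorers.prompt_scorer import push_prompt_scorer

scorer_name = push_prompt_scorer(
    name="helpfulness",  # illustrative values
    prompt="Score 1 if the response helpfully answers the question, else 0.",
    threshold=0.5,
    model="gpt-4o",   # new in 0.13.1; defaults to JUDGMENT_DEFAULT_GPT_MODEL
    is_trace=False,   # now has an explicit default of False
)
print(scorer_name)    # the function is annotated to return a str
```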
@@ -96,17 +96,8 @@ scorer_exists(


 class BasePromptScorer(ABC, APIScorerConfig):
-    """
-    In the Judgment backend, this scorer is implemented as a PromptScorer that takes
-    1. a system role that may involve the Example object
-    2. options for scores on the example
-
-    and uses a judge to execute the evaluation from the system role and classify into one of the options
-    """
-
     score_type: APIScorerType
     prompt: str
-    options: Optional[Dict[str, float]] = None
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or ""
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or ""

@@ -133,7 +124,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
            name=name,
            prompt=scorer_config["prompt"],
            threshold=scorer_config["threshold"],
-
+            model=scorer_config.get("model"),
            judgment_api_key=judgment_api_key,
            organization_id=organization_id,
        )
@@ -144,7 +135,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
        name: str,
        prompt: str,
        threshold: float = 0.5,
-
+        model: str = JUDGMENT_DEFAULT_GPT_MODEL,
        judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
        organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
    ):
@@ -159,7 +150,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
            name,
            prompt,
            threshold,
-
+            model,
            judgment_api_key,
            organization_id,
            is_trace,
@@ -170,7 +161,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
            name=name,
            prompt=prompt,
            threshold=threshold,
-
+            model=model,
            judgment_api_key=judgment_api_key,
            organization_id=organization_id,
        )
@@ -200,16 +191,13 @@ class BasePromptScorer(ABC, APIScorerConfig):
         self.push_prompt_scorer()
         judgeval_logger.info(f"Successfully updated prompt for {self.name}")

-    def
+    def set_model(self, model: str):
         """
-        Updates the
-
-        Sample options:
-            {"yes": 1, "no": 0}
+        Updates the model of the scorer.
         """
-        self.
+        self.model = model
         self.push_prompt_scorer()
-        judgeval_logger.info(f"Successfully updated
+        judgeval_logger.info(f"Successfully updated model for {self.name}")

     def append_to_prompt(self, prompt_addition: str):
         """
@@ -220,23 +208,23 @@ class BasePromptScorer(ABC, APIScorerConfig):
         judgeval_logger.info(f"Successfully appended to prompt for {self.name}")

     # Getters
-    def get_threshold(self) -> float
+    def get_threshold(self) -> float:
         """
         Returns the threshold of the scorer.
         """
         return self.threshold

-    def get_prompt(self) -> str
+    def get_prompt(self) -> str:
         """
         Returns the prompt of the scorer.
         """
         return self.prompt

-    def
+    def get_model(self) -> str:
         """
-        Returns the
+        Returns the model of the scorer.
         """
-        return
+        return self.model

     def get_name(self) -> str | None:
         """
@@ -250,9 +238,9 @@ class BasePromptScorer(ABC, APIScorerConfig):
         """
         return {
             "name": self.name,
+            "model": self.model,
             "prompt": self.prompt,
             "threshold": self.threshold,
-            "options": self.options,
         }

     def push_prompt_scorer(self):
@@ -263,13 +251,14 @@ class BasePromptScorer(ABC, APIScorerConfig):
             self.name,
             self.prompt,
             self.threshold,
-            self.
+            self.model,
             self.judgment_api_key,
             self.organization_id,
+            isinstance(self, TracePromptScorer),
         )

     def __str__(self):
-        return f"PromptScorer(name={self.name},
+        return f"PromptScorer(name={self.name}, model={self.model}, prompt={self.prompt}, threshold={self.threshold})"

     def model_dump(self, *args, **kwargs) -> Dict[str, Any]:
         base = super().model_dump(*args, **kwargs)
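Taken together, the `BasePromptScorer` hunks replace every `options` touchpoint (field, setter, getter, `to_dict`, `__str__`, and the backend push) with a `model` one. A self-contained toy stand-in that mirrors the new accessor surface, with the network push stubbed out; this is a sketch, not the real class:

```python
from dataclasses import dataclass

@dataclass
class PromptScorerSketch:
    """Toy stand-in mirroring the 0.13.1 accessor surface shown in the diff."""
    name: str
    prompt: str
    threshold: float = 0.5
    model: str = "gpt-4.1"  # placeholder default; the real code uses JUDGMENT_DEFAULT_GPT_MODEL

    def push_prompt_scorer(self) -> None:
        pass  # the real method syncs the scorer config to the Judgment backend

    def set_model(self, model: str) -> None:
        # Mirrors the new set_model: update the field, then re-push.
        self.model = model
        self.push_prompt_scorer()

    def get_model(self) -> str:
        return self.model

    def to_dict(self) -> dict:
        # "options" is gone; "model" is included, matching the new to_dict().
        return {
            "name": self.name,
            "model": self.model,
            "prompt": self.prompt,
            "threshold": self.threshold,
        }

s = PromptScorerSketch(name="helpfulness", prompt="Rate helpfulness from 0 to 1.")
s.set_model("gpt-4o")
assert s.get_model() == "gpt-4o"
assert "options" not in s.to_dict()
```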
judgeval/scorers/score.py
CHANGED
judgeval/scorers/utils.py
CHANGED
@@ -11,7 +11,4 @@ def clone_scorers(scorers: List[BaseScorer]) -> List[BaseScorer]:
     """
     Creates duplicates of the scorers passed as argument.
     """
-    cloned_scorers = []
-    for s in scorers:
-        cloned_scorers.append(s.model_copy(deep=True))
-    return cloned_scorers
+    return [s.model_copy(deep=True) for s in scorers]