judgeval 0.11.0__py3-none-any.whl → 0.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. judgeval/__init__.py +5 -5
  2. judgeval/api/api_types.py +81 -12
  3. judgeval/cli.py +2 -1
  4. judgeval/constants.py +0 -6
  5. judgeval/data/evaluation_run.py +7 -8
  6. judgeval/data/judgment_types.py +97 -12
  7. judgeval/data/trace.py +108 -1
  8. judgeval/dataset/__init__.py +72 -23
  9. judgeval/env.py +5 -20
  10. judgeval/integrations/langgraph/__init__.py +9 -785
  11. judgeval/scorers/__init__.py +6 -0
  12. judgeval/scorers/api_scorer.py +15 -12
  13. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
  14. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
  15. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
  16. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
  17. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +26 -35
  18. judgeval/scorers/score.py +1 -1
  19. judgeval/scorers/utils.py +1 -4
  20. judgeval/tracer/__init__.py +181 -162
  21. judgeval/tracer/exporters/__init__.py +4 -1
  22. judgeval/tracer/keys.py +15 -25
  23. judgeval/tracer/llm/__init__.py +0 -1
  24. judgeval/tracer/llm/anthropic/__init__.py +20 -0
  25. judgeval/tracer/llm/google/__init__.py +21 -0
  26. judgeval/tracer/llm/groq/__init__.py +20 -0
  27. judgeval/tracer/llm/openai/__init__.py +32 -0
  28. judgeval/tracer/llm/providers.py +28 -79
  29. judgeval/tracer/llm/together/__init__.py +20 -0
  30. judgeval/tracer/managers.py +23 -48
  31. judgeval/tracer/processors/__init__.py +36 -75
  32. judgeval/tracer/utils.py +3 -4
  33. judgeval/trainer/trainer.py +4 -4
  34. judgeval/utils/file_utils.py +0 -2
  35. judgeval/utils/meta.py +18 -5
  36. judgeval/utils/testing.py +0 -14
  37. judgeval/utils/version_check.py +2 -0
  38. judgeval/version.py +1 -1
  39. {judgeval-0.11.0.dist-info → judgeval-0.13.0.dist-info}/METADATA +1 -7
  40. {judgeval-0.11.0.dist-info → judgeval-0.13.0.dist-info}/RECORD +43 -38
  41. {judgeval-0.11.0.dist-info → judgeval-0.13.0.dist-info}/WHEEL +0 -0
  42. {judgeval-0.11.0.dist-info → judgeval-0.13.0.dist-info}/entry_points.txt +0 -0
  43. {judgeval-0.11.0.dist-info → judgeval-0.13.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/scorers/__init__.py CHANGED
@@ -1,7 +1,10 @@
 from judgeval.scorers.api_scorer import (
     APIScorerConfig,
+    ExampleAPIScorerConfig,
+    TraceAPIScorerConfig,
 )
 from judgeval.scorers.base_scorer import BaseScorer
+from judgeval.scorers.example_scorer import ExampleScorer
 from judgeval.scorers.judgeval_scorers.api_scorers import (
     FaithfulnessScorer,
     AnswerRelevancyScorer,
@@ -13,7 +16,10 @@ from judgeval.scorers.judgeval_scorers.api_scorers import (
 
 __all__ = [
     "APIScorerConfig",
+    "ExampleAPIScorerConfig",
+    "TraceAPIScorerConfig",
     "BaseScorer",
+    "ExampleScorer",
     "TracePromptScorer",
     "PromptScorer",
     "FaithfulnessScorer",
judgeval/scorers/api_scorer.py CHANGED
@@ -8,8 +8,9 @@ from __future__ import annotations
 
 from pydantic import BaseModel, field_validator
 from typing import List
-from judgeval.constants import UNBOUNDED_SCORERS, APIScorerType
+from judgeval.constants import APIScorerType
 from judgeval.data.example import ExampleParams
+from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
 
 
 class APIScorerConfig(BaseModel):
@@ -29,8 +30,8 @@ class APIScorerConfig(BaseModel):
     name: str = ""
     threshold: float = 0.5
     strict_mode: bool = False
+    model: str = JUDGMENT_DEFAULT_GPT_MODEL
 
-    # This is used to check if the example has the required parameters before running the scorer
     required_params: List[ExampleParams] = []
 
     kwargs: dict = {}
@@ -42,16 +43,10 @@ class APIScorerConfig(BaseModel):
         Validates that the threshold is between 0 and 1 inclusive.
         """
         score_type = info.data.get("score_type")
-        if score_type in UNBOUNDED_SCORERS:
-            if v < 0:
-                raise ValueError(
-                    f"Threshold for {score_type} must be greater than 0, got: {v}"
-                )
-        else:
-            if not 0 <= v <= 1:
-                raise ValueError(
-                    f"Threshold for {score_type} must be between 0 and 1, got: {v}"
-                )
+        if not 0 <= v <= 1:
+            raise ValueError(
+                f"Threshold for {score_type} must be between 0 and 1, got: {v}"
+            )
         return v
 
     @field_validator("name", mode="after")
@@ -63,3 +58,11 @@ class APIScorerConfig(BaseModel):
 
     def __str__(self):
         return f"JudgmentScorer(score_type={self.score_type.value}, threshold={self.threshold})"
+
+
+class ExampleAPIScorerConfig(APIScorerConfig):
+    pass
+
+
+class TraceAPIScorerConfig(APIScorerConfig):
+    pass
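
Two behavioral changes fall out of this file: every scorer config now carries a `model` field defaulting to JUDGMENT_DEFAULT_GPT_MODEL, and the UNBOUNDED_SCORERS escape hatch is gone, so every threshold must land in [0, 1]. A quick sketch of the tightened validation (assuming AnswerRelevancyScorer as the concrete class, and relying on pydantic's ValidationError being a ValueError subclass):

    from judgeval.scorers import AnswerRelevancyScorer

    scorer = AnswerRelevancyScorer(threshold=0.7)  # fine: 0 <= 0.7 <= 1
    print(scorer.model)  # new field, defaults to JUDGMENT_DEFAULT_GPT_MODEL

    try:
        AnswerRelevancyScorer(threshold=1.5)  # was legal for unbounded scorers in 0.11.0
    except ValueError as err:
        print(err)  # "Threshold for ... must be between 0 and 1, got: 1.5"
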
judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py CHANGED
@@ -1,18 +1,10 @@
-"""
-`judgeval` answer relevancy scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# Internal imports
-from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
 from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
 from typing import List
 
 
-class AnswerCorrectnessScorer(APIScorerConfig):
+class AnswerCorrectnessScorer(ExampleAPIScorerConfig):
     score_type: APIScorerType = APIScorerType.ANSWER_CORRECTNESS
     required_params: List[ExampleParams] = [
         ExampleParams.INPUT,
judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py CHANGED
@@ -1,10 +1,10 @@
-from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
 from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
 from typing import List
 
 
-class AnswerRelevancyScorer(APIScorerConfig):
+class AnswerRelevancyScorer(ExampleAPIScorerConfig):
     score_type: APIScorerType = APIScorerType.ANSWER_RELEVANCY
     required_params: List[ExampleParams] = [
         ExampleParams.INPUT,
judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py CHANGED
@@ -1,18 +1,10 @@
-"""
-`judgeval` faithfulness scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# Internal imports
-from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
 from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
 from typing import List
 
 
-class FaithfulnessScorer(APIScorerConfig):
+class FaithfulnessScorer(ExampleAPIScorerConfig):
     score_type: APIScorerType = APIScorerType.FAITHFULNESS
     required_params: List[ExampleParams] = [
         ExampleParams.INPUT,
judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py CHANGED
@@ -1,17 +1,9 @@
-"""
-`judgeval` instruction adherence scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# Internal imports
-from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
 from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
 
 
-class InstructionAdherenceScorer(APIScorerConfig):
+class InstructionAdherenceScorer(ExampleAPIScorerConfig):
     def __init__(self, threshold: float):
         super().__init__(
             threshold=threshold,
@@ -21,7 +13,3 @@ class InstructionAdherenceScorer(APIScorerConfig):
                 ExampleParams.ACTUAL_OUTPUT,
             ],
         )
-
-    @property
-    def __name__(self):
-        return "Instruction Adherence"
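
All four concrete scorers above were rebased from APIScorerConfig onto the new ExampleAPIScorerConfig, so example-scoped and trace-scoped scorers are now distinguishable by type rather than by convention. A hedged sketch of how a caller might branch on this (the dispatch helper is illustrative, not part of the package):

    from judgeval.scorers import (
        APIScorerConfig,
        ExampleAPIScorerConfig,
        TraceAPIScorerConfig,
        FaithfulnessScorer,
    )

    def describe(scorer: APIScorerConfig) -> str:
        # Both subclasses are empty marker classes per this diff,
        # so isinstance checks are the natural discriminator.
        if isinstance(scorer, ExampleAPIScorerConfig):
            return "runs against individual examples"
        if isinstance(scorer, TraceAPIScorerConfig):
            return "runs against whole traces"
        return "unscoped scorer config"

    print(describe(FaithfulnessScorer(threshold=0.8)))  # example-scoped
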
judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py CHANGED
@@ -1,24 +1,26 @@
 from judgeval.scorers.api_scorer import (
     APIScorerConfig,
+    ExampleAPIScorerConfig,
+    TraceAPIScorerConfig,
 )
 from judgeval.constants import APIScorerType
-from typing import Dict, Any, Optional
+from typing import Dict, Any
 from judgeval.api import JudgmentSyncClient
 from judgeval.exceptions import JudgmentAPIError
 import os
-from copy import copy
 from judgeval.logger import judgeval_logger
 from abc import ABC
+from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
 
 
 def push_prompt_scorer(
     name: str,
     prompt: str,
     threshold: float,
-    options: Optional[Dict[str, float]] = None,
+    model: str = JUDGMENT_DEFAULT_GPT_MODEL,
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
-    is_trace: Optional[bool] = None,
+    is_trace: bool = False,
 ) -> str:
     client = JudgmentSyncClient(judgment_api_key, organization_id)
     try:
@@ -27,7 +29,7 @@ def push_prompt_scorer(
             "name": name,
             "prompt": prompt,
             "threshold": threshold,
-            "options": options,
+            "model": model,
             "is_trace": is_trace,
         }
     )
@@ -94,17 +96,8 @@ def scorer_exists(
 
 
 class BasePromptScorer(ABC, APIScorerConfig):
-    """
-    In the Judgment backend, this scorer is implemented as a PromptScorer that takes
-    1. a system role that may involve the Example object
-    2. options for scores on the example
-
-    and uses a judge to execute the evaluation from the system role and classify into one of the options
-    """
-
     score_type: APIScorerType
     prompt: str
-    options: Optional[Dict[str, float]] = None
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or ""
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or ""
 
@@ -131,7 +124,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
             name=name,
             prompt=scorer_config["prompt"],
             threshold=scorer_config["threshold"],
-            options=scorer_config.get("options"),
+            model=scorer_config.get("model"),
             judgment_api_key=judgment_api_key,
             organization_id=organization_id,
         )
@@ -142,7 +135,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
         name: str,
         prompt: str,
         threshold: float = 0.5,
-        options: Optional[Dict[str, float]] = None,
+        model: str = JUDGMENT_DEFAULT_GPT_MODEL,
         judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
         organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
     ):
@@ -157,7 +150,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
             name,
             prompt,
             threshold,
-            options,
+            model,
             judgment_api_key,
             organization_id,
             is_trace,
@@ -168,7 +161,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
             name=name,
             prompt=prompt,
             threshold=threshold,
-            options=options,
+            model=model,
             judgment_api_key=judgment_api_key,
             organization_id=organization_id,
         )
@@ -198,16 +191,13 @@ class BasePromptScorer(ABC, APIScorerConfig):
         self.push_prompt_scorer()
         judgeval_logger.info(f"Successfully updated prompt for {self.name}")
 
-    def set_options(self, options: Dict[str, float]):
+    def set_model(self, model: str):
         """
-        Updates the options with the new options.
-
-        Sample options:
-        {"yes": 1, "no": 0}
+        Updates the model of the scorer.
         """
-        self.options = options
+        self.model = model
         self.push_prompt_scorer()
-        judgeval_logger.info(f"Successfully updated options for {self.name}")
+        judgeval_logger.info(f"Successfully updated model for {self.name}")
 
     def append_to_prompt(self, prompt_addition: str):
         """
@@ -218,23 +208,23 @@ class BasePromptScorer(ABC, APIScorerConfig):
         judgeval_logger.info(f"Successfully appended to prompt for {self.name}")
 
     # Getters
-    def get_threshold(self) -> float | None:
+    def get_threshold(self) -> float:
         """
         Returns the threshold of the scorer.
         """
         return self.threshold
 
-    def get_prompt(self) -> str | None:
+    def get_prompt(self) -> str:
         """
         Returns the prompt of the scorer.
         """
         return self.prompt
 
-    def get_options(self) -> Dict[str, float] | None:
+    def get_model(self) -> str:
         """
-        Returns the options of the scorer.
+        Returns the model of the scorer.
         """
-        return copy(self.options) if self.options is not None else None
+        return self.model
 
     def get_name(self) -> str | None:
         """
@@ -248,9 +238,9 @@ class BasePromptScorer(ABC, APIScorerConfig):
         """
         return {
             "name": self.name,
+            "model": self.model,
             "prompt": self.prompt,
             "threshold": self.threshold,
-            "options": self.options,
         }
 
     def push_prompt_scorer(self):
@@ -261,13 +251,14 @@ class BasePromptScorer(ABC, APIScorerConfig):
             self.name,
             self.prompt,
             self.threshold,
-            self.options,
+            self.model,
             self.judgment_api_key,
             self.organization_id,
+            isinstance(self, TracePromptScorer),
        )
 
     def __str__(self):
-        return f"PromptScorer(name={self.name}, prompt={self.prompt}, threshold={self.threshold}, options={self.options})"
+        return f"PromptScorer(name={self.name}, model={self.model}, prompt={self.prompt}, threshold={self.threshold})"
 
     def model_dump(self, *args, **kwargs) -> Dict[str, Any]:
         base = super().model_dump(*args, **kwargs)
@@ -282,9 +273,9 @@ class BasePromptScorer(ABC, APIScorerConfig):
         return base
 
 
-class PromptScorer(BasePromptScorer, APIScorerConfig):
+class PromptScorer(BasePromptScorer, ExampleAPIScorerConfig):
     pass
 
 
-class TracePromptScorer(BasePromptScorer, APIScorerConfig):
+class TracePromptScorer(BasePromptScorer, TraceAPIScorerConfig):
     pass
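
The prompt scorer API drops the `options` label-to-score mapping entirely in favor of a single judge `model` string, with `get_model`/`set_model` replacing `get_options`/`set_options` and `is_trace` becoming a plain bool. A migration sketch using the module-level helper whose full signature appears above; the model name is a hypothetical placeholder, and credentials are read from JUDGMENT_API_KEY / JUDGMENT_ORG_ID:

    from judgeval.scorers.judgeval_scorers.api_scorers.prompt_scorer import (
        push_prompt_scorer,
    )

    # 0.11.0 (removed): push_prompt_scorer(name, prompt, threshold, options={"yes": 1, "no": 0})
    # 0.13.0:
    scorer_id = push_prompt_scorer(
        name="helpfulness",
        prompt="Rate whether the answer actually helps the user.",
        threshold=0.5,
        model="gpt-4.1",  # hypothetical name; defaults to JUDGMENT_DEFAULT_GPT_MODEL
        is_trace=False,   # now defaults to False instead of Optional[bool] = None
    )
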
judgeval/scorers/score.py CHANGED
@@ -21,7 +21,7 @@ from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
 
 
 async def safe_a_score_example(
-    scorer: Union[ExampleScorer],
+    scorer: ExampleScorer,
     example: Example,
 ):
     """
judgeval/scorers/utils.py CHANGED
@@ -11,7 +11,4 @@ def clone_scorers(scorers: List[BaseScorer]) -> List[BaseScorer]:
     """
     Creates duplicates of the scorers passed as argument.
     """
-    cloned_scorers = []
-    for s in scorers:
-        cloned_scorers.append(s.model_copy(deep=True))
-    return cloned_scorers
+    return [s.model_copy(deep=True) for s in scorers]
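
clone_scorers keeps its deep-copy semantics through this refactor: pydantic's model_copy(deep=True) ensures mutable fields are not shared between original and clone. A quick check, assuming FaithfulnessScorer as the concrete scorer:

    from judgeval.scorers import FaithfulnessScorer
    from judgeval.scorers.utils import clone_scorers

    original = FaithfulnessScorer(threshold=0.9)
    (clone,) = clone_scorers([original])
    clone.threshold = 0.1
    assert original.threshold == 0.9  # deep copy: mutation does not leak back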