judgeval 0.12.0__py3-none-any.whl → 0.13.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. judgeval/__init__.py +2 -2
  2. judgeval/api/api_types.py +81 -12
  3. judgeval/cli.py +2 -1
  4. judgeval/constants.py +0 -6
  5. judgeval/data/evaluation_run.py +2 -5
  6. judgeval/data/judgment_types.py +97 -12
  7. judgeval/data/trace.py +108 -1
  8. judgeval/dataset/__init__.py +72 -23
  9. judgeval/env.py +5 -20
  10. judgeval/integrations/langgraph/__init__.py +9 -785
  11. judgeval/scorers/api_scorer.py +7 -12
  12. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -8
  13. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -8
  14. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +0 -12
  15. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +22 -33
  16. judgeval/scorers/score.py +1 -1
  17. judgeval/scorers/utils.py +1 -4
  18. judgeval/tracer/__init__.py +175 -156
  19. judgeval/tracer/exporters/__init__.py +4 -1
  20. judgeval/tracer/keys.py +15 -25
  21. judgeval/tracer/llm/__init__.py +0 -1
  22. judgeval/tracer/llm/anthropic/__init__.py +20 -0
  23. judgeval/tracer/llm/google/__init__.py +21 -0
  24. judgeval/tracer/llm/groq/__init__.py +20 -0
  25. judgeval/tracer/llm/openai/__init__.py +32 -0
  26. judgeval/tracer/llm/providers.py +28 -79
  27. judgeval/tracer/llm/together/__init__.py +20 -0
  28. judgeval/tracer/managers.py +23 -48
  29. judgeval/tracer/processors/__init__.py +36 -75
  30. judgeval/tracer/utils.py +1 -2
  31. judgeval/utils/file_utils.py +0 -2
  32. judgeval/utils/meta.py +18 -5
  33. judgeval/utils/testing.py +0 -14
  34. judgeval/utils/version_check.py +2 -0
  35. judgeval/version.py +1 -1
  36. {judgeval-0.12.0.dist-info → judgeval-0.13.1.dist-info}/METADATA +1 -7
  37. {judgeval-0.12.0.dist-info → judgeval-0.13.1.dist-info}/RECORD +40 -35
  38. {judgeval-0.12.0.dist-info → judgeval-0.13.1.dist-info}/WHEEL +0 -0
  39. {judgeval-0.12.0.dist-info → judgeval-0.13.1.dist-info}/entry_points.txt +0 -0
  40. {judgeval-0.12.0.dist-info → judgeval-0.13.1.dist-info}/licenses/LICENSE.md +0 -0
judgeval/scorers/api_scorer.py CHANGED
@@ -8,8 +8,9 @@ from __future__ import annotations
 
 from pydantic import BaseModel, field_validator
 from typing import List
-from judgeval.constants import UNBOUNDED_SCORERS, APIScorerType
+from judgeval.constants import APIScorerType
 from judgeval.data.example import ExampleParams
+from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
 
 
 class APIScorerConfig(BaseModel):
@@ -29,8 +30,8 @@ class APIScorerConfig(BaseModel):
     name: str = ""
     threshold: float = 0.5
     strict_mode: bool = False
+    model: str = JUDGMENT_DEFAULT_GPT_MODEL
 
-    # This is used to check if the example has the required parameters before running the scorer
     required_params: List[ExampleParams] = []
 
     kwargs: dict = {}
@@ -42,16 +43,10 @@ class APIScorerConfig(BaseModel):
         Validates that the threshold is between 0 and 1 inclusive.
         """
         score_type = info.data.get("score_type")
-        if score_type in UNBOUNDED_SCORERS:
-            if v < 0:
-                raise ValueError(
-                    f"Threshold for {score_type} must be greater than 0, got: {v}"
-                )
-        else:
-            if not 0 <= v <= 1:
-                raise ValueError(
-                    f"Threshold for {score_type} must be between 0 and 1, got: {v}"
-                )
+        if not 0 <= v <= 1:
+            raise ValueError(
+                f"Threshold for {score_type} must be between 0 and 1, got: {v}"
+            )
         return v
 
     @field_validator("name", mode="after")
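With the UNBOUNDED_SCORERS carve-out gone, every hosted scorer config now enforces a threshold in [0, 1] and carries a `model` field defaulting to JUDGMENT_DEFAULT_GPT_MODEL. A minimal sketch of the new behavior, assuming FaithfulnessScorer is still exported from judgeval.scorers as in earlier releases:

# Sketch only: using FaithfulnessScorer as the concrete APIScorerConfig subclass is an
# assumption; the threshold check below is what the 0.13.1 validator enforces.
from judgeval.scorers import FaithfulnessScorer

scorer = FaithfulnessScorer(threshold=0.8)
print(scorer.model)  # defaults to JUDGMENT_DEFAULT_GPT_MODEL from judgeval.env

try:
    FaithfulnessScorer(threshold=1.5)  # outside [0, 1] is now rejected for every scorer type
except ValueError as err:  # pydantic's ValidationError subclasses ValueError
    print(err)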
judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py CHANGED
@@ -1,11 +1,3 @@
-"""
-`judgeval` answer relevancy scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# Internal imports
 from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
 from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py CHANGED
@@ -1,11 +1,3 @@
-"""
-`judgeval` faithfulness scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# Internal imports
 from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
 from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py CHANGED
@@ -1,11 +1,3 @@
-"""
-`judgeval` instruction adherence scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# Internal imports
 from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
 from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
@@ -21,7 +13,3 @@ class InstructionAdherenceScorer(ExampleAPIScorerConfig):
             ExampleParams.ACTUAL_OUTPUT,
         ],
     )
-
-    @property
-    def __name__(self):
-        return "Instruction Adherence"
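With the __name__ property removed, the scorer is identified by its ordinary config fields. A small sketch, assuming default construction still works as it did in 0.12.x (the class name and module path are taken from the hunk header and the file listing):

# Illustrative only: constructing the scorer with defaults is assumed to keep working.
from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import (
    InstructionAdherenceScorer,
)

scorer = InstructionAdherenceScorer(threshold=0.7)
print(scorer.score_type)       # identified by its config fields now, not a custom __name__
print(scorer.required_params)  # includes ExampleParams.ACTUAL_OUTPUT, per the hunk above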
judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py CHANGED
@@ -4,23 +4,23 @@ from judgeval.scorers.api_scorer import (
     TraceAPIScorerConfig,
 )
 from judgeval.constants import APIScorerType
-from typing import Dict, Any, Optional
+from typing import Dict, Any
 from judgeval.api import JudgmentSyncClient
 from judgeval.exceptions import JudgmentAPIError
 import os
-from copy import copy
 from judgeval.logger import judgeval_logger
 from abc import ABC
+from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
 
 
 def push_prompt_scorer(
     name: str,
     prompt: str,
     threshold: float,
-    options: Optional[Dict[str, float]] = None,
+    model: str = JUDGMENT_DEFAULT_GPT_MODEL,
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
-    is_trace: Optional[bool] = None,
+    is_trace: bool = False,
 ) -> str:
     client = JudgmentSyncClient(judgment_api_key, organization_id)
     try:
@@ -29,7 +29,7 @@ def push_prompt_scorer(
                 "name": name,
                 "prompt": prompt,
                 "threshold": threshold,
-                "options": options,
+                "model": model,
                 "is_trace": is_trace,
             }
         )
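Together these two hunks change how a prompt scorer is registered: the options mapping is gone, a single judge model string is sent instead, and is_trace is now a plain bool defaulting to False. A hedged usage sketch of the updated helper (import path taken from the file listing):

# Sketch: the signature matches the hunks above; "gpt-4.1" is a hypothetical model id,
# and credentials fall back to the JUDGMENT_API_KEY / JUDGMENT_ORG_ID env vars.
from judgeval.scorers.judgeval_scorers.api_scorers.prompt_scorer import push_prompt_scorer

scorer_name = push_prompt_scorer(
    name="helpfulness",
    prompt="Score how helpful the answer is to the user's question.",
    threshold=0.5,
    model="gpt-4.1",  # replaces the old options={"yes": 1.0, "no": 0.0} mapping
    is_trace=False,
)
print(scorer_name)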
@@ -96,17 +96,8 @@ def scorer_exists(
 
 
 class BasePromptScorer(ABC, APIScorerConfig):
-    """
-    In the Judgment backend, this scorer is implemented as a PromptScorer that takes
-    1. a system role that may involve the Example object
-    2. options for scores on the example
-
-    and uses a judge to execute the evaluation from the system role and classify into one of the options
-    """
-
     score_type: APIScorerType
     prompt: str
-    options: Optional[Dict[str, float]] = None
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or ""
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or ""
 
@@ -133,7 +124,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
             name=name,
             prompt=scorer_config["prompt"],
             threshold=scorer_config["threshold"],
-            options=scorer_config.get("options"),
+            model=scorer_config.get("model"),
             judgment_api_key=judgment_api_key,
             organization_id=organization_id,
         )
@@ -144,7 +135,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
         name: str,
         prompt: str,
         threshold: float = 0.5,
-        options: Optional[Dict[str, float]] = None,
+        model: str = JUDGMENT_DEFAULT_GPT_MODEL,
         judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
         organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
     ):
@@ -159,7 +150,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
             name,
             prompt,
             threshold,
-            options,
+            model,
             judgment_api_key,
             organization_id,
             is_trace,
@@ -170,7 +161,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
             name=name,
             prompt=prompt,
             threshold=threshold,
-            options=options,
+            model=model,
             judgment_api_key=judgment_api_key,
             organization_id=organization_id,
         )
@@ -200,16 +191,13 @@ class BasePromptScorer(ABC, APIScorerConfig):
         self.push_prompt_scorer()
         judgeval_logger.info(f"Successfully updated prompt for {self.name}")
 
-    def set_options(self, options: Dict[str, float]):
+    def set_model(self, model: str):
         """
-        Updates the options with the new options.
-
-        Sample options:
-        {"yes": 1, "no": 0}
+        Updates the model of the scorer.
         """
-        self.options = options
+        self.model = model
         self.push_prompt_scorer()
-        judgeval_logger.info(f"Successfully updated options for {self.name}")
+        judgeval_logger.info(f"Successfully updated model for {self.name}")
 
     def append_to_prompt(self, prompt_addition: str):
         """
@@ -220,23 +208,23 @@ class BasePromptScorer(ABC, APIScorerConfig):
         judgeval_logger.info(f"Successfully appended to prompt for {self.name}")
 
     # Getters
-    def get_threshold(self) -> float | None:
+    def get_threshold(self) -> float:
         """
         Returns the threshold of the scorer.
         """
         return self.threshold
 
-    def get_prompt(self) -> str | None:
+    def get_prompt(self) -> str:
         """
         Returns the prompt of the scorer.
         """
         return self.prompt
 
-    def get_options(self) -> Dict[str, float] | None:
+    def get_model(self) -> str:
         """
-        Returns the options of the scorer.
+        Returns the model of the scorer.
         """
-        return copy(self.options) if self.options is not None else None
+        return self.model
 
     def get_name(self) -> str | None:
         """
@@ -250,9 +238,9 @@ class BasePromptScorer(ABC, APIScorerConfig):
         """
         return {
             "name": self.name,
+            "model": self.model,
             "prompt": self.prompt,
             "threshold": self.threshold,
-            "options": self.options,
         }
 
     def push_prompt_scorer(self):
@@ -263,13 +251,14 @@ class BasePromptScorer(ABC, APIScorerConfig):
             self.name,
             self.prompt,
             self.threshold,
-            self.options,
+            self.model,
             self.judgment_api_key,
             self.organization_id,
+            isinstance(self, TracePromptScorer),
         )
 
     def __str__(self):
-        return f"PromptScorer(name={self.name}, prompt={self.prompt}, threshold={self.threshold}, options={self.options})"
+        return f"PromptScorer(name={self.name}, model={self.model}, prompt={self.prompt}, threshold={self.threshold})"
 
     def model_dump(self, *args, **kwargs) -> Dict[str, Any]:
         base = super().model_dump(*args, **kwargs)
judgeval/scorers/score.py CHANGED
@@ -21,7 +21,7 @@ from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
 
 
 async def safe_a_score_example(
-    scorer: Union[ExampleScorer],
+    scorer: ExampleScorer,
     example: Example,
 ):
     """
judgeval/scorers/utils.py CHANGED
@@ -11,7 +11,4 @@ def clone_scorers(scorers: List[BaseScorer]) -> List[BaseScorer]:
     """
     Creates duplicates of the scorers passed as argument.
     """
-    cloned_scorers = []
-    for s in scorers:
-        cloned_scorers.append(s.model_copy(deep=True))
-    return cloned_scorers
+    return [s.model_copy(deep=True) for s in scorers]
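The helper's behavior is unchanged, just rewritten as a comprehension: each scorer is deep-copied with pydantic's model_copy(deep=True). A short sketch, reusing the (assumed) FaithfulnessScorer import purely as a convenient scorer model:

# Sketch: clone_scorers is annotated for BaseScorer, but it only relies on pydantic's
# model_copy, so any judgeval scorer model illustrates the copy semantics.
from judgeval.scorers import FaithfulnessScorer
from judgeval.scorers.utils import clone_scorers

originals = [FaithfulnessScorer(threshold=0.8)]
clones = clone_scorers(originals)

clones[0].threshold = 0.3  # deep copy: the original scorer is untouched
assert originals[0].threshold == 0.8
assert clones[0] is not originals[0]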