judgeval 0.0.31__py3-none-any.whl → 0.0.33__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. judgeval/__init__.py +3 -1
  2. judgeval/common/s3_storage.py +93 -0
  3. judgeval/common/tracer.py +869 -183
  4. judgeval/constants.py +1 -1
  5. judgeval/data/datasets/dataset.py +5 -1
  6. judgeval/data/datasets/eval_dataset_client.py +2 -2
  7. judgeval/data/sequence.py +16 -26
  8. judgeval/data/sequence_run.py +2 -0
  9. judgeval/judgment_client.py +44 -166
  10. judgeval/rules.py +4 -7
  11. judgeval/run_evaluation.py +2 -2
  12. judgeval/scorers/__init__.py +4 -4
  13. judgeval/scorers/judgeval_scorers/__init__.py +0 -176
  14. judgeval/version_check.py +22 -0
  15. {judgeval-0.0.31.dist-info → judgeval-0.0.33.dist-info}/METADATA +15 -2
  16. judgeval-0.0.33.dist-info/RECORD +63 -0
  17. judgeval/scorers/base_scorer.py +0 -58
  18. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -27
  19. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
  20. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -276
  21. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
  22. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
  23. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
  24. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
  25. judgeval/scorers/judgeval_scorers/local_implementations/comparison/__init__.py +0 -0
  26. judgeval/scorers/judgeval_scorers/local_implementations/comparison/comparison_scorer.py +0 -161
  27. judgeval/scorers/judgeval_scorers/local_implementations/comparison/prompts.py +0 -222
  28. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
  29. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
  30. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
  31. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
  32. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
  33. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
  34. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
  35. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
  36. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
  37. judgeval/scorers/judgeval_scorers/local_implementations/execution_order/__init__.py +0 -3
  38. judgeval/scorers/judgeval_scorers/local_implementations/execution_order/execution_order.py +0 -156
  39. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
  40. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -318
  41. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
  42. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
  43. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -264
  44. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
  45. judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/instruction_adherence.py +0 -232
  46. judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/prompt.py +0 -102
  47. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
  48. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
  49. judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
  50. judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
  51. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -551
  52. judgeval-0.0.31.dist-info/RECORD +0 -96
  53. {judgeval-0.0.31.dist-info → judgeval-0.0.33.dist-info}/WHEEL +0 -0
  54. {judgeval-0.0.31.dist-info → judgeval-0.0.33.dist-info}/licenses/LICENSE.md +0 -0
judgeval/constants.py CHANGED
@@ -43,7 +43,7 @@ JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
  JUDGMENT_SEQUENCE_EVAL_API_URL = f"{ROOT_API}/evaluate_sequence/"
  JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
  JUDGMENT_DATASETS_APPEND_API_URL = f"{ROOT_API}/datasets/insert_examples/"
- JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull/"
+ JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull_for_judgeval/"
  JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
  JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
  JUDGMENT_DATASETS_PROJECT_STATS_API_URL = f"{ROOT_API}/datasets/fetch_stats_by_project/"
judgeval/data/datasets/dataset.py CHANGED
@@ -7,12 +7,13 @@ import yaml
  from dataclasses import dataclass, field
  from typing import List, Union, Literal

- from judgeval.data import Example
+ from judgeval.data import Example, Sequence
  from judgeval.common.logger import debug, error, warning, info

  @dataclass
  class EvalDataset:
  examples: List[Example]
+ sequences: List[Sequence]
  _alias: Union[str, None] = field(default=None)
  _id: Union[str, None] = field(default=None)
  judgment_api_key: str = field(default="")
@@ -21,11 +22,13 @@ class EvalDataset:
  judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"),
  organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
  examples: List[Example] = [],
+ sequences: List[Sequence] = []
  ):
  debug(f"Initializing EvalDataset with {len(examples)} examples")
  if not judgment_api_key:
  warning("No judgment_api_key provided")
  self.examples = examples
+ self.sequences = sequences
  self._alias = None
  self._id = None
  self.judgment_api_key = judgment_api_key
@@ -309,6 +312,7 @@ class EvalDataset:
  return (
  f"{self.__class__.__name__}("
  f"examples={self.examples}, "
+ f"sequences={self.sequences}, "
  f"_alias={self._alias}, "
  f"_id={self._id}"
  f")"
judgeval/data/datasets/eval_dataset_client.py CHANGED
@@ -13,7 +13,7 @@ from judgeval.constants import (
  JUDGMENT_DATASETS_INSERT_API_URL,
  JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
  )
- from judgeval.data import Example
+ from judgeval.data import Example, Sequence
  from judgeval.data.datasets import EvalDataset


@@ -201,8 +201,8 @@ class EvalDatasetClient:

  info(f"Successfully pulled dataset with alias '{alias}'")
  payload = response.json()
-
  dataset.examples = [Example(**e) for e in payload.get("examples", [])]
+ dataset.sequences = [Sequence(**s) for s in payload.get("sequences", [])]
  dataset._alias = payload.get("alias")
  dataset._id = payload.get("id")
  progress.update(
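
Taken together, the two dataset changes let an EvalDataset carry Sequence objects alongside Examples, and a pulled dataset now hydrates both lists from the payload. A minimal sketch of constructing such a dataset; the Example fields (input, actual_output) and the Sequence constructor beyond items are assumptions, only the examples and sequences parameters come from the diff:

    # Hedged sketch: EvalDataset in 0.0.33 accepts `sequences` next to `examples`.
    from judgeval.data import Example, Sequence
    from judgeval.data.datasets import EvalDataset

    example = Example(input="What is 2 + 2?", actual_output="4")   # field names assumed
    sequence = Sequence(items=[example])                           # nested examples/sequences

    dataset = EvalDataset(
        judgment_api_key="<JUDGMENT_API_KEY>",   # placeholder credentials
        organization_id="<JUDGMENT_ORG_ID>",
        examples=[example],
        sequences=[sequence],                    # new field in this release
    )
    print(dataset)  # repr now includes sequences=[...]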
judgeval/data/sequence.py CHANGED
@@ -1,7 +1,7 @@
  from pydantic import BaseModel, Field, field_validator, model_validator
  from typing import List, Optional, Union, Any
  from judgeval.data.example import Example
- from judgeval.scorers import ScorerWrapper, JudgevalScorer
+ from judgeval.scorers import JudgevalScorer, APIJudgmentScorer
  from uuid import uuid4
  from datetime import datetime, timezone

@@ -16,42 +16,32 @@ class Sequence(BaseModel):
  scorers: Optional[Any] = None
  parent_sequence_id: Optional[str] = None
  sequence_order: Optional[int] = 0
+ root_sequence_id: Optional[str] = None
+ inputs: Optional[str] = None
+ output: Optional[str] = None

  @field_validator("scorers")
  def validate_scorer(cls, v):
- loaded_scorers = []
  for scorer in v or []:
- try:
- if isinstance(scorer, ScorerWrapper):
- loaded_scorers.append(scorer.load_implementation())
- else:
- loaded_scorers.append(scorer)
- except Exception as e:
- raise ValueError(f"Failed to load implementation for scorer {scorer}: {str(e)}")
- return loaded_scorers
+ if not isinstance(scorer, APIJudgmentScorer) and not isinstance(scorer, JudgevalScorer):
+ raise ValueError(f"Invalid scorer type: {type(scorer)}")
+ return v

- @model_validator(mode='after')
- def set_parent_sequence_ids(self) -> "Sequence":
- """Recursively set the parent_sequence_id for all nested Sequences."""
- for item in self.items:
- if isinstance(item, Sequence):
- item.parent_sequence_id = self.sequence_id
- # Recurse into deeper nested sequences
- item.set_parent_sequence_ids()
- return self
+ @model_validator(mode="after")
+ def populate_sequence_metadata(self) -> "Sequence":
+ """Recursively set parent_sequence_id, root_sequence_id, and sequence_order."""
+ # If root_sequence_id isn't already set, assign it to self
+ if self.root_sequence_id is None:
+ self.root_sequence_id = self.sequence_id

- @model_validator(mode='after')
- def set_parent_and_order(self) -> "Sequence":
- """Set parent_sequence_id and sequence_order for all items."""
  for idx, item in enumerate(self.items):
- # Set sequence_order for both Example and Sequence objects
  item.sequence_order = idx
-
  if isinstance(item, Sequence):
  item.parent_sequence_id = self.sequence_id
- item.set_parent_and_order() # Recurse for nested sequences
+ item.root_sequence_id = self.root_sequence_id
+ item.populate_sequence_metadata()
  return self
-
+
  class Config:
  arbitrary_types_allowed = True

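The single populate_sequence_metadata validator replaces the two earlier validators: at construction time it assigns sequence_order to every item and pushes parent_sequence_id and root_sequence_id down into nested Sequences. A rough sketch of the resulting behavior, with Example field names assumed for illustration:

    # Hedged sketch of the metadata propagation introduced above.
    from judgeval.data import Example, Sequence

    inner = Sequence(items=[Example(input="step", actual_output="ok")])
    outer = Sequence(items=[inner, Example(input="final", actual_output="done")])

    # The outermost sequence is its own root; nested sequences inherit the root id,
    # record their parent, and every item gets its list position as sequence_order.
    assert outer.root_sequence_id == outer.sequence_id
    assert inner.parent_sequence_id == outer.sequence_id
    assert inner.root_sequence_id == outer.sequence_id
    assert inner.sequence_order == 0
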
judgeval/data/sequence_run.py CHANGED
@@ -21,6 +21,7 @@ class SequenceRun(BaseModel):
  metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
  judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
  rules (Optional[List[Rule]]): Rules to evaluate against scoring results
+ append (Optional[bool]): Whether to append to existing evaluation results
  """

  # The user will specify whether they want log_results when they call run_eval
@@ -33,6 +34,7 @@
  aggregator: Optional[str] = None
  metadata: Optional[Dict[str, Any]] = None
  trace_span_id: Optional[str] = None
+ append: Optional[bool] = False
  # API Key will be "" until user calls client.run_eval(), then API Key will be set
  judgment_api_key: Optional[str] = ""
  override: Optional[bool] = False
judgeval/judgment_client.py CHANGED
@@ -17,7 +17,6 @@ from judgeval.scorers import (
  APIJudgmentScorer,
  JudgevalScorer,
  ClassifierScorer,
- ScorerWrapper,
  )
  from judgeval.evaluation_run import EvaluationRun
  from judgeval.run_evaluation import (
@@ -74,7 +73,7 @@ class JudgmentClient(metaclass=SingletonMeta):
  def a_run_evaluation(
  self,
  examples: List[Example],
- scorers: List[Union[ScorerWrapper, JudgevalScorer]],
+ scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
  model: Union[str, List[str], JudgevalJudge],
  aggregator: Optional[str] = None,
  metadata: Optional[Dict[str, Any]] = None,
@@ -83,49 +82,57 @@ class JudgmentClient(metaclass=SingletonMeta):
  eval_run_name: str = "default_eval_run",
  override: bool = False,
  append: bool = False,
- use_judgment: bool = True,
  ignore_errors: bool = True,
  rules: Optional[List[Rule]] = None
  ) -> List[ScoringResult]:
- return self.run_evaluation(examples, scorers, model, aggregator, metadata, log_results, project_name, eval_run_name, override, append, use_judgment, ignore_errors, True, rules)
+ return self.run_evaluation(
+ examples=examples,
+ scorers=scorers,
+ model=model,
+ aggregator=aggregator,
+ metadata=metadata,
+ log_results=log_results,
+ project_name=project_name,
+ eval_run_name=eval_run_name,
+ override=override,
+ append=append,
+ ignore_errors=ignore_errors,
+ rules=rules
+ )

  def run_sequence_evaluation(
  self,
  sequences: List[Sequence],
  model: Union[str, List[str], JudgevalJudge],
+ scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
  aggregator: Optional[str] = None,
  project_name: str = "default_project",
  eval_run_name: str = "default_eval_sequence",
- use_judgment: bool = True,
  log_results: bool = True,
+ append: bool = False,
  override: bool = False,
  ignore_errors: bool = True,
  rules: Optional[List[Rule]] = None
  ) -> List[ScoringResult]:
  try:
- if rules:
- loaded_rules = []
- for rule in rules:
- try:
- processed_conditions = []
- for condition in rule.conditions:
- # Convert metric if it's a ScorerWrapper
- if isinstance(condition.metric, ScorerWrapper):
- try:
- condition_copy = condition.model_copy()
- condition_copy.metric = condition.metric.load_implementation(use_judgment=use_judgment)
- processed_conditions.append(condition_copy)
- except Exception as e:
- raise ValueError(f"Failed to convert ScorerWrapper to implementation in rule '{rule.name}', condition metric '{condition.metric}': {str(e)}")
- else:
- processed_conditions.append(condition)
-
- # Create new rule with processed conditions
- new_rule = rule.model_copy()
- new_rule.conditions = processed_conditions
- loaded_rules.append(new_rule)
- except Exception as e:
- raise ValueError(f"Failed to process rule '{rule.name}': {str(e)}")
+ def get_all_sequences(root: Sequence) -> List[Sequence]:
+ all_sequences = [root]
+
+ for item in root.items:
+ if isinstance(item, Sequence):
+ all_sequences.extend(get_all_sequences(item))
+
+ return all_sequences
+
+ def flatten_sequence_list(sequences: List[Sequence]) -> List[Sequence]:
+ flattened = []
+ for seq in sequences:
+ flattened.extend(get_all_sequences(seq))
+ return flattened
+
+ flattened_sequences = flatten_sequence_list(sequences)
+ for sequence in flattened_sequences:
+ sequence.scorers = scorers

  sequence_run = SequenceRun(
  project_name=project_name,
@@ -134,11 +141,11 @@ class JudgmentClient(metaclass=SingletonMeta):
  model=model,
  aggregator=aggregator,
  log_results=log_results,
+ append=append,
  judgment_api_key=self.judgment_api_key,
  organization_id=self.organization_id
  )
-
- return run_sequence_eval(sequence_run, override, ignore_errors, use_judgment)
+ return run_sequence_eval(sequence_run, override, ignore_errors)
  except ValueError as e:
  raise ValueError(f"Please check your SequenceRun object, one or more fields are invalid: \n{str(e)}")
  except Exception as e:
@@ -147,7 +154,7 @@ class JudgmentClient(metaclass=SingletonMeta):
  def run_evaluation(
  self,
  examples: Union[List[Example], List[CustomExample]],
- scorers: List[Union[ScorerWrapper, JudgevalScorer]],
+ scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
  model: Union[str, List[str], JudgevalJudge],
  aggregator: Optional[str] = None,
  metadata: Optional[Dict[str, Any]] = None,
@@ -156,7 +163,6 @@ class JudgmentClient(metaclass=SingletonMeta):
  eval_run_name: str = "default_eval_run",
  override: bool = False,
  append: bool = False,
- use_judgment: bool = True,
  ignore_errors: bool = True,
  async_execution: bool = False,
  rules: Optional[List[Rule]] = None
@@ -166,7 +172,7 @@ class JudgmentClient(metaclass=SingletonMeta):

  Args:
  examples (Union[List[Example], List[CustomExample]]): The examples to evaluate
- scorers (List[Union[ScorerWrapper, JudgevalScorer]]): A list of scorers to use for evaluation
+ scorers (List[Union[APIJudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
  model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
  aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
  metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run
@@ -174,7 +180,6 @@ class JudgmentClient(metaclass=SingletonMeta):
  project_name (str): The name of the project the evaluation results belong to
  eval_run_name (str): A name for this evaluation run
  override (bool): Whether to override an existing evaluation run with the same name
- use_judgment (bool): Whether to use Judgment API for evaluation
  ignore_errors (bool): Whether to ignore errors during evaluation (safely handled)
  rules (Optional[List[Rule]]): Rules to evaluate against scoring results

@@ -185,58 +190,21 @@ class JudgmentClient(metaclass=SingletonMeta):
  raise ValueError("Cannot set both override and append to True. Please choose one.")

  try:
- # Load appropriate implementations for all scorers
- loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = []
- for scorer in scorers:
- try:
- if isinstance(scorer, ScorerWrapper):
- loaded_scorers.append(scorer.load_implementation(use_judgment=use_judgment))
- else:
- loaded_scorers.append(scorer)
- except Exception as e:
- raise ValueError(f"Failed to load implementation for scorer {scorer}: {str(e)}")
-
- # Prevent using JudgevalScorer with rules - only APIJudgmentScorer allowed with rules
- if rules and any(isinstance(scorer, JudgevalScorer) for scorer in loaded_scorers):
+ if rules and any(isinstance(scorer, JudgevalScorer) for scorer in scorers):
  raise ValueError("Cannot use Judgeval scorers (only API scorers) when using rules. Please either remove rules or use only APIJudgmentScorer types.")

- # Convert ScorerWrapper in rules to their implementations
- loaded_rules = None
- if rules:
- loaded_rules = []
- for rule in rules:
- try:
- processed_conditions = []
- for condition in rule.conditions:
- # Convert metric if it's a ScorerWrapper
- if isinstance(condition.metric, ScorerWrapper):
- try:
- condition_copy = condition.model_copy()
- condition_copy.metric = condition.metric.load_implementation(use_judgment=use_judgment)
- processed_conditions.append(condition_copy)
- except Exception as e:
- raise ValueError(f"Failed to convert ScorerWrapper to implementation in rule '{rule.name}', condition metric '{condition.metric}': {str(e)}")
- else:
- processed_conditions.append(condition)
-
- # Create new rule with processed conditions
- new_rule = rule.model_copy()
- new_rule.conditions = processed_conditions
- loaded_rules.append(new_rule)
- except Exception as e:
- raise ValueError(f"Failed to process rule '{rule.name}': {str(e)}")
  eval = EvaluationRun(
  log_results=log_results,
  append=append,
  project_name=project_name,
  eval_name=eval_run_name,
  examples=examples,
- scorers=loaded_scorers,
+ scorers=scorers,
  model=model,
  aggregator=aggregator,
  metadata=metadata,
  judgment_api_key=self.judgment_api_key,
- rules=loaded_rules,
+ rules=rules,
  organization_id=self.organization_id
  )
  return run_eval(eval, override, ignore_errors=ignore_errors, async_execution=async_execution)
@@ -244,98 +212,6 @@ class JudgmentClient(metaclass=SingletonMeta):
  raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
  except Exception as e:
  raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
-
- def evaluate_dataset(
- self,
- dataset: EvalDataset,
- scorers: List[Union[ScorerWrapper, JudgevalScorer]],
- model: Union[str, List[str], JudgevalJudge],
- aggregator: Optional[str] = None,
- metadata: Optional[Dict[str, Any]] = None,
- project_name: str = "",
- eval_run_name: str = "",
- log_results: bool = True,
- use_judgment: bool = True,
- rules: Optional[List[Rule]] = None
- ) -> List[ScoringResult]:
- """
- Executes an evaluation of a `EvalDataset` using one or more `Scorer`s
-
- Args:
- dataset (EvalDataset): The dataset containing examples to evaluate
- scorers (List[Union[ScorerWrapper, JudgevalScorer]]): A list of scorers to use for evaluation
- model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
- aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
- metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run
- project_name (str): The name of the project the evaluation results belong to
- eval_run_name (str): A name for this evaluation run
- log_results (bool): Whether to log the results to the Judgment API
- use_judgment (bool): Whether to use Judgment API for evaluation
- rules (Optional[List[Rule]]): Rules to evaluate against scoring results
-
- Returns:
- List[ScoringResult]: The results of the evaluation
- """
- try:
- # Load appropriate implementations for all scorers
- loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = []
- for scorer in scorers:
- try:
- if isinstance(scorer, ScorerWrapper):
- loaded_scorers.append(scorer.load_implementation(use_judgment=use_judgment))
- else:
- loaded_scorers.append(scorer)
- except Exception as e:
- raise ValueError(f"Failed to load implementation for scorer {scorer}: {str(e)}")
-
- # Prevent using JudgevalScorer with rules - only APIJudgmentScorer allowed with rules
- if rules and any(isinstance(scorer, JudgevalScorer) for scorer in loaded_scorers):
- raise ValueError("Cannot use Judgeval scorers (only API scorers) when using rules. Please either remove rules or use only APIJudgmentScorer types.")
-
- # Convert ScorerWrapper in rules to their implementations
- loaded_rules = None
- if rules:
- loaded_rules = []
- for rule in rules:
- try:
- processed_conditions = []
- for condition in rule.conditions:
- # Convert metric if it's a ScorerWrapper
- if isinstance(condition.metric, ScorerWrapper):
- try:
- condition_copy = condition.model_copy()
- condition_copy.metric = condition.metric.load_implementation(use_judgment=use_judgment)
- processed_conditions.append(condition_copy)
- except Exception as e:
- raise ValueError(f"Failed to convert ScorerWrapper to implementation in rule '{rule.name}', condition metric '{condition.metric}': {str(e)}")
- else:
- processed_conditions.append(condition)
-
- # Create new rule with processed conditions
- new_rule = rule.model_copy()
- new_rule.conditions = processed_conditions
- loaded_rules.append(new_rule)
- except Exception as e:
- raise ValueError(f"Failed to process rule '{rule.name}': {str(e)}")
-
- evaluation_run = EvaluationRun(
- log_results=log_results,
- project_name=project_name,
- eval_name=eval_run_name,
- examples=dataset.examples,
- scorers=loaded_scorers,
- model=model,
- aggregator=aggregator,
- metadata=metadata,
- judgment_api_key=self.judgment_api_key,
- rules=loaded_rules,
- organization_id=self.organization_id
- )
- return run_eval(evaluation_run)
- except ValueError as e:
- raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
- except Exception as e:
- raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")

  def create_dataset(self) -> EvalDataset:
  return self.eval_dataset_client.create_dataset()
@@ -566,6 +442,8 @@ class JudgmentClient(metaclass=SingletonMeta):
  raise JudgmentAPIError(f"Failed to fetch classifier scorer '{slug}': {response.json().get('detail', '')}")

  scorer_config = response.json()
+ created_at = scorer_config.pop("created_at")
+ updated_at = scorer_config.pop("updated_at")

  try:
  return ClassifierScorer(**scorer_config)
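
Net effect of the client changes: ScorerWrapper and the use_judgment flag are gone, scorers are passed directly as APIJudgmentScorer or JudgevalScorer instances, run_sequence_evaluation now accepts scorers and append itself and fans the scorers out to every nested sequence, and run_evaluation forwards append into the EvaluationRun. A hedged usage sketch; the Example fields, the judge model string, and the FaithfulnessScorer import are assumptions, while the keyword arguments mirror the signatures in the diff:

    # Hedged sketch of the 0.0.33 client surface; not an authoritative example.
    from judgeval import JudgmentClient           # top-level re-export assumed
    from judgeval.data import Example, Sequence
    from judgeval.scorers import AnswerRelevancyScorer, FaithfulnessScorer

    client = JudgmentClient()                     # reads JUDGMENT_API_KEY / JUDGMENT_ORG_ID

    results = client.run_evaluation(
        examples=[Example(input="q", actual_output="a")],
        scorers=[FaithfulnessScorer(threshold=0.7)],   # passed directly, no ScorerWrapper
        model="gpt-4o",                                # judge model string assumed
        project_name="demo_project",
        eval_run_name="run_1",
        append=True,                                   # append instead of override
    )

    sequence_results = client.run_sequence_evaluation(
        sequences=[Sequence(items=[Example(input="q", actual_output="a")])],
        scorers=[AnswerRelevancyScorer(threshold=0.5)],  # applied to every nested sequence
        model="gpt-4o",
        project_name="demo_project",
        eval_run_name="seq_run_1",
    )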
judgeval/rules.py CHANGED
@@ -10,7 +10,7 @@ from concurrent.futures import ThreadPoolExecutor
  import time
  import uuid

- from judgeval.scorers import APIJudgmentScorer, JudgevalScorer, ScorerWrapper
+ from judgeval.scorers import APIJudgmentScorer, JudgevalScorer

  class AlertStatus(str, Enum):
  """Status of an alert evaluation."""
@@ -23,22 +23,19 @@ class Condition(BaseModel):

  Example:
  {
- "metric": FaithfulnessScorer(threshold=0.7) # Must be a scorer object: APIJudgmentScorer, JudgevalScorer, or ScorerWrapper
+ "metric": FaithfulnessScorer(threshold=0.7) # Must be a scorer object: APIJudgmentScorer, JudgevalScorer
  }

  The Condition class uses the scorer's threshold and success function internally.
  """
  model_config = ConfigDict(arbitrary_types_allowed=True)

- metric: Union[APIJudgmentScorer, JudgevalScorer, ScorerWrapper]
+ metric: Union[APIJudgmentScorer, JudgevalScorer]

  @property
  def metric_name(self) -> str:
  """Get the name of the metric for lookups in scores dictionary."""
- if isinstance(self.metric, ScorerWrapper):
- # Handle ScorerWrapper case specifically
- return self.metric.scorer.score_type if hasattr(self.metric.scorer, 'score_type') else str(self.metric.scorer)
- elif hasattr(self.metric, 'score_type'):
+ if hasattr(self.metric, 'score_type'):
  # Handle APIJudgmentScorer and JudgevalScorer which have score_type
  return self.metric.score_type
  elif hasattr(self.metric, '__name__'):
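
With ScorerWrapper removed, Condition.metric takes an APIJudgmentScorer or JudgevalScorer directly and metric_name resolves through the scorer's score_type. A minimal sketch; only Condition, metric, and metric_name come from the diff, and the FaithfulnessScorer import path is assumed:

    # Hedged sketch: a Condition built around a scorer object, no wrapper needed.
    from judgeval.rules import Condition
    from judgeval.scorers import FaithfulnessScorer

    condition = Condition(metric=FaithfulnessScorer(threshold=0.7))
    print(condition.metric_name)  # resolved from the scorer's score_type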
judgeval/run_evaluation.py CHANGED
@@ -334,9 +334,9 @@ def check_examples(examples: List[Example], scorers: List[APIJudgmentScorer]) ->
  # Example ID (usually random UUID) does not provide any helpful information for the user but printing the entire example is overdoing it
  print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")

- def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_errors: bool = True, async_execution: bool = False) -> List[ScoringResult]:
+ def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_errors: bool = True) -> List[ScoringResult]:
  # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
- if not override and sequence_run.log_results:
+ if not override and sequence_run.log_results and not sequence_run.append:
  check_eval_run_name_exists(
  sequence_run.eval_name,
  sequence_run.project_name,
judgeval/scorers/__init__.py CHANGED
@@ -1,7 +1,7 @@
  from judgeval.scorers.api_scorer import APIJudgmentScorer
  from judgeval.scorers.judgeval_scorer import JudgevalScorer
  from judgeval.scorers.prompt_scorer import PromptScorer, ClassifierScorer
- from judgeval.scorers.judgeval_scorers import (
+ from judgeval.scorers.judgeval_scorers.api_scorers import (
  ExecutionOrderScorer,
  JSONCorrectnessScorer,
  SummarizationScorer,
@@ -11,14 +11,15 @@ from judgeval.scorers.judgeval_scorers import (
  ContextualPrecisionScorer,
  ContextualRecallScorer,
  AnswerRelevancyScorer,
- ScorerWrapper,
  AnswerCorrectnessScorer,
- Text2SQLScorer,
  ComparisonScorer,
  InstructionAdherenceScorer,
  GroundednessScorer,
  DerailmentScorer,
  )
+ from judgeval.scorers.judgeval_scorers.classifiers import (
+ Text2SQLScorer,
+ )

  __all__ = [
  "APIJudgmentScorer",
@@ -34,7 +35,6 @@ __all__ = [
  "ContextualPrecisionScorer",
  "ContextualRecallScorer",
  "AnswerRelevancyScorer",
- "ScorerWrapper",
  "AnswerCorrectnessScorer",
  "Text2SQLScorer",
  "ComparisonScorer",