judgeval 0.0.32__py3-none-any.whl → 0.0.33__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. judgeval/common/s3_storage.py +93 -0
  2. judgeval/common/tracer.py +612 -123
  3. judgeval/data/sequence.py +4 -10
  4. judgeval/judgment_client.py +25 -86
  5. judgeval/rules.py +4 -7
  6. judgeval/run_evaluation.py +1 -1
  7. judgeval/scorers/__init__.py +4 -4
  8. judgeval/scorers/judgeval_scorers/__init__.py +0 -176
  9. {judgeval-0.0.32.dist-info → judgeval-0.0.33.dist-info}/METADATA +15 -2
  10. judgeval-0.0.33.dist-info/RECORD +63 -0
  11. judgeval/scorers/base_scorer.py +0 -58
  12. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -27
  13. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
  14. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -276
  15. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
  16. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
  17. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
  18. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
  19. judgeval/scorers/judgeval_scorers/local_implementations/comparison/__init__.py +0 -0
  20. judgeval/scorers/judgeval_scorers/local_implementations/comparison/comparison_scorer.py +0 -161
  21. judgeval/scorers/judgeval_scorers/local_implementations/comparison/prompts.py +0 -222
  22. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
  23. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
  24. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
  25. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
  26. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
  27. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
  28. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
  29. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
  30. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
  31. judgeval/scorers/judgeval_scorers/local_implementations/execution_order/__init__.py +0 -3
  32. judgeval/scorers/judgeval_scorers/local_implementations/execution_order/execution_order.py +0 -156
  33. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
  34. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -318
  35. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
  36. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
  37. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -264
  38. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
  39. judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/instruction_adherence.py +0 -232
  40. judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/prompt.py +0 -102
  41. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
  42. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
  43. judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
  44. judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
  45. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -551
  46. judgeval-0.0.32.dist-info/RECORD +0 -97
  47. {judgeval-0.0.32.dist-info → judgeval-0.0.33.dist-info}/WHEEL +0 -0
  48. {judgeval-0.0.32.dist-info → judgeval-0.0.33.dist-info}/licenses/LICENSE.md +0 -0
judgeval/data/sequence.py CHANGED
@@ -1,7 +1,7 @@
 from pydantic import BaseModel, Field, field_validator, model_validator
 from typing import List, Optional, Union, Any
 from judgeval.data.example import Example
-from judgeval.scorers import ScorerWrapper, JudgevalScorer
+from judgeval.scorers import JudgevalScorer, APIJudgmentScorer
 from uuid import uuid4
 from datetime import datetime, timezone
 
@@ -22,16 +22,10 @@ class Sequence(BaseModel):
 
     @field_validator("scorers")
     def validate_scorer(cls, v):
-        loaded_scorers = []
         for scorer in v or []:
-            try:
-                if isinstance(scorer, ScorerWrapper):
-                    loaded_scorers.append(scorer.load_implementation())
-                else:
-                    loaded_scorers.append(scorer)
-            except Exception as e:
-                raise ValueError(f"Failed to load implementation for scorer {scorer}: {str(e)}")
-        return loaded_scorers
+            if not isinstance(scorer, APIJudgmentScorer) and not isinstance(scorer, JudgevalScorer):
+                raise ValueError(f"Invalid scorer type: {type(scorer)}")
+        return v
 
     @model_validator(mode="after")
     def populate_sequence_metadata(self) -> "Sequence":
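Editor's note: the net effect of this validator change is that Sequence no longer lazy-loads ScorerWrapper implementations; it only accepts scorer objects that are already instantiated. A minimal sketch of the 0.0.33 behavior (the import path for Sequence and any fields beyond scorers are assumptions, not shown in this diff):

from judgeval.data import Sequence  # assumed re-export of judgeval/data/sequence.py
from judgeval.scorers import FaithfulnessScorer

# Concrete APIJudgmentScorer/JudgevalScorer instances pass validation.
seq = Sequence(scorers=[FaithfulnessScorer(threshold=0.7)])

# Anything else now fails fast:
# Sequence(scorers=["faithfulness"])  ->  ValueError: Invalid scorer type: <class 'str'>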
judgeval/judgment_client.py CHANGED
@@ -17,7 +17,6 @@ from judgeval.scorers import (
     APIJudgmentScorer,
     JudgevalScorer,
     ClassifierScorer,
-    ScorerWrapper,
 )
 from judgeval.evaluation_run import EvaluationRun
 from judgeval.run_evaluation import (
@@ -74,7 +73,7 @@ class JudgmentClient(metaclass=SingletonMeta):
     def a_run_evaluation(
         self,
         examples: List[Example],
-        scorers: List[Union[ScorerWrapper, JudgevalScorer]],
+        scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
         model: Union[str, List[str], JudgevalJudge],
         aggregator: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
@@ -83,21 +82,32 @@ class JudgmentClient(metaclass=SingletonMeta):
         eval_run_name: str = "default_eval_run",
         override: bool = False,
         append: bool = False,
-        use_judgment: bool = True,
         ignore_errors: bool = True,
         rules: Optional[List[Rule]] = None
     ) -> List[ScoringResult]:
-        return self.run_evaluation(examples, scorers, model, aggregator, metadata, log_results, project_name, eval_run_name, override, append, use_judgment, ignore_errors, True, rules)
+        return self.run_evaluation(
+            examples=examples,
+            scorers=scorers,
+            model=model,
+            aggregator=aggregator,
+            metadata=metadata,
+            log_results=log_results,
+            project_name=project_name,
+            eval_run_name=eval_run_name,
+            override=override,
+            append=append,
+            ignore_errors=ignore_errors,
+            rules=rules
+        )
 
     def run_sequence_evaluation(
         self,
         sequences: List[Sequence],
         model: Union[str, List[str], JudgevalJudge],
-        scorers: List[Union[ScorerWrapper, JudgevalScorer]],
+        scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
         aggregator: Optional[str] = None,
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_sequence",
-        use_judgment: bool = True,
         log_results: bool = True,
         append: bool = False,
         override: bool = False,
@@ -105,16 +115,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         rules: Optional[List[Rule]] = None
     ) -> List[ScoringResult]:
         try:
-            loaded_scorers = []
-            for scorer in scorers:
-                try:
-                    if isinstance(scorer, ScorerWrapper):
-                        loaded_scorers.append(scorer.load_implementation())
-                    else:
-                        loaded_scorers.append(scorer)
-                except Exception as e:
-                    raise ValueError(f"Failed to load implementation for scorer {scorer}: {str(e)}")
-
             def get_all_sequences(root: Sequence) -> List[Sequence]:
                 all_sequences = [root]
 
@@ -132,31 +132,7 @@ class JudgmentClient(metaclass=SingletonMeta):
 
             flattened_sequences = flatten_sequence_list(sequences)
             for sequence in flattened_sequences:
-                sequence.scorers = loaded_scorers
-
-            if rules:
-                loaded_rules = []
-                for rule in rules:
-                    try:
-                        processed_conditions = []
-                        for condition in rule.conditions:
-                            # Convert metric if it's a ScorerWrapper
-                            if isinstance(condition.metric, ScorerWrapper):
-                                try:
-                                    condition_copy = condition.model_copy()
-                                    condition_copy.metric = condition.metric.load_implementation(use_judgment=use_judgment)
-                                    processed_conditions.append(condition_copy)
-                                except Exception as e:
-                                    raise ValueError(f"Failed to convert ScorerWrapper to implementation in rule '{rule.name}', condition metric '{condition.metric}': {str(e)}")
-                            else:
-                                processed_conditions.append(condition)
-
-                        # Create new rule with processed conditions
-                        new_rule = rule.model_copy()
-                        new_rule.conditions = processed_conditions
-                        loaded_rules.append(new_rule)
-                    except Exception as e:
-                        raise ValueError(f"Failed to process rule '{rule.name}': {str(e)}")
+                sequence.scorers = scorers
 
             sequence_run = SequenceRun(
                 project_name=project_name,
@@ -169,7 +145,7 @@ class JudgmentClient(metaclass=SingletonMeta):
                 judgment_api_key=self.judgment_api_key,
                 organization_id=self.organization_id
             )
-            return run_sequence_eval(sequence_run, override, ignore_errors, use_judgment)
+            return run_sequence_eval(sequence_run, override, ignore_errors)
         except ValueError as e:
             raise ValueError(f"Please check your SequenceRun object, one or more fields are invalid: \n{str(e)}")
         except Exception as e:
@@ -178,7 +154,7 @@ class JudgmentClient(metaclass=SingletonMeta):
     def run_evaluation(
         self,
         examples: Union[List[Example], List[CustomExample]],
-        scorers: List[Union[ScorerWrapper, JudgevalScorer]],
+        scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
         model: Union[str, List[str], JudgevalJudge],
         aggregator: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
@@ -187,7 +163,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         eval_run_name: str = "default_eval_run",
         override: bool = False,
         append: bool = False,
-        use_judgment: bool = True,
         ignore_errors: bool = True,
         async_execution: bool = False,
         rules: Optional[List[Rule]] = None
@@ -197,7 +172,7 @@ class JudgmentClient(metaclass=SingletonMeta):
 
         Args:
             examples (Union[List[Example], List[CustomExample]]): The examples to evaluate
-            scorers (List[Union[ScorerWrapper, JudgevalScorer]]): A list of scorers to use for evaluation
+            scorers (List[Union[APIJudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
             model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
             aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
             metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run
@@ -205,7 +180,6 @@ class JudgmentClient(metaclass=SingletonMeta):
             project_name (str): The name of the project the evaluation results belong to
             eval_run_name (str): A name for this evaluation run
             override (bool): Whether to override an existing evaluation run with the same name
-            use_judgment (bool): Whether to use Judgment API for evaluation
             ignore_errors (bool): Whether to ignore errors during evaluation (safely handled)
             rules (Optional[List[Rule]]): Rules to evaluate against scoring results
 
@@ -216,58 +190,21 @@ class JudgmentClient(metaclass=SingletonMeta):
             raise ValueError("Cannot set both override and append to True. Please choose one.")
 
         try:
-            # Load appropriate implementations for all scorers
-            loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = []
-            for scorer in scorers:
-                try:
-                    if isinstance(scorer, ScorerWrapper):
-                        loaded_scorers.append(scorer.load_implementation(use_judgment=use_judgment))
-                    else:
-                        loaded_scorers.append(scorer)
-                except Exception as e:
-                    raise ValueError(f"Failed to load implementation for scorer {scorer}: {str(e)}")
-
-            # Prevent using JudgevalScorer with rules - only APIJudgmentScorer allowed with rules
-            if rules and any(isinstance(scorer, JudgevalScorer) for scorer in loaded_scorers):
+            if rules and any(isinstance(scorer, JudgevalScorer) for scorer in scorers):
                 raise ValueError("Cannot use Judgeval scorers (only API scorers) when using rules. Please either remove rules or use only APIJudgmentScorer types.")
 
-            # Convert ScorerWrapper in rules to their implementations
-            loaded_rules = None
-            if rules:
-                loaded_rules = []
-                for rule in rules:
-                    try:
-                        processed_conditions = []
-                        for condition in rule.conditions:
-                            # Convert metric if it's a ScorerWrapper
-                            if isinstance(condition.metric, ScorerWrapper):
-                                try:
-                                    condition_copy = condition.model_copy()
-                                    condition_copy.metric = condition.metric.load_implementation(use_judgment=use_judgment)
-                                    processed_conditions.append(condition_copy)
-                                except Exception as e:
-                                    raise ValueError(f"Failed to convert ScorerWrapper to implementation in rule '{rule.name}', condition metric '{condition.metric}': {str(e)}")
-                            else:
-                                processed_conditions.append(condition)
-
-                        # Create new rule with processed conditions
-                        new_rule = rule.model_copy()
-                        new_rule.conditions = processed_conditions
-                        loaded_rules.append(new_rule)
-                    except Exception as e:
-                        raise ValueError(f"Failed to process rule '{rule.name}': {str(e)}")
             eval = EvaluationRun(
                 log_results=log_results,
                 append=append,
                 project_name=project_name,
                 eval_name=eval_run_name,
                 examples=examples,
-                scorers=loaded_scorers,
+                scorers=scorers,
                 model=model,
                 aggregator=aggregator,
                 metadata=metadata,
                 judgment_api_key=self.judgment_api_key,
-                rules=loaded_rules,
+                rules=rules,
                 organization_id=self.organization_id
             )
             return run_eval(eval, override, ignore_errors=ignore_errors, async_execution=async_execution)
@@ -505,6 +442,8 @@ class JudgmentClient(metaclass=SingletonMeta):
             raise JudgmentAPIError(f"Failed to fetch classifier scorer '{slug}': {response.json().get('detail', '')}")
 
         scorer_config = response.json()
+        created_at = scorer_config.pop("created_at")
+        updated_at = scorer_config.pop("updated_at")
 
         try:
             return ClassifierScorer(**scorer_config)
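Editor's note: with use_judgment removed and the wrapper-loading step gone, scorers and rules are passed straight through to EvaluationRun. One behavioral detail visible above: a_run_evaluation previously forwarded True positionally for async_execution, while the rewritten keyword call omits it, so async_execution now falls back to its default (False). A sketch of the 0.0.33 call shape (example data and model name are illustrative):

from judgeval.judgment_client import JudgmentClient
from judgeval.data import Example  # assumed re-export
from judgeval.scorers import AnswerRelevancyScorer, FaithfulnessScorer

client = JudgmentClient()  # picks up the Judgment API key from the environment

results = client.run_evaluation(
    examples=[Example(input="What does judgeval do?", actual_output="It evaluates LLM outputs.")],
    scorers=[FaithfulnessScorer(threshold=0.7), AnswerRelevancyScorer(threshold=0.5)],
    model="gpt-4o",
    project_name="default_project",
    eval_run_name="smoke_test",
    # use_judgment=True  ->  TypeError in 0.0.33; the parameter no longer exists
)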
judgeval/rules.py CHANGED
@@ -10,7 +10,7 @@ from concurrent.futures import ThreadPoolExecutor
 import time
 import uuid
 
-from judgeval.scorers import APIJudgmentScorer, JudgevalScorer, ScorerWrapper
+from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
 
 class AlertStatus(str, Enum):
     """Status of an alert evaluation."""
@@ -23,22 +23,19 @@ class Condition(BaseModel):
 
     Example:
         {
-            "metric": FaithfulnessScorer(threshold=0.7) # Must be a scorer object: APIJudgmentScorer, JudgevalScorer, or ScorerWrapper
+            "metric": FaithfulnessScorer(threshold=0.7) # Must be a scorer object: APIJudgmentScorer, JudgevalScorer
         }
 
     The Condition class uses the scorer's threshold and success function internally.
     """
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
-    metric: Union[APIJudgmentScorer, JudgevalScorer, ScorerWrapper]
+    metric: Union[APIJudgmentScorer, JudgevalScorer]
 
     @property
     def metric_name(self) -> str:
         """Get the name of the metric for lookups in scores dictionary."""
-        if isinstance(self.metric, ScorerWrapper):
-            # Handle ScorerWrapper case specifically
-            return self.metric.scorer.score_type if hasattr(self.metric.scorer, 'score_type') else str(self.metric.scorer)
-        elif hasattr(self.metric, 'score_type'):
+        if hasattr(self.metric, 'score_type'):
             # Handle APIJudgmentScorer and JudgevalScorer which have score_type
             return self.metric.score_type
         elif hasattr(self.metric, '__name__'):
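Editor's note: the Condition docstring above already shows the intended usage; only the wrapper branch is gone. A minimal sketch:

from judgeval.rules import Condition
from judgeval.scorers import FaithfulnessScorer

# The metric must now be a concrete APIJudgmentScorer or JudgevalScorer instance.
cond = Condition(metric=FaithfulnessScorer(threshold=0.7))
print(cond.metric_name)  # resolved via the metric's score_type attribute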
judgeval/run_evaluation.py CHANGED
@@ -334,7 +334,7 @@ def check_examples(examples: List[Example], scorers: List[APIJudgmentScorer]) ->
             # Example ID (usually random UUID) does not provide any helpful information for the user but printing the entire example is overdoing it
             print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
 
-def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_errors: bool = True, async_execution: bool = False) -> List[ScoringResult]:
+def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_errors: bool = True) -> List[ScoringResult]:
     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
     if not override and sequence_run.log_results and not sequence_run.append:
         check_eval_run_name_exists(
judgeval/scorers/__init__.py CHANGED
@@ -1,7 +1,7 @@
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.scorers.judgeval_scorer import JudgevalScorer
 from judgeval.scorers.prompt_scorer import PromptScorer, ClassifierScorer
-from judgeval.scorers.judgeval_scorers import (
+from judgeval.scorers.judgeval_scorers.api_scorers import (
     ExecutionOrderScorer,
     JSONCorrectnessScorer,
     SummarizationScorer,
@@ -11,14 +11,15 @@ from judgeval.scorers.judgeval_scorers import (
     ContextualPrecisionScorer,
     ContextualRecallScorer,
     AnswerRelevancyScorer,
-    ScorerWrapper,
     AnswerCorrectnessScorer,
-    Text2SQLScorer,
     ComparisonScorer,
     InstructionAdherenceScorer,
     GroundednessScorer,
     DerailmentScorer,
 )
+from judgeval.scorers.judgeval_scorers.classifiers import (
+    Text2SQLScorer,
+)
 
 __all__ = [
     "APIJudgmentScorer",
@@ -34,7 +35,6 @@ __all__ = [
     "ContextualPrecisionScorer",
     "ContextualRecallScorer",
     "AnswerRelevancyScorer",
-    "ScorerWrapper",
     "AnswerCorrectnessScorer",
     "Text2SQLScorer",
     "ComparisonScorer",
judgeval/scorers/judgeval_scorers/__init__.py CHANGED
@@ -1,176 +0,0 @@
-from typing import Type, Optional, Any
-
-# Import implementations
-from judgeval.scorers.judgeval_scorers.api_scorers import (
-    ExecutionOrderScorer as APIExecutionOrderScorer,
-    JSONCorrectnessScorer as APIJSONCorrectnessScorer,
-    SummarizationScorer as APISummarizationScorer,
-    HallucinationScorer as APIHallucinationScorer,
-    FaithfulnessScorer as APIFaithfulnessScorer,
-    ContextualRelevancyScorer as APIContextualRelevancyScorer,
-    ContextualPrecisionScorer as APIContextualPrecisionScorer,
-    ContextualRecallScorer as APIContextualRecallScorer,
-    AnswerRelevancyScorer as APIAnswerRelevancyScorer,
-    AnswerCorrectnessScorer as APIAnswerCorrectnessScorer,
-    ComparisonScorer as APIComparisonScorer,
-    InstructionAdherenceScorer as APIInstructionAdherenceScorer,
-    GroundednessScorer as APIGroundednessScorer,
-    DerailmentScorer as APIDerailmentScorer,
-)
-
-from judgeval.scorers.judgeval_scorers.local_implementations import (
-    AnswerRelevancyScorer as LocalAnswerRelevancyScorer,
-    ContextualPrecisionScorer as LocalContextualPrecisionScorer,
-    ContextualRecallScorer as LocalContextualRecallScorer,
-    ContextualRelevancyScorer as LocalContextualRelevancyScorer,
-    FaithfulnessScorer as LocalFaithfulnessScorer,
-    JsonCorrectnessScorer as LocalJsonCorrectnessScorer,
-    ExecutionOrderScorer as LocalExecutionOrderScorer,
-    HallucinationScorer as LocalHallucinationScorer,
-    SummarizationScorer as LocalSummarizationScorer,
-    AnswerCorrectnessScorer as LocalAnswerCorrectnessScorer,
-    ComparisonScorer as LocalComparisonScorer,
-    InstructionAdherenceScorer as LocalInstructionAdherenceScorer,
-)
-
-from judgeval.scorers.judgeval_scorers.classifiers import Text2SQLScorer
-
-
-class ScorerWrapper:
-    """
-    Wrapper class that can dynamically load either API or local implementation of a scorer.
-    """
-    def __init__(self, api_implementation: Type, local_implementation: Optional[Type] = None):
-        self.api_implementation = api_implementation
-        self.local_implementation = local_implementation
-        self._instance = None
-        self._init_args = None
-        self._init_kwargs = None
-
-    def __call__(self, *args, **kwargs):
-        """Store initialization arguments for later use when implementation is loaded"""
-        self._init_args = args
-        self._init_kwargs = kwargs
-        return self
-
-    def load_implementation(self, use_judgment: bool = True) -> Any:
-        """
-        Load the appropriate implementation based on the use_judgment flag.
-
-        Args:
-            use_judgment (bool): If True, use API implementation. If False, use local implementation.
-
-        Returns:
-            Instance of the appropriate implementation
-
-        Raises:
-            ValueError: If local implementation is requested but not available
-        """
-        if self._instance is not None:
-            return self._instance
-
-        if use_judgment:
-            implementation = self.api_implementation
-        else:
-            if self.local_implementation is None:
-                raise ValueError("No local implementation available for this scorer")
-            implementation = self.local_implementation
-
-        args = self._init_args or ()
-        kwargs = self._init_kwargs or {}
-        self._instance = implementation(*args, **kwargs)
-        return self._instance
-
-    def __getattr__(self, name):
-        """Defer all attribute access to the loaded implementation"""
-        if self._instance is None:
-            raise RuntimeError("Implementation not loaded. Call load_implementation() first")
-        return getattr(self._instance, name)
-
-# Create wrapped versions of all scorers
-
-AnswerCorrectnessScorer = ScorerWrapper(
-    api_implementation=APIAnswerCorrectnessScorer,
-    local_implementation=LocalAnswerCorrectnessScorer
-)
-
-AnswerRelevancyScorer = ScorerWrapper(
-    api_implementation=APIAnswerRelevancyScorer,
-    local_implementation=LocalAnswerRelevancyScorer
-)
-
-ExecutionOrderScorer = ScorerWrapper(
-    api_implementation=APIExecutionOrderScorer,
-    local_implementation=LocalExecutionOrderScorer
-)
-
-JSONCorrectnessScorer = ScorerWrapper(
-    api_implementation=APIJSONCorrectnessScorer,
-    local_implementation=LocalJsonCorrectnessScorer
-)
-
-SummarizationScorer = ScorerWrapper(
-    api_implementation=APISummarizationScorer,
-    local_implementation=LocalSummarizationScorer
-)
-
-HallucinationScorer = ScorerWrapper(
-    api_implementation=APIHallucinationScorer,
-    local_implementation=LocalHallucinationScorer
-)
-
-FaithfulnessScorer = ScorerWrapper(
-    api_implementation=APIFaithfulnessScorer,
-    local_implementation=LocalFaithfulnessScorer
-)
-
-ContextualRelevancyScorer = ScorerWrapper(
-    api_implementation=APIContextualRelevancyScorer,
-    local_implementation=LocalContextualRelevancyScorer
-)
-
-ContextualPrecisionScorer = ScorerWrapper(
-    api_implementation=APIContextualPrecisionScorer,
-    local_implementation=LocalContextualPrecisionScorer
-)
-
-ContextualRecallScorer = ScorerWrapper(
-    api_implementation=APIContextualRecallScorer,
-    local_implementation=LocalContextualRecallScorer
-)
-
-InstructionAdherenceScorer = ScorerWrapper(
-    api_implementation=APIInstructionAdherenceScorer,
-    local_implementation=LocalInstructionAdherenceScorer
-)
-
-def ComparisonScorer(threshold: float, criteria: str, description: str):
-    return ScorerWrapper(
-        api_implementation=APIComparisonScorer,
-        local_implementation=LocalComparisonScorer
-    )(threshold=threshold, criteria=criteria, description=description)
-
-GroundednessScorer = ScorerWrapper(
-    api_implementation=APIGroundednessScorer,
-)
-
-DerailmentScorer = ScorerWrapper(
-    api_implementation=APIDerailmentScorer,
-    local_implementation=LocalInstructionAdherenceScorer # TODO: add local implementation
-)
-
-__all__ = [
-    "ExecutionOrderScorer",
-    "JSONCorrectnessScorer",
-    "SummarizationScorer",
-    "HallucinationScorer",
-    "FaithfulnessScorer",
-    "ContextualRelevancyScorer",
-    "ContextualPrecisionScorer",
-    "ContextualRecallScorer",
-    "AnswerRelevancyScorer",
-    "Text2SQLScorer",
-    "ComparisonScorer",
-    "GroundednessScorer",
-    "DerailmentScorer",
-]
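Editor's note: a rough migration sketch for callers of the deleted wrapper. The 0.0.32 half mirrors the file above; note that the local (non-API) implementations have no direct replacement in 0.0.33:

# 0.0.32: names like FaithfulnessScorer were ScorerWrapper instances that
# resolved to an API or local class on demand:
#   scorer = FaithfulnessScorer(threshold=0.7)
#   impl = scorer.load_implementation(use_judgment=True)

# 0.0.33: the same import yields the API-backed scorer directly; use it as-is.
from judgeval.scorers import FaithfulnessScorer

scorer = FaithfulnessScorer(threshold=0.7)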
{judgeval-0.0.32.dist-info → judgeval-0.0.33.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.0.32
+Version: 0.0.33
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -12,12 +12,13 @@ Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.11
 Requires-Dist: anthropic
 Requires-Dist: fastapi
+Requires-Dist: google-genai
 Requires-Dist: langchain
 Requires-Dist: langchain-anthropic
 Requires-Dist: langchain-core
 Requires-Dist: langchain-huggingface
 Requires-Dist: langchain-openai
-Requires-Dist: litellm
+Requires-Dist: litellm==1.38.12
 Requires-Dist: nest-asyncio
 Requires-Dist: openai
 Requires-Dist: openpyxl
@@ -94,9 +95,21 @@ Create a file named `traces.py` with the following code:
 from judgeval.common.tracer import Tracer, wrap
 from openai import OpenAI
 
+# Basic initialization
 client = wrap(OpenAI())
 judgment = Tracer(project_name="my_project")
 
+# Or with S3 storage enabled
+# NOTE: Make sure AWS creds correspond to an account with write access to the specified S3 bucket
+judgment = Tracer(
+    project_name="my_project",
+    use_s3=True,
+    s3_bucket_name="my-traces-bucket",  # Bucket created automatically if it doesn't exist
+    s3_aws_access_key_id="your-access-key",  # Optional: defaults to AWS_ACCESS_KEY_ID env var
+    s3_aws_secret_access_key="your-secret-key",  # Optional: defaults to AWS_SECRET_ACCESS_KEY env var
+    s3_region_name="us-west-1"  # Optional: defaults to AWS_REGION env var or "us-west-1"
+)
+
 @judgment.observe(span_type="tool")
 def my_tool():
     return "Hello world!"
judgeval-0.0.33.dist-info/RECORD ADDED
@@ -0,0 +1,63 @@
+judgeval/__init__.py,sha256=x9HWt4waJwJMAqTuJSg2MezF9Zg-macEjeU-ajbly-8,330
+judgeval/clients.py,sha256=6VQmEqmfCngUdS2MuPBIpHvtDFqOENm8-_BmMvjLyRQ,944
+judgeval/constants.py,sha256=_XmVAkebMyGrDvvanAVlMgVd4p6MLHdEVsTQFI0kz1k,5411
+judgeval/evaluation_run.py,sha256=WGzx-Ug2qhSmunFo8NrmSstBRsOUc5KpKq0Lc51rqsM,6739
+judgeval/judgment_client.py,sha256=brRYmphZR-2IUre9kdOhfse1mYDilcIqUzzH21ROAdk,22208
+judgeval/rules.py,sha256=jkh1cXXcUf8oRY7xJUZfcQBYWn_rjUW4GvrhRt15PeU,20265
+judgeval/run_evaluation.py,sha256=elMpFHahyeukKKa09fmJM3c_afwJ00mbZRqm18l5f00,28481
+judgeval/version_check.py,sha256=bvJEidB7rAeXozoUbN9Yb97QOR_s2hgvpvj74jJ5HlY,943
+judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
+judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
+judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
+judgeval/common/s3_storage.py,sha256=W8wq9S7qJZdqdBR4sk3aEZ4K3-pz40DOoolOJrWs9Vo,3768
+judgeval/common/tracer.py,sha256=YsObK8VQXp1DDbU9xncU8NjuY-JUI54BqmG4olezrZc,92507
+judgeval/common/utils.py,sha256=LUQV5JfDr6wj7xHAJoNq-gofNZ6mjXbeKrGKzBME1KM,33533
+judgeval/data/__init__.py,sha256=xuKx_KCVHGp6CXvQuVmKl3v7pJp-qDaz0NccKxwjtO0,481
+judgeval/data/custom_example.py,sha256=QRBqiRiZS8UgVeTRHY0r1Jzm6yAYsyg6zmHxQGxdiQs,739
+judgeval/data/example.py,sha256=cJrmPGLel_P2sy1UaRvuVSAi35EnA9XMR11Lhp4aDLo,5930
+judgeval/data/result.py,sha256=Gb9tiSDsk1amXgh0cFG6JmlW_BMKxS2kuTwNA0rrHjA,3184
+judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,3273
+judgeval/data/sequence.py,sha256=FmKVdzQP5VTujRCHDWk097MKRR-rJgbsdrxyCKee6tA,1994
+judgeval/data/sequence_run.py,sha256=RmYjfWKMWg-pcF5PLeiWfrhuDkjDZi5VEmAIEXN3Ib0,2104
+judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
+judgeval/data/datasets/dataset.py,sha256=dhLo30hvpmmOK2R6O5wDs_neawUJ4lS8bb4S42SufNQ,13034
+judgeval/data/datasets/eval_dataset_client.py,sha256=xjj66BO9Es9IxXqzQe1RT_e0kpeKlt7OrhRoSuj4KHM,15085
+judgeval/integrations/langgraph.py,sha256=J-cQfFP52TjJewdSTe-fcsUC4HDvjNbXoxmbmF0SgiE,11743
+judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
+judgeval/judges/base_judge.py,sha256=ch_S7uBB7lyv44Lf1d7mIGFpveOO58zOkkpImKgd9_4,994
+judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6KTg,2424
+judgeval/judges/mixture_of_judges.py,sha256=IJoi4Twk8ze1CJWVEp69k6TSqTCTGrmVYQ0qdffer60,15549
+judgeval/judges/together_judge.py,sha256=l00hhPerAZXg3oYBd8cyMtWsOTNt_0FIqoxhKJKQe3k,2302
+judgeval/judges/utils.py,sha256=9lvUxziGV86ISvVFxYBWc09TWFyAQgUTyPf_a9mD5Rs,2686
+judgeval/scorers/__init__.py,sha256=Mk-mWUt_gNpJqY_WIEuQynD6fxc34fWSRSuobMSrj94,1238
+judgeval/scorers/api_scorer.py,sha256=NQ_CrrUPhSUk1k2Q8rKpCG_TU2FT32sFEqvb-Yi54B0,2688
+judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
+judgeval/scorers/judgeval_scorer.py,sha256=79-JJurqHP-qTaWNWInx4SjvQYwXc9lvfPPNgwsh2yA,6773
+judgeval/scorers/prompt_scorer.py,sha256=PaAs2qRolw1P3_I061Xvk9qzvF4O-JR8g_39RqXnHcM,17728
+judgeval/scorers/score.py,sha256=r9QiT4-LIvivcJ6XxByrbswKSO8eQTtAD1UlXT_lcmo,18741
+judgeval/scorers/utils.py,sha256=iHQVTlIANbmCTXz9kTeSdOytgUZ_T74Re61ajqsk_WQ,6827
+judgeval/scorers/judgeval_scorers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=_sDUBxSG536KGqXNi6dFpaYKghjEAadxBxaaxV9HuuE,1764
+judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=Fnd9CVIOZ73sWEWymsU5eBrrZqPFjMZ0BKpeW-PDyTg,711
+judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=oETeN9K0HSIRdL2SDqn82Vskpwh5SlKnZvs5VDm2OBU,658
+judgeval/scorers/judgeval_scorers/api_scorers/comparison.py,sha256=kuzf9OWvpY38yYSwlBgneLkUZwJNM4FQqvbS66keA90,1249
+judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py,sha256=tpSuzFAaW8X9xqA0aLLKwh7qmBK0Pc_bJZMIe_q412U,770
+judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py,sha256=pFVhk4pLtQ-FnNlbI-dFF-SIh69Jza7erHqiPkFWoBo,758
+judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py,sha256=RQ6DZwEhChfecd89Ey-T7ke--7qTaXZlRsNxwH8gaME,823
+judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py,sha256=V9WPuwNMm097V7IknKs8UkmAk0yjnBXTcJha_BHXxTA,475
+judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py,sha256=Pb3CiNF2Ca826B92wJCVAi_68lJjLhqqCKwQKaflSUg,1294
+judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py,sha256=-BwOapqjryYNKNydtdkUiKIij76dY0O1jBmdc6dKazQ,692
+judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py,sha256=ntEEeTANEOsGlcbiTAF_3r6BeSJEaVDns8po8T0L6Vg,692
+judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py,sha256=k5gDOki-8KXrZXydvdSqDt3NZqQ28hXoOCHQf6jNxr4,686
+judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py,sha256=XnSGEkQfwVqaqnHEGMCsxNiHVzrsrej48uDbLoWc8CQ,678
+judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py,sha256=mMKEuR87_yanEuZJ5YSGFMHDD_oLVZ6-rQuciFaDOMA,1095
+judgeval/scorers/judgeval_scorers/api_scorers/summarization.py,sha256=QmWB8bVbDYHY5FcF0rYZE_3c2XXgMLRmR6aXJWfdMC4,655
+judgeval/scorers/judgeval_scorers/classifiers/__init__.py,sha256=Qt81W5ZCwMvBAne0LfQDb8xvg5iOG1vEYP7WizgwAZo,67
+judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py,sha256=8iTzMvou1Dr8pybul6lZHKjc9Ye2-0_racRGYkhEdTY,74
+judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py,sha256=ly72Z7s_c8NID6-nQnuW8qEGEW2MqdvpJ-5WfXzbAQg,2579
+judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
+judgeval/utils/alerts.py,sha256=O19Xj7DA0YVjl8PWiuH4zfdZeu3yiLVvHfY8ah2wG0g,2759
+judgeval-0.0.33.dist-info/METADATA,sha256=KzTkGTHYE8Uplehvtk_7x30XrV0xe1bpd-tU5lt0mHg,6097
+judgeval-0.0.33.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.0.33.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.0.33.dist-info/RECORD,,
judgeval/scorers/base_scorer.py DELETED
@@ -1,58 +0,0 @@
-"""
-Judgment Scorer class.
-
-Scores `Example`s using ready-made Judgment evaluators.
-"""
-
-from pydantic import BaseModel, field_validator
-from judgeval.common.logger import debug, info, warning, error
-
-from judgeval.constants import APIScorer, UNBOUNDED_SCORERS
-
-
-class APIJudgmentScorer(BaseModel):
-    """
-    Class for ready-made, "out-of-the-box" scorer that uses Judgment evaluators to score `Example`s.
-
-    Args:
-        score_type (APIScorer): The Judgment metric to use for scoring `Example`s
-        threshold (float): A value between 0 and 1 that determines the scoring threshold
-    """
-    score_type: APIScorer
-    threshold: float
-
-    @field_validator('threshold')
-    def validate_threshold(cls, v, info):
-        """
-        Validates that the threshold is between 0 and 1 inclusive.
-        """
-        score_type = info.data.get('score_type')
-        if score_type in UNBOUNDED_SCORERS:
-            if v < 0:
-                error(f"Threshold for {score_type} must be greater than 0, got: {v}")
-                raise ValueError(f"Threshold for {score_type} must be greater than 0, got: {v}")
-        else:
-            if not 0 <= v <= 1:
-                error(f"Threshold for {score_type} must be between 0 and 1, got: {v}")
-                raise ValueError(f"Threshold for {score_type} must be between 0 and 1, got: {v}")
-        return v
-
-    @field_validator('score_type')
-    def convert_to_enum_value(cls, v):
-        """
-        Validates that the `score_type` is a valid `JudgmentMetric` enum value.
-        Converts string values to `JudgmentMetric` enum values.
-        """
-        debug(f"Attempting to convert score_type value: {v}")
-        if isinstance(v, APIScorer):
-            info(f"Using existing JudgmentMetric: {v.value}")
-            return v.value
-        elif isinstance(v, str):
-            debug(f"Converting string value to JudgmentMetric enum: {v}")
-            return APIScorer[v.upper()].value
-        error(f"Invalid score_type value: {v}")
-        raise ValueError(f"Invalid value for score_type: {v}")
-
-    def __str__(self):
-        return f"JudgmentScorer(score_type={self.score_type.value}, threshold={self.threshold})"
-