judgeval 0.0.44__py3-none-any.whl → 0.0.46__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. judgeval/__init__.py +5 -4
  2. judgeval/clients.py +6 -6
  3. judgeval/common/__init__.py +7 -2
  4. judgeval/common/exceptions.py +2 -3
  5. judgeval/common/logger.py +74 -49
  6. judgeval/common/s3_storage.py +30 -23
  7. judgeval/common/tracer.py +1273 -939
  8. judgeval/common/utils.py +416 -244
  9. judgeval/constants.py +73 -61
  10. judgeval/data/__init__.py +1 -1
  11. judgeval/data/custom_example.py +3 -2
  12. judgeval/data/datasets/dataset.py +80 -54
  13. judgeval/data/datasets/eval_dataset_client.py +131 -181
  14. judgeval/data/example.py +67 -43
  15. judgeval/data/result.py +11 -9
  16. judgeval/data/scorer_data.py +4 -2
  17. judgeval/data/tool.py +25 -16
  18. judgeval/data/trace.py +57 -29
  19. judgeval/data/trace_run.py +5 -11
  20. judgeval/evaluation_run.py +22 -82
  21. judgeval/integrations/langgraph.py +546 -184
  22. judgeval/judges/base_judge.py +1 -2
  23. judgeval/judges/litellm_judge.py +33 -11
  24. judgeval/judges/mixture_of_judges.py +128 -78
  25. judgeval/judges/together_judge.py +22 -9
  26. judgeval/judges/utils.py +14 -5
  27. judgeval/judgment_client.py +259 -271
  28. judgeval/rules.py +169 -142
  29. judgeval/run_evaluation.py +462 -305
  30. judgeval/scorers/api_scorer.py +20 -11
  31. judgeval/scorers/exceptions.py +1 -0
  32. judgeval/scorers/judgeval_scorer.py +77 -58
  33. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +46 -15
  34. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +3 -2
  35. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +3 -2
  36. judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +12 -11
  37. judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +7 -5
  38. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +3 -2
  39. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +3 -2
  40. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +5 -2
  41. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +2 -1
  42. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +17 -8
  43. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +3 -2
  44. judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +3 -2
  45. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +3 -2
  46. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +3 -2
  47. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +8 -9
  48. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +4 -4
  49. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +5 -5
  50. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +5 -2
  51. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +9 -10
  52. judgeval/scorers/prompt_scorer.py +48 -37
  53. judgeval/scorers/score.py +86 -53
  54. judgeval/scorers/utils.py +11 -7
  55. judgeval/tracer/__init__.py +1 -1
  56. judgeval/utils/alerts.py +23 -12
  57. judgeval/utils/{data_utils.py → file_utils.py} +5 -9
  58. judgeval/utils/requests.py +29 -0
  59. judgeval/version_check.py +5 -2
  60. {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/METADATA +79 -135
  61. judgeval-0.0.46.dist-info/RECORD +69 -0
  62. judgeval-0.0.44.dist-info/RECORD +0 -68
  63. {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/WHEEL +0 -0
  64. {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/licenses/LICENSE.md +0 -0
judgeval/data/result.py CHANGED
@@ -1,6 +1,5 @@
- from dataclasses import dataclass
- from typing import List, Union, Optional, Dict, Any, Union
- from judgeval.common.logger import debug, error
+ from typing import List, Optional, Union
+ from judgeval.common.logger import debug
 
  from pydantic import BaseModel
  from judgeval.data import ScorerData, Example, CustomExample
  from judgeval.data.trace import TraceSpan
@@ -12,13 +11,14 @@ class ScoringResult(BaseModel):
  Ie: One input, one actual_output, one expected_output, etc..., and 1+ scorer (Faithfulness, Hallucination, Summarization, etc...)
 
  Args:
- success (bool): Whether the evaluation was successful.
+ success (bool): Whether the evaluation was successful.
  This means that all scorers applied to this example returned a success.
  scorer_data (List[ScorerData]): The scorers data for the evaluated example
  data_object (Optional[Example]): The original example object that was used to create the ScoringResult, can be Example, CustomExample (future), WorkflowRun (future)
-
+
  """
- # Fields for scoring outputs
+
+ # Fields for scoring outputs
  success: bool # used for unit testing
  scorers_data: Union[List[ScorerData], None]
  name: Optional[str] = None
@@ -26,16 +26,18 @@ class ScoringResult(BaseModel):
  # The original example object that was used to create the ScoringResult
  data_object: Optional[Union[TraceSpan, CustomExample, Example]] = None
  trace_id: Optional[str] = None
-
+
  # Additional fields for internal use
  run_duration: Optional[float] = None
  evaluation_cost: Optional[float] = None
-
+
  def to_dict(self) -> dict:
  """Convert the ScoringResult instance to a dictionary, properly serializing scorer_data."""
  return {
  "success": self.success,
- "scorers_data": [scorer_data.to_dict() for scorer_data in self.scorers_data] if self.scorers_data else None,
+ "scorers_data": [scorer_data.to_dict() for scorer_data in self.scorers_data]
+ if self.scorers_data
+ else None,
  "data_object": self.data_object.to_dict() if self.data_object else None,
  }
 
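Note on the reshaped to_dict above: scorers_data still serializes through each ScorerData.to_dict() only when the list is present and falls back to None otherwise; only the line wrapping changed. A minimal sketch of that behavior, with constructor arguments inferred from the fields visible in this diff (they may not match the released package exactly):

# Hypothetical values; field names are taken from the diff above.
from judgeval.data import ScorerData
from judgeval.data.result import ScoringResult

scorer = ScorerData(name="faithfulness", threshold=0.7, success=True, score=0.9)
result = ScoringResult(success=True, scorers_data=[scorer])
print(result.to_dict()["scorers_data"])  # list of ScorerData dicts

empty = ScoringResult(success=False, scorers_data=None)
print(empty.to_dict()["scorers_data"])  # None, not an empty list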
judgeval/data/scorer_data.py CHANGED
@@ -9,6 +9,7 @@ from pydantic import BaseModel
 
  from judgeval.scorers import JudgevalScorer
 
+
  class ScorerData(BaseModel):
  """
  ScorerData holds the information related to a single, completed Scorer evaluation run.
@@ -18,13 +19,14 @@ class ScorerData(BaseModel):
  information surrounding the evaluation run such as the claims and verdicts generated by the
  judge model(s).
  """
+
  name: str
  threshold: float
  success: bool
  score: Optional[float] = None
  reason: Optional[str] = None
  strict_mode: Optional[bool] = None
- evaluation_model: Union[List[str], str] = None
+ evaluation_model: Union[List[str], str] | None = None
  error: Optional[str] = None
  evaluation_cost: Union[float, None] = None
  verbose_logs: Optional[str] = None
@@ -43,7 +45,7 @@ class ScorerData(BaseModel):
  "error": self.error,
  "evaluation_cost": self.evaluation_cost,
  "verbose_logs": self.verbose_logs,
- "additional_metadata": self.additional_metadata
+ "additional_metadata": self.additional_metadata,
  }
 
 
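One detail worth flagging in the evaluation_model change above: the new annotation mixes typing.Union with the PEP 604 | operator. Union[List[str], str] | None is only valid on Python 3.10+, where typing special forms support |; on older interpreters the equivalent spelling is Optional-based:

# Equivalent annotations for the evaluation_model field; the | spelling
# used in 0.0.46 requires Python 3.10 or newer.
from typing import List, Optional, Union

EvalModelPre310 = Optional[Union[List[str], str]]  # works on pre-3.10 interpreters
# EvalModel310 = Union[List[str], str] | None      # 3.10+ only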
judgeval/data/tool.py CHANGED
@@ -2,6 +2,7 @@ from pydantic import BaseModel, field_validator
  from typing import Dict, Any, Optional, List
  import warnings
 
+
  class Tool(BaseModel):
  tool_name: str
  parameters: Optional[Dict[str, Any]] = None
@@ -9,39 +10,47 @@ class Tool(BaseModel):
  result_dependencies: Optional[List[Dict[str, Any]]] = None
  action_dependencies: Optional[List[Dict[str, Any]]] = None
  require_all: Optional[bool] = None
-
- @field_validator('tool_name')
+
+ @field_validator("tool_name")
  def validate_tool_name(cls, v):
  if not v:
  warnings.warn("Tool name is empty or None", UserWarning)
  return v
-
- @field_validator('parameters')
+
+ @field_validator("parameters")
  def validate_parameters(cls, v):
  if v is not None and not isinstance(v, dict):
- warnings.warn(f"Parameters should be a dictionary, got {type(v)}", UserWarning)
+ warnings.warn(
+ f"Parameters should be a dictionary, got {type(v)}", UserWarning
+ )
  return v
-
- @field_validator('agent_name')
+
+ @field_validator("agent_name")
  def validate_agent_name(cls, v):
  if v is not None and not isinstance(v, str):
  warnings.warn(f"Agent name should be a string, got {type(v)}", UserWarning)
  return v
-
- @field_validator('result_dependencies')
+
+ @field_validator("result_dependencies")
  def validate_result_dependencies(cls, v):
  if v is not None and not isinstance(v, list):
- warnings.warn(f"Result dependencies should be a list, got {type(v)}", UserWarning)
+ warnings.warn(
+ f"Result dependencies should be a list, got {type(v)}", UserWarning
+ )
  return v
-
- @field_validator('action_dependencies')
+
+ @field_validator("action_dependencies")
  def validate_action_dependencies(cls, v):
  if v is not None and not isinstance(v, list):
- warnings.warn(f"Action dependencies should be a list, got {type(v)}", UserWarning)
+ warnings.warn(
+ f"Action dependencies should be a list, got {type(v)}", UserWarning
+ )
  return v
 
- @field_validator('require_all')
+ @field_validator("require_all")
  def validate_require_all(cls, v):
  if v is not None and not isinstance(v, bool):
- warnings.warn(f"Require all should be a boolean, got {type(v)}", UserWarning)
- return v
+ warnings.warn(
+ f"Require all should be a boolean, got {type(v)}", UserWarning
+ )
+ return v
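These validators deliberately warn rather than raise, so malformed tool metadata degrades to a UserWarning instead of failing model construction. A small sketch of that behavior (assuming Tool is importable from the path shown in this diff):

import warnings
from judgeval.data.tool import Tool

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    tool = Tool(tool_name="")  # empty name warns but still constructs

print(tool.tool_name)      # ""
print(caught[0].category)  # <class 'UserWarning'>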
judgeval/data/trace.py CHANGED
@@ -1,10 +1,12 @@
- from pydantic import BaseModel
+ from pydantic import BaseModel, Field
  from typing import Optional, Dict, Any, List
  from judgeval.evaluation_run import EvaluationRun
  from judgeval.data.tool import Tool
  import json
+ import sys
  from datetime import datetime, timezone
 
+
  class TraceUsage(BaseModel):
  prompt_tokens: Optional[int] = None
  completion_tokens: Optional[int] = None
@@ -14,6 +16,7 @@ class TraceUsage(BaseModel):
  total_cost_usd: Optional[float] = None
  model_name: Optional[str] = None
 
+
  class TraceSpan(BaseModel):
  span_id: str
  trace_id: str
@@ -41,11 +44,15 @@ class TraceSpan(BaseModel):
  "span_id": self.span_id,
  "trace_id": self.trace_id,
  "depth": self.depth,
- "created_at": datetime.fromtimestamp(self.created_at, tz=timezone.utc).isoformat(),
+ "created_at": datetime.fromtimestamp(
+ self.created_at, tz=timezone.utc
+ ).isoformat(),
  "inputs": self._serialize_value(self.inputs),
  "output": self._serialize_value(self.output),
  "error": self._serialize_value(self.error),
- "evaluation_runs": [run.model_dump() for run in self.evaluation_runs] if self.evaluation_runs else [],
+ "evaluation_runs": [run.model_dump() for run in self.evaluation_runs]
+ if self.evaluation_runs
+ else [],
  "parent_span_id": self.parent_span_id,
  "function": self.function,
  "duration": self.duration,
@@ -55,13 +62,15 @@ class TraceSpan(BaseModel):
  "agent_name": self.agent_name,
  "state_before": self.state_before,
  "state_after": self.state_after,
- "additional_metadata": self._serialize_value(self.additional_metadata)
+ "additional_metadata": self._serialize_value(self.additional_metadata),
  }
-
+
  def print_span(self):
  """Print the span with proper formatting and parent relationship information."""
  indent = " " * self.depth
- parent_info = f" (parent_id: {self.parent_span_id})" if self.parent_span_id else ""
+ parent_info = (
+ f" (parent_id: {self.parent_span_id})" if self.parent_span_id else ""
+ )
  print(f"{indent}→ {self.function} (id: {self.span_id}){parent_info}")
 
  def _is_json_serializable(self, obj: Any) -> bool:
@@ -80,38 +89,56 @@ class TraceSpan(BaseModel):
  return str(output)
  except (TypeError, OverflowError, ValueError):
  pass
-
+
  try:
  return repr(output)
  except (TypeError, OverflowError, ValueError):
  pass
  return None
-
+
  def _serialize_value(self, value: Any) -> Any:
  """Helper method to deep serialize a value safely supporting Pydantic Models / regular PyObjects."""
  if value is None:
  return None
-
- def serialize_value(value):
- if isinstance(value, BaseModel):
- return value.model_dump()
- elif isinstance(value, dict):
- # Recursively serialize dictionary values
- return {k: serialize_value(v) for k, v in value.items()}
- elif isinstance(value, (list, tuple)):
- # Recursively serialize list/tuple items
- return [serialize_value(item) for item in value]
- else:
- # Try direct JSON serialization first
- try:
- json.dumps(value)
- return value
- except (TypeError, OverflowError, ValueError):
- # Fallback to safe stringification
- return self.safe_stringify(value, self.function)
+
+ recursion_limit = sys.getrecursionlimit()
+ recursion_limit = int(recursion_limit * 0.75)
+
+ def serialize_value(value, current_depth=0):
+ try:
+ if current_depth > recursion_limit:
+ return {"error": "max_depth_reached: " + type(value).__name__}
+
+ if isinstance(value, BaseModel):
+ return value.model_dump()
+ elif isinstance(value, dict):
+ # Recursively serialize dictionary values
+ return {
+ k: serialize_value(v, current_depth + 1)
+ for k, v in value.items()
+ }
+ elif isinstance(value, (list, tuple)):
+ # Recursively serialize list/tuple items
+ return [serialize_value(item, current_depth + 1) for item in value]
+ else:
+ # Try direct JSON serialization first
+ try:
+ json.dumps(value)
+ return value
+ except (TypeError, OverflowError, ValueError):
+ # Fallback to safe stringification
+ return self.safe_stringify(value, self.function)
+ except Exception:
+ return {"error": "Unable to serialize"}
+ except Exception:
+ return {"error": "Unable to serialize"}
 
  # Start serialization with the top-level value
- return serialize_value(value)
+ try:
+ return serialize_value(value, current_depth=0)
+ except Exception:
+ return {"error": "Unable to serialize"}
+
 
  class Trace(BaseModel):
  trace_id: str
@@ -121,6 +148,7 @@ class Trace(BaseModel):
  trace_spans: List[TraceSpan]
  overwrite: bool = False
  offline_mode: bool = False
- rules: Optional[Dict[str, Any]] = None
+ rules: Dict[str, Any] = Field(default_factory=dict)
  has_notification: Optional[bool] = False
-
+ customer_id: Optional[str] = None
+ tags: List[str] = Field(default_factory=list)
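The _serialize_value rewrite is the substantive change in this file: recursion is now capped at 75% of the interpreter's recursion limit, and anything deeper, or anything that blows up mid-serialization, collapses to an {"error": ...} marker instead of letting RecursionError or arbitrary exceptions escape into trace export. A standalone sketch of the same guard (not the judgeval implementation itself; a small cap is used so the demo terminates quickly):

import json

MAX_DEPTH = 25  # judgeval uses int(sys.getrecursionlimit() * 0.75)

def serialize(value, depth=0):
    if depth > MAX_DEPTH:
        return {"error": "max_depth_reached: " + type(value).__name__}
    if isinstance(value, dict):
        return {k: serialize(v, depth + 1) for k, v in value.items()}
    if isinstance(value, (list, tuple)):
        return [serialize(v, depth + 1) for v in value]
    try:
        json.dumps(value)  # keep values that are already JSON-safe
        return value
    except (TypeError, OverflowError, ValueError):
        return repr(value)  # stand-in for judgeval's safe_stringify

cycle = {}
cycle["self"] = cycle        # self-referential input
out = serialize(cycle)       # terminates at the depth cap instead of raising
print(json.dumps(out)[:60])  # nested "self" keys ending in the error marker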
judgeval/data/trace_run.py CHANGED
@@ -1,22 +1,20 @@
  from pydantic import BaseModel
- from typing import List, Optional, Dict, Any, Union, Callable
+ from typing import List, Optional, Dict, Any, Union
  from judgeval.data import Trace
  from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
- from judgeval.judges import JudgevalJudge
  from judgeval.rules import Rule
 
 
  class TraceRun(BaseModel):
  """
  Stores example and evaluation scorers together for running an eval task
-
- Args:
+
+ Args:
  project_name (str): The name of the project the evaluation results belong to
  eval_name (str): A name for this evaluation run
  traces (List[Trace]): The traces to evaluate
  scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
  model (str): The model used as a judge when using LLM as a Judge
- aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
  metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
  judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
  rules (Optional[List[Rule]]): Rules to evaluate against scoring results
@@ -24,16 +22,12 @@ class TraceRun(BaseModel):
  tools (Optional[List[Dict[str, Any]]]): List of tools to use for evaluation
  """
 
- # The user will specify whether they want log_results when they call run_eval
- log_results: bool = False # NOTE: log_results has to be set first because it is used to validate project_name and eval_name
  organization_id: Optional[str] = None
  project_name: Optional[str] = None
  eval_name: Optional[str] = None
  traces: Optional[List[Trace]] = None
  scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
- model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1"
- aggregator: Optional[str] = None
- metadata: Optional[Dict[str, Any]] = None
+ model: Optional[str] = "gpt-4.1"
  trace_span_id: Optional[str] = None
  append: Optional[bool] = False
  # API Key will be "" until user calls client.run_eval(), then API Key will be set
@@ -43,4 +37,4 @@ class TraceRun(BaseModel):
  tools: Optional[List[Dict[str, Any]]] = None
 
  class Config:
- arbitrary_types_allowed = True
+ arbitrary_types_allowed = True
judgeval/evaluation_run.py CHANGED
@@ -1,144 +1,84 @@
- from typing import List, Optional, Dict, Any, Union
+ from typing import List, Optional, Union
  from pydantic import BaseModel, field_validator, Field
 
  from judgeval.data import Example, CustomExample
  from judgeval.scorers import JudgevalScorer, APIJudgmentScorer
  from judgeval.constants import ACCEPTABLE_MODELS
- from judgeval.common.logger import debug, error
- from judgeval.judges import JudgevalJudge
- from judgeval.rules import Rule
+
 
  class EvaluationRun(BaseModel):
  """
  Stores example and evaluation scorers together for running an eval task
-
- Args:
+
+ Args:
  project_name (str): The name of the project the evaluation results belong to
  eval_name (str): A name for this evaluation run
  examples (Union[List[Example], List[CustomExample]]): The examples to evaluate
  scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
  model (str): The model used as a judge when using LLM as a Judge
- aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
  metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
  judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
- rules (Optional[List[Rule]]): Rules to evaluate against scoring results
  """
 
- # The user will specify whether they want log_results when they call run_eval
- log_results: bool = False # NOTE: log_results has to be set first because it is used to validate project_name and eval_name
  organization_id: Optional[str] = None
  project_name: Optional[str] = Field(default=None, validate_default=True)
  eval_name: Optional[str] = Field(default=None, validate_default=True)
  examples: Union[List[Example], List[CustomExample]]
  scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
- model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1"
- aggregator: Optional[str] = Field(default=None, validate_default=True)
- metadata: Optional[Dict[str, Any]] = None
+ model: Optional[str] = "gpt-4.1"
  trace_span_id: Optional[str] = None
  # API Key will be "" until user calls client.run_eval(), then API Key will be set
  judgment_api_key: Optional[str] = ""
  override: Optional[bool] = False
  append: Optional[bool] = False
- rules: Optional[List[Rule]] = None
-
+
  def model_dump(self, **kwargs):
  data = super().model_dump(**kwargs)
 
  data["scorers"] = [
- scorer.to_dict() if hasattr(scorer, "to_dict")
- else scorer.model_dump() if hasattr(scorer, "model_dump")
+ scorer.to_dict()
+ if hasattr(scorer, "to_dict")
+ else scorer.model_dump()
+ if hasattr(scorer, "model_dump")
  else {"score_type": scorer.score_type, "threshold": scorer.threshold}
  for scorer in self.scorers
  ]
 
- if self.rules:
- # Process rules to ensure proper serialization
- data["rules"] = [rule.model_dump() for rule in self.rules]
-
  return data
 
- @field_validator('log_results', mode='before')
- def validate_log_results(cls, v):
- if not isinstance(v, bool):
- raise ValueError(f"log_results must be a boolean. Received {v} of type {type(v)}")
- return v
-
- @field_validator('project_name')
- def validate_project_name(cls, v, values):
- if values.data.get('log_results', False) and not v:
- debug("No project name provided when log_results is True")
- error("Validation failed: Project name required when logging results")
- raise ValueError("Project name is required when log_results is True. Please include the project_name argument.")
- return v
-
- @field_validator('eval_name')
- def validate_eval_name(cls, v, values):
- if values.data.get('log_results', False) and not v:
- debug("No eval name provided when log_results is True")
- error("Validation failed: Eval name required when logging results")
- raise ValueError("Eval name is required when log_results is True. Please include the eval_run_name argument.")
- return v
-
- @field_validator('examples')
+ @field_validator("examples")
  def validate_examples(cls, v):
  if not v:
  raise ValueError("Examples cannot be empty.")
-
+
  first_type = type(v[0])
  if first_type not in (Example, CustomExample):
  raise ValueError(f"Invalid type for Example/CustomExample: {first_type}")
  if not all(isinstance(ex, first_type) for ex in v):
- raise ValueError("All examples must be of the same type, either all Example or all CustomExample.")
-
+ raise ValueError(
+ "All examples must be of the same type, either all Example or all CustomExample."
+ )
+
  return v
 
- @field_validator('scorers')
+ @field_validator("scorers")
  def validate_scorers(cls, v):
  if not v:
  raise ValueError("Scorers cannot be empty.")
  return v
 
- @field_validator('model')
+ @field_validator("model")
  def validate_model(cls, v, values):
  if not v:
  raise ValueError("Model cannot be empty.")
-
- # Check if model is a judgevalJudge
- if isinstance(v, JudgevalJudge):
- # Verify all scorers are JudgevalScorer when using judgevalJudge
- scorers = values.data.get('scorers', [])
- if not all(isinstance(s, JudgevalScorer) for s in scorers):
- raise ValueError("When using a judgevalJudge model, all scorers must be JudgevalScorer type")
- return v
-
+
  # Check if model is string or list of strings
  if isinstance(v, str):
  if v not in ACCEPTABLE_MODELS:
- raise ValueError(f"Model name {v} not recognized. Please select a valid model name.)")
- return v
-
- if isinstance(v, list):
- if not all(isinstance(m, str) for m in v):
- raise ValueError("When providing a list of models, all elements must be strings")
- for m in v:
- if m not in ACCEPTABLE_MODELS:
- raise ValueError(f"Model name {m} not recognized. Please select a valid model name.")
+ raise ValueError(
+ f"Model name {v} not recognized. Please select a valid model name.)"
+ )
  return v
- raise ValueError(f"Model must be one of: string, list of strings, or JudgevalJudge instance. Received type {type(v)}.")
 
- @field_validator('aggregator', mode='before')
- def validate_aggregator(cls, v, values):
- model = values.data.get('model')
- if isinstance(model, list) and v is None:
- raise ValueError("Aggregator cannot be empty.")
-
- if isinstance(model, list) and not isinstance(v, str):
- raise ValueError("Aggregator must be a string if provided.")
-
- if v is not None and v not in ACCEPTABLE_MODELS:
- raise ValueError(f"Model name {v} not recognized.")
-
- return v
-
  class Config:
  arbitrary_types_allowed = True
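Net effect of this rewrite, mirroring the TraceRun change above: EvaluationRun no longer accepts a list of judge models or a JudgevalJudge instance, so the aggregator, rules, and log_results machinery (and their validators) go away, and model narrows to a single string checked against ACCEPTABLE_MODELS. A standalone sketch of the narrowed validation, with an illustrative stand-in for the real constant:

from typing import Optional
from pydantic import BaseModel, ValidationError, field_validator

ACCEPTABLE_MODELS = {"gpt-4.1", "gpt-4o"}  # illustrative subset, not judgeval's list

class Run(BaseModel):
    model: Optional[str] = "gpt-4.1"

    @field_validator("model")
    def validate_model(cls, v):
        if not v:
            raise ValueError("Model cannot be empty.")
        if v not in ACCEPTABLE_MODELS:
            raise ValueError(f"Model name {v} not recognized.")
        return v

Run(model="gpt-4o")  # accepted: a single known model name
try:
    Run(model=["gpt-4o", "gpt-4.1"])  # lists were valid in 0.0.44, not anymore
except ValidationError as e:
    print(e.error_count(), "validation error")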