judgeval 0.0.3__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. judgeval/__init__.py +0 -71
  2. judgeval/common/tracer.py +57 -31
  3. judgeval/constants.py +1 -0
  4. judgeval/data/__init__.py +2 -1
  5. judgeval/data/scorer_data.py +2 -2
  6. judgeval/evaluation_run.py +16 -15
  7. judgeval/judges/__init__.py +2 -2
  8. judgeval/judges/base_judge.py +1 -1
  9. judgeval/judges/litellm_judge.py +2 -2
  10. judgeval/judges/mixture_of_judges.py +2 -2
  11. judgeval/judges/together_judge.py +2 -2
  12. judgeval/judges/utils.py +4 -4
  13. judgeval/judgment_client.py +67 -15
  14. judgeval/run_evaluation.py +79 -14
  15. judgeval/scorers/__init__.py +8 -4
  16. judgeval/scorers/api_scorer.py +64 -0
  17. judgeval/scorers/base_scorer.py +3 -2
  18. judgeval/scorers/exceptions.py +11 -0
  19. judgeval/scorers/{custom_scorer.py → judgeval_scorer.py} +9 -5
  20. judgeval/scorers/judgeval_scorers/__init__.py +132 -9
  21. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +23 -0
  22. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +19 -0
  23. judgeval/scorers/judgeval_scorers/{answer_relevancy.py → api_scorers/answer_relevancy.py} +2 -2
  24. judgeval/scorers/judgeval_scorers/{contextual_precision.py → api_scorers/contextual_precision.py} +2 -2
  25. judgeval/scorers/judgeval_scorers/{contextual_recall.py → api_scorers/contextual_recall.py} +2 -2
  26. judgeval/scorers/judgeval_scorers/{contextual_relevancy.py → api_scorers/contextual_relevancy.py} +2 -2
  27. judgeval/scorers/judgeval_scorers/{faithfulness.py → api_scorers/faithfulness.py} +2 -2
  28. judgeval/scorers/judgeval_scorers/{hallucination.py → api_scorers/hallucination.py} +2 -2
  29. judgeval/scorers/judgeval_scorers/{json_correctness.py → api_scorers/json_correctness.py} +7 -7
  30. judgeval/scorers/judgeval_scorers/{summarization.py → api_scorers/summarization.py} +2 -2
  31. judgeval/scorers/judgeval_scorers/{tool_correctness.py → api_scorers/tool_correctness.py} +2 -2
  32. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +24 -0
  33. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +4 -0
  34. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +272 -0
  35. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +169 -0
  36. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +4 -0
  37. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +292 -0
  38. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +174 -0
  39. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +3 -0
  40. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +259 -0
  41. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +106 -0
  42. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +3 -0
  43. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +249 -0
  44. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +142 -0
  45. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +3 -0
  46. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +240 -0
  47. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +121 -0
  48. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +3 -0
  49. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +318 -0
  50. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +265 -0
  51. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +3 -0
  52. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +258 -0
  53. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +104 -0
  54. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +127 -0
  55. judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +3 -0
  56. judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +247 -0
  57. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +541 -0
  58. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +3 -0
  59. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +151 -0
  60. judgeval/scorers/prompt_scorer.py +4 -4
  61. judgeval/scorers/score.py +14 -14
  62. judgeval/scorers/utils.py +40 -6
  63. {judgeval-0.0.3.dist-info → judgeval-0.0.4.dist-info}/METADATA +1 -1
  64. judgeval-0.0.4.dist-info/RECORD +78 -0
  65. judgeval-0.0.3.dist-info/RECORD +0 -46
  66. {judgeval-0.0.3.dist-info → judgeval-0.0.4.dist-info}/WHEEL +0 -0
  67. {judgeval-0.0.3.dist-info → judgeval-0.0.4.dist-info}/licenses/LICENSE.md +0 -0
judgeval/__init__.py CHANGED
@@ -1,42 +1,4 @@
  # Import key components that should be publicly accessible
- from judgeval.common.utils import (
-     get_chat_completion,
-     aget_chat_completion,
-     get_completion_multiple_models,
-     aget_completion_multiple_models
- )
- from judgeval.data import (
-     Example,
-     ProcessExample,
-     ScorerData,
-     ScoringResult,
- )
- from judgeval.data.datasets import (
-     EvalDataset,
-     GroundTruthExample
- )
-
- from judgeval.judges import (
-     judgevalJudge,
-     LiteLLMJudge,
-     TogetherJudge,
-     MixtureOfJudges
- )
- from judgeval.scorers import (
-     JudgmentScorer,
-     CustomScorer,
-     PromptScorer,
-     ClassifierScorer,
-     ToolCorrectnessScorer,
-     JSONCorrectnessScorer,
-     SummarizationScorer,
-     HallucinationScorer,
-     FaithfulnessScorer,
-     ContextualRelevancyScorer,
-     ContextualPrecisionScorer,
-     ContextualRecallScorer,
-     AnswerRelevancyScorer
- )
  from judgeval.clients import client, langfuse, together_client
  from judgeval.judgment_client import JudgmentClient

@@ -46,38 +8,5 @@ __all__ = [
      'langfuse',
      'together_client',

-     # # Common utilities
-     # 'get_chat_completion',
-     # 'aget_chat_completion',
-     # 'get_completion_multiple_models',
-     # 'aget_completion_multiple_models',
-
-     # # Data classes
-     # 'Example',
-     # 'ProcessExample',
-     # 'ScorerData',
-     # 'ScoringResult',
-
-     # # Judges
-     # 'judgevalJudge',
-     # 'LiteLLMJudge',
-     # 'TogetherJudge',
-     # 'MixtureOfJudges',
-
-     # # Scorers
-     # 'JudgmentScorer',
-     # 'CustomScorer',
-     # 'PromptScorer',
-     # 'ClassifierScorer',
-     # 'ToolCorrectnessScorer',
-     # 'JSONCorrectnessScorer',
-     # 'SummarizationScorer',
-     # 'HallucinationScorer',
-     # 'FaithfulnessScorer',
-     # 'ContextualRelevancyScorer',
-     # 'ContextualPrecisionScorer',
-     # 'ContextualRecallScorer',
-     # 'AnswerRelevancyScorer',
-
      'JudgmentClient',
  ]
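With this change the top-level judgeval package no longer re-exports the data classes, judges, and scorers; only client, langfuse, together_client, and JudgmentClient remain in __all__. A minimal sketch of what downstream imports would look like after upgrading, inferred only from the module paths visible elsewhere in this diff (the exact public import paths are an assumption):

# Sketch only: import paths inferred from this diff, not from judgeval documentation.
from judgeval import JudgmentClient                       # still re-exported at the top level
from judgeval.data import Example, ScoringResult          # no longer importable from the bare judgeval package
from judgeval.judges import JudgevalJudge, TogetherJudge  # JudgevalJudge is the renamed judgevalJudge (see judges diffs below)
from judgeval.scorers import APIJudgmentScorer, JudgevalScorer  # renamed from JudgmentScorer / CustomScorer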
judgeval/common/tracer.py CHANGED
@@ -7,16 +7,7 @@ import functools
  import requests
  import uuid
  from contextlib import contextmanager
- from typing import (
-     Optional,
-     Any,
-     List,
-     Literal,
-     Tuple,
-     Generator,
-     TypeAlias,
-     Union
- )
+ from typing import Optional, Any, List, Literal, Tuple, Generator, TypeAlias, Union
  from dataclasses import dataclass, field
  from datetime import datetime
  from openai import OpenAI
@@ -33,7 +24,7 @@ from http import HTTPStatus
  from judgeval.constants import JUDGMENT_TRACES_SAVE_API_URL
  from judgeval.judgment_client import JudgmentClient
  from judgeval.data import Example
- from judgeval.scorers import JudgmentScorer, CustomScorer
+ from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
  from judgeval.data.result import ScoringResult

  # Define type aliases for better code readability and maintainability
@@ -76,16 +67,42 @@ class TraceEntry:
          elif self.type == "evaluation":
              print(f"{indent}Evaluation: {self.evaluation_result} ({self.duration:.3f}s)")

-     def to_dict(self) -> dict:
-         """Convert the trace entry to a dictionary format for storage/transmission."""
+     def _serialize_inputs(self) -> dict:
+         """Helper method to serialize input data safely.
+
+         Returns a dict with serializable versions of inputs, converting non-serializable
+         objects to None with a warning.
+         """
+         serialized_inputs = {}
+         for key, value in self.inputs.items():
+             if isinstance(value, BaseModel):
+                 serialized_inputs[key] = value.model_dump()
+             elif isinstance(value, (list, tuple)):
+                 # Handle lists/tuples of arguments
+                 serialized_inputs[key] = [
+                     item.model_dump() if isinstance(item, BaseModel)
+                     else None if not self._is_json_serializable(item)
+                     else item
+                     for item in value
+                 ]
+             else:
+                 if self._is_json_serializable(value):
+                     serialized_inputs[key] = value
+                 else:
+                     warnings.warn(f"Input '{key}' for function {self.function} is not JSON serializable. Setting to None.")
+                     serialized_inputs[key] = None
+         return serialized_inputs
+
+     def _is_json_serializable(self, obj: Any) -> bool:
+         """Helper method to check if an object is JSON serializable."""
          try:
-             output = self._serialize_output()
+             json.dumps(obj)
+             return True
          except (TypeError, OverflowError, ValueError):
-             # Handle cases where output cannot be serialized
-             warnings.warn(f"Output for function {self.function} is not JSON serializable. Setting to None.")
-             output = None
+             return False

-         # Build a complete dictionary representation of the trace entry
+     def to_dict(self) -> dict:
+         """Convert the trace entry to a dictionary format for storage/transmission."""
          return {
              "type": self.type,
              "function": self.function,
@@ -93,8 +110,8 @@ class TraceEntry:
              "message": self.message,
              "timestamp": self.timestamp,
              "duration": self.duration,
-             "output": output,
-             "inputs": self.inputs or None, # Convert empty dict to None
+             "output": self._serialize_output(),
+             "inputs": self._serialize_inputs(),
              "evaluation_result": [result.to_dict() for result in self.evaluation_result] if self.evaluation_result else None,
              "span_type": self.span_type
          }
@@ -104,18 +121,22 @@ class TraceEntry:

          Handles special cases:
          - Pydantic models are converted using model_dump()
-         - Other objects must be JSON serializable
+         - Non-serializable objects return None with a warning
          """
          if isinstance(self.output, BaseModel):
              return self.output.model_dump()

-         # Verify JSON serialization is possible
-         json.dumps(self.output)
-         return self.output
+         try:
+             # Try to serialize the output to verify it's JSON compatible
+             json.dumps(self.output)
+             return self.output
+         except (TypeError, OverflowError, ValueError):
+             warnings.warn(f"Output for function {self.function} is not JSON serializable. Setting to None.")
+             return None

  class TraceClient:
      """Client for managing a single trace context"""
-     def __init__(self, tracer, trace_id: str, name: str, project_name: str = "default_project"):
+     def __init__(self, tracer, trace_id: str, name: str, project_name: str = "default_project", overwrite: bool = False):
          self.tracer = tracer
          self.trace_id = trace_id
          self.name = name
@@ -125,6 +146,7 @@ class TraceClient:
          self.start_time = time.time()
          self.span_type = None
          self._current_span: Optional[TraceEntry] = None
+         self.overwrite = overwrite

      @contextmanager
      def span(self, name: str, span_type: SpanType = "span"):
@@ -165,7 +187,7 @@ class TraceClient:

      async def async_evaluate(
          self,
-         scorers: List[Union[JudgmentScorer, CustomScorer]],
+         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
          input: Optional[str] = None,
          actual_output: Optional[str] = None,
          expected_output: Optional[str] = None,
@@ -175,7 +197,7 @@ class TraceClient:
          expected_tools: Optional[List[str]] = None,
          additional_metadata: Optional[Dict[str, Any]] = None,
          model: Optional[str] = None,
-         log_results: Optional[bool] = False,
+         log_results: Optional[bool] = True,
      ):
          start_time = time.time() # Record start time
          example = Example(
@@ -195,9 +217,13 @@ class TraceClient:
              model=model,
              metadata={},
              log_results=log_results,
-             project_name="TestSpanLevel1", # TODO this should be dynamic
-             eval_run_name="TestSpanLevel1",
-             override=True,
+             project_name=self.project_name,
+             eval_run_name=(
+                 f"{self.name.capitalize()}-"
+                 f"{self._current_span}-"
+                 f"[{','.join(scorer.load_implementation().score_type.capitalize() for scorer in scorers)}]"
+             ),
+             override=self.overwrite
          )

          self.record_evaluation(scoring_results, start_time) # Pass start_time to record_evaluation
@@ -393,7 +419,7 @@ class Tracer:
      def trace(self, name: str, project_name: str = "default_project", overwrite: bool = False) -> Generator[TraceClient, None, None]:
          """Start a new trace context using a context manager"""
          trace_id = str(uuid.uuid4())
-         trace = TraceClient(self, trace_id, name, project_name=project_name)
+         trace = TraceClient(self, trace_id, name, project_name=project_name, overwrite=overwrite)
          prev_trace = self._current_trace
          self._current_trace = trace

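Taken together, the tracer changes thread the overwrite flag from Tracer.trace() into TraceClient and the EvaluationRun's override field, switch the accepted scorer types to APIJudgmentScorer/JudgevalScorer, derive the eval run name from the trace instead of the hard-coded "TestSpanLevel1", and flip async_evaluate's log_results default to True. A hedged sketch of how a span-level evaluation might be invoked under the new signature; Tracer() construction and the scorer constructor arguments are assumptions not shown in this diff:

# Sketch only: exercises Tracer.trace(overwrite=...) and TraceClient.async_evaluate
# as they appear in this diff. Tracer() and FaithfulnessScorer(threshold=...) are
# assumed constructor calls, not signatures confirmed by the diff.
import asyncio
from judgeval.common.tracer import Tracer
from judgeval.scorers import FaithfulnessScorer

async def main():
    tracer = Tracer()  # hypothetical constructor call
    with tracer.trace("support_bot", project_name="demo_project", overwrite=True) as trace:
        with trace.span("generate_answer"):
            answer = "You can return items within 30 days."  # stand-in for real application output
        await trace.async_evaluate(
            scorers=[FaithfulnessScorer(threshold=0.8)],  # hypothetical arguments
            input="What is the return policy?",
            actual_output=answer,
            model="gpt-4o-mini",
            # log_results now defaults to True; project and run name come from the trace.
        )

asyncio.run(main())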
judgeval/constants.py CHANGED
@@ -15,6 +15,7 @@ class APIScorer(str, Enum):
      """
      FAITHFULNESS = "faithfulness"
      ANSWER_RELEVANCY = "answer_relevancy"
+     ANSWER_CORRECTNESS = "answer_correctness"
      HALLUCINATION = "hallucination"
      SUMMARIZATION = "summarization"
      CONTEXTUAL_RECALL = "contextual_recall"
judgeval/data/__init__.py CHANGED
@@ -1,10 +1,11 @@
- from judgeval.data.example import Example
+ from judgeval.data.example import Example, ExampleParams
  from judgeval.data.api_example import ProcessExample, create_process_example
  from judgeval.data.scorer_data import ScorerData, create_scorer_data
  from judgeval.data.result import ScoringResult, generate_scoring_result

  __all__ = [
      "Example",
+     "ExampleParams",
      "ProcessExample",
      "create_process_example",
      "ScorerData",
judgeval/data/scorer_data.py CHANGED
@@ -7,7 +7,7 @@ ScorerData holds the information related to a single, completed Scorer evaluatio
  from typing import List, Union, Optional, Dict
  from pydantic import BaseModel, Field

- from judgeval.scorers import CustomScorer
+ from judgeval.scorers import JudgevalScorer

  class ScorerData(BaseModel):
      """
@@ -47,7 +47,7 @@ class ScorerData(BaseModel):
          }


- def create_scorer_data(scorer: CustomScorer) -> ScorerData:
+ def create_scorer_data(scorer: JudgevalScorer) -> ScorerData:
      """
      After a `scorer` is run, it contains information about the example that was evaluated
      using the scorer. For example, after computing Faithfulness, the `scorer` object will contain
judgeval/evaluation_run.py CHANGED
@@ -2,11 +2,10 @@ from typing import List, Optional, Dict, Any, Union
  from pydantic import BaseModel, field_validator

  from judgeval.data import Example
- from judgeval.scorers import CustomScorer, JudgmentScorer
- from judgeval.judges import judgevalJudge
+ from judgeval.scorers import JudgevalScorer, APIJudgmentScorer
  from judgeval.constants import ACCEPTABLE_MODELS
  from judgeval.common.logger import debug, error
-
+ from judgeval.judges import JudgevalJudge

  class EvaluationRun(BaseModel):
      """
@@ -28,8 +27,8 @@ class EvaluationRun(BaseModel):
      project_name: Optional[str] = None
      eval_name: Optional[str] = None
      examples: List[Example]
-     scorers: List[Union[JudgmentScorer, CustomScorer]]
-     model: Union[str, List[str], judgevalJudge]
+     scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
+     model: Union[str, List[str], JudgevalJudge]
      aggregator: Optional[str] = None
      metadata: Optional[Dict[str, Any]] = None
      # API Key will be "" until user calls client.run_eval(), then API Key will be set
@@ -39,8 +38,9 @@ class EvaluationRun(BaseModel):
          data = super().model_dump(**kwargs)

          data["scorers"] = [
-             scorer.to_dict() \
-             if hasattr(scorer, "to_dict") else {"score_type": scorer.score_type, "threshold": scorer.threshold}
+             scorer.to_dict() if hasattr(scorer, "to_dict")
+             else scorer.model_dump() if hasattr(scorer, "model_dump")
+             else {"score_type": scorer.score_type, "threshold": scorer.threshold}
              for scorer in self.scorers
          ]
          return data
@@ -81,7 +81,7 @@ class EvaluationRun(BaseModel):
          if not v:
              raise ValueError("Scorers cannot be empty.")
          for s in v:
-             if not isinstance(s, JudgmentScorer) and not isinstance(s, CustomScorer):
+             if not isinstance(s, APIJudgmentScorer) and not isinstance(s, JudgevalScorer):
                  raise ValueError(f"Invalid type for Scorer: {type(s)}")
          return v

@@ -89,20 +89,21 @@ class EvaluationRun(BaseModel):
      def validate_model(cls, v, values):
          if not v:
              raise ValueError("Model cannot be empty.")
+
          # Check if model is a judgevalJudge
-         if isinstance(v, judgevalJudge):
-             # Verify all scorers are CustomScorer when using judgevalJudge
+         if isinstance(v, JudgevalJudge):
+             # Verify all scorers are JudgevalScorer when using judgevalJudge
              scorers = values.data.get('scorers', [])
-             if not all(isinstance(s, CustomScorer) for s in scorers):
-                 raise ValueError("When using a judgevalJudge model, all scorers must be CustomScorer type")
+             if not all(isinstance(s, JudgevalScorer) for s in scorers):
+                 raise ValueError("When using a judgevalJudge model, all scorers must be JudgevalScorer type")
              return v
-
+
          # Check if model is string or list of strings
          if isinstance(v, str):
              if v not in ACCEPTABLE_MODELS:
                  raise ValueError(f"Model name {v} not recognized.")
              return v
-
+
          if isinstance(v, list):
              if not all(isinstance(m, str) for m in v):
                  raise ValueError("When providing a list of models, all elements must be strings")
@@ -110,7 +111,7 @@ class EvaluationRun(BaseModel):
                  if m not in ACCEPTABLE_MODELS:
                      raise ValueError(f"Model name {m} not recognized.")
              return v
-         raise ValueError(f"Model must be one of: string, list of strings, or judgevalJudge instance. Received type {type(v)}.")
+         raise ValueError(f"Model must be one of: string, list of strings, or JudgevalJudge instance. Received type {type(v)}.")

      @field_validator('aggregator', mode='before')
      def validate_aggregator(cls, v, values):
judgeval/judges/__init__.py CHANGED
@@ -1,7 +1,7 @@
  from pydantic import BaseModel
- from judgeval.judges.base_judge import judgevalJudge
+ from judgeval.judges.base_judge import JudgevalJudge
  from judgeval.judges.litellm_judge import LiteLLMJudge
  from judgeval.judges.together_judge import TogetherJudge
  from judgeval.judges.mixture_of_judges import MixtureOfJudges

- __all__ = ["judgevalJudge", "LiteLLMJudge", "TogetherJudge", "MixtureOfJudges"]
+ __all__ = ["JudgevalJudge", "LiteLLMJudge", "TogetherJudge", "MixtureOfJudges"]
judgeval/judges/base_judge.py CHANGED
@@ -6,7 +6,7 @@ from abc import ABC, abstractmethod
  from typing import Optional, List


- class judgevalJudge(ABC):
+ class JudgevalJudge(ABC):
      def __init__(self, model_name: Optional[str] = None, *args, **kwargs):
          self.model_name = model_name
          self.model = self.load_model(*args, **kwargs)
judgeval/judges/litellm_judge.py CHANGED
@@ -2,7 +2,7 @@ import pydantic
  from typing import List, Union, Mapping

  from judgeval import *
- from judgeval.judges import judgevalJudge
+ from judgeval.judges import JudgevalJudge
  from judgeval.common.utils import afetch_litellm_api_response, fetch_litellm_api_response
  from judgeval.common.logger import debug, error

@@ -11,7 +11,7 @@ BASE_CONVERSATION = [
  ] # for string inputs, we need to add the user query to a base conversation, since LiteLLM only accepts a list of dictionaries as a chat history


- class LiteLLMJudge(judgevalJudge):
+ class LiteLLMJudge(JudgevalJudge):
      def __init__(self, model: str = "gpt-4o-mini", **kwargs):
          debug(f"Initializing LiteLLMJudge with model={model}")
          self.model = model
judgeval/judges/mixture_of_judges.py CHANGED
@@ -6,7 +6,7 @@ Enables client to use multiple models to generate responses and then aggregate t
  from judgeval import *
  import pydantic
  from typing import List, Union, Mapping, Dict
- from judgeval.judges import judgevalJudge
+ from judgeval.judges import JudgevalJudge
  from judgeval.common.utils import get_completion_multiple_models, get_chat_completion, aget_completion_multiple_models, aget_chat_completion
  from judgeval.common.logger import debug, error

@@ -115,7 +115,7 @@ def build_dynamic_mixture_prompt(
  BASE_CONVERSATION = [
      {"role": "system", "content": "You are a helpful assistant."},
  ] # for string inputs, we need to add the user query to a base conversation, since LiteLLM only accepts a list of dictionaries as a chat history
- class MixtureOfJudges(judgevalJudge):
+ class MixtureOfJudges(JudgevalJudge):
      """
      IMPORTANT: When supplying custom prompts and conversation histories for aggregation, supply them in the following format:
      in kwargs:
judgeval/judges/together_judge.py CHANGED
@@ -6,14 +6,14 @@ from pydantic import BaseModel
  from typing import List, Union, Mapping
  from judgeval.common.logger import debug, error

- from judgeval.judges import judgevalJudge
+ from judgeval.judges import JudgevalJudge
  from judgeval.common.utils import fetch_together_api_response, afetch_together_api_response

  BASE_CONVERSATION = [
      {"role": "system", "content": "You are a helpful assistant."},
  ]

- class TogetherJudge(judgevalJudge):
+ class TogetherJudge(JudgevalJudge):
      def __init__(self, model: str = "QWEN", **kwargs):
          debug(f"Initializing TogetherJudge with model={model}")
          self.model = model
judgeval/judges/utils.py CHANGED
@@ -5,13 +5,13 @@ import litellm
  from typing import Optional, Union, Tuple, List

  from judgeval.common.exceptions import InvalidJudgeModelError
- from judgeval.judges import judgevalJudge, LiteLLMJudge, TogetherJudge, MixtureOfJudges
+ from judgeval.judges import JudgevalJudge, LiteLLMJudge, TogetherJudge, MixtureOfJudges
  from judgeval.constants import TOGETHER_SUPPORTED_MODELS

  LITELLM_SUPPORTED_MODELS = set(litellm.model_list)

  def create_judge(
-     model: Optional[Union[str, List[str], judgevalJudge]] = None) -> Tuple[judgevalJudge, bool]:
+     model: Optional[Union[str, List[str], JudgevalJudge]] = None) -> Tuple[JudgevalJudge, bool]:
      """
      Creates a judge model from string(s) or a judgeval judge object.

@@ -24,10 +24,10 @@ def create_judge(
      """
      if model is None: # default option
          return LiteLLMJudge(model="gpt-4o"), True
-     if not isinstance(model, (str, list, judgevalJudge)):
+     if not isinstance(model, (str, list, JudgevalJudge)):
          raise InvalidJudgeModelError(f"Model must be a string, list of strings, or a judgeval judge object. Got: {type(model)} instead.")
      # If model is already a valid judge type, return it and mark native
-     if isinstance(model, (judgevalJudge, LiteLLMJudge, TogetherJudge, MixtureOfJudges)):
+     if isinstance(model, (JudgevalJudge, LiteLLMJudge, TogetherJudge, MixtureOfJudges)):
          return model, True

      # Either string or List[str]
judgeval/judgment_client.py CHANGED
@@ -7,11 +7,22 @@ import requests

  from judgeval.constants import ROOT_API
  from judgeval.data.datasets import EvalDataset
- from judgeval.data import ScoringResult, Example
- from judgeval.judges import judgevalJudge
- from judgeval.scorers import JudgmentScorer, CustomScorer, ClassifierScorer
+ from judgeval.data import (
+     ScoringResult,
+     Example
+ )
+ from judgeval.scorers import (
+     APIJudgmentScorer,
+     JudgevalScorer,
+     ClassifierScorer,
+     ScorerWrapper
+ )
  from judgeval.evaluation_run import EvaluationRun
- from judgeval.run_evaluation import run_eval
+ from judgeval.run_evaluation import (
+     run_eval,
+     assert_test
+ )
+ from judgeval.judges import JudgevalJudge
  from judgeval.constants import JUDGMENT_EVAL_FETCH_API_URL
  from judgeval.common.exceptions import JudgmentAPIError
  from pydantic import BaseModel
@@ -37,25 +48,32 @@ class JudgmentClient:
      def run_evaluation(
          self,
          examples: List[Example],
-         scorers: List[Union[JudgmentScorer, CustomScorer]],
-         model: Union[str, List[str], judgevalJudge],
+         scorers: List[Union[ScorerWrapper, JudgevalScorer]],
+         model: Union[str, List[str], JudgevalJudge],
          aggregator: Optional[str] = None,
          metadata: Optional[Dict[str, Any]] = None,
-         log_results: bool = False,
-         project_name: str = "",
-         eval_run_name: str = "",
+         log_results: bool = True,
+         project_name: str = "default_project",
+         eval_run_name: str = "default_eval_run",
          override: bool = False,
+         use_judgment: bool = True
      ) -> List[ScoringResult]:
          """
          Executes an evaluation of `Example`s using one or more `Scorer`s
          """
          try:
+             # Load appropriate implementations for all scorers
+             loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = [
+                 scorer.load_implementation(use_judgment=use_judgment) if isinstance(scorer, ScorerWrapper) else scorer
+                 for scorer in scorers
+             ]
+
              eval = EvaluationRun(
                  log_results=log_results,
                  project_name=project_name,
                  eval_name=eval_run_name,
                  examples=examples,
-                 scorers=scorers,
+                 scorers=loaded_scorers,
                  model=model,
                  aggregator=aggregator,
                  metadata=metadata,
@@ -68,24 +86,31 @@ class JudgmentClient:
      def evaluate_dataset(
          self,
          dataset: EvalDataset,
-         scorers: List[Union[JudgmentScorer, CustomScorer]],
-         model: Union[str, List[str]],
+         scorers: List[Union[ScorerWrapper, JudgevalScorer]],
+         model: Union[str, List[str], JudgevalJudge],
          aggregator: Optional[str] = None,
          metadata: Optional[Dict[str, Any]] = None,
          project_name: str = "",
          eval_run_name: str = "",
-         log_results: bool = False
+         log_results: bool = False,
+         use_judgment: bool = True
      ) -> List[ScoringResult]:
          """
          Executes an evaluation of a `EvalDataset` using one or more `Scorer`s
          """
          try:
+             # Load appropriate implementations for all scorers
+             loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = [
+                 scorer.load_implementation(use_judgment=use_judgment) if isinstance(scorer, ScorerWrapper) else scorer
+                 for scorer in scorers
+             ]
+
              evaluation_run = EvaluationRun(
                  log_results=log_results,
                  project_name=project_name,
                  eval_name=eval_run_name,
                  examples=dataset.examples,
-                 scorers=scorers,
+                 scorers=loaded_scorers,
                  model=model,
                  aggregator=aggregator,
                  metadata=metadata,
@@ -241,4 +266,31 @@ class JudgmentClient:
              raise JudgmentAPIError(f"Failed to save classifier scorer: {response.json().get('detail', '')}")

          return response.json()["slug"]
-
+
+
+     def assert_test(
+         self,
+         examples: List[Example],
+         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
+         model: Union[str, List[str], JudgevalJudge],
+         aggregator: Optional[str] = None,
+         metadata: Optional[Dict[str, Any]] = None,
+         log_results: bool = False,
+         project_name: str = "",
+         eval_run_name: str = "",
+         override: bool = False,
+     ) -> None:
+
+         results = self.run_evaluation(
+             examples=examples,
+             scorers=scorers,
+             model=model,
+             aggregator=aggregator,
+             metadata=metadata,
+             log_results=log_results,
+             project_name=project_name,
+             eval_run_name=eval_run_name,
+             override=override
+         )
+
+         assert_test(results)
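The client changes resolve ScorerWrapper instances into API-backed or local implementations via load_implementation(use_judgment=...), switch run_evaluation's defaults to log_results=True with default_project/default_eval_run, and add an assert_test() method that forwards to run_evaluation() and hands the results to run_evaluation.assert_test. A hedged usage sketch; JudgmentClient() construction, Example's field values, and the scorer's constructor arguments are assumptions not confirmed by this diff:

# Sketch only: based on the run_evaluation/assert_test signatures added in this diff.
# JudgmentClient(), Example(...), and AnswerRelevancyScorer(threshold=...) are assumed
# constructor calls for illustration.
from judgeval import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import AnswerRelevancyScorer

client = JudgmentClient()  # hypothetical constructor call
example = Example(
    input="What is the capital of France?",
    actual_output="Paris is the capital of France.",
)

# Scorers may be ScorerWrapper or JudgevalScorer instances; use_judgment selects the
# API-backed implementation (True) or the new local_implementations (False).
results = client.run_evaluation(
    examples=[example],
    scorers=[AnswerRelevancyScorer(threshold=0.7)],
    model="gpt-4o-mini",
    use_judgment=True,
)

# New in 0.0.4: run the same evaluation and pass the results to
# judgeval.run_evaluation.assert_test, which is expected to flag failing scorers.
client.assert_test(
    examples=[example],
    scorers=[AnswerRelevancyScorer(threshold=0.7)],
    model="gpt-4o-mini",
)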