judgeval 0.0.51__py3-none-any.whl → 0.0.53__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. judgeval/common/logger.py +46 -199
  2. judgeval/common/s3_storage.py +2 -6
  3. judgeval/common/tracer.py +182 -262
  4. judgeval/common/utils.py +16 -36
  5. judgeval/constants.py +14 -20
  6. judgeval/data/__init__.py +0 -2
  7. judgeval/data/datasets/dataset.py +6 -10
  8. judgeval/data/datasets/eval_dataset_client.py +25 -27
  9. judgeval/data/example.py +5 -138
  10. judgeval/data/judgment_types.py +214 -0
  11. judgeval/data/result.py +7 -25
  12. judgeval/data/scorer_data.py +28 -40
  13. judgeval/data/scripts/fix_default_factory.py +23 -0
  14. judgeval/data/scripts/openapi_transform.py +123 -0
  15. judgeval/data/tool.py +3 -54
  16. judgeval/data/trace.py +31 -50
  17. judgeval/data/trace_run.py +3 -3
  18. judgeval/evaluation_run.py +16 -23
  19. judgeval/integrations/langgraph.py +11 -12
  20. judgeval/judges/litellm_judge.py +3 -6
  21. judgeval/judges/mixture_of_judges.py +8 -25
  22. judgeval/judges/together_judge.py +3 -6
  23. judgeval/judgment_client.py +22 -24
  24. judgeval/rules.py +7 -19
  25. judgeval/run_evaluation.py +79 -242
  26. judgeval/scorers/__init__.py +4 -20
  27. judgeval/scorers/agent_scorer.py +21 -0
  28. judgeval/scorers/api_scorer.py +28 -38
  29. judgeval/scorers/base_scorer.py +98 -0
  30. judgeval/scorers/example_scorer.py +19 -0
  31. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -20
  32. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +10 -17
  33. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +9 -24
  34. judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +16 -68
  35. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +4 -12
  36. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +4 -4
  37. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +10 -17
  38. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +4 -4
  39. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +4 -4
  40. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +4 -4
  41. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +18 -14
  42. judgeval/scorers/score.py +45 -330
  43. judgeval/scorers/utils.py +6 -88
  44. judgeval/utils/file_utils.py +4 -6
  45. judgeval/version_check.py +3 -2
  46. {judgeval-0.0.51.dist-info → judgeval-0.0.53.dist-info}/METADATA +3 -2
  47. judgeval-0.0.53.dist-info/RECORD +65 -0
  48. judgeval/data/custom_example.py +0 -19
  49. judgeval/scorers/judgeval_scorer.py +0 -177
  50. judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +0 -45
  51. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -29
  52. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -29
  53. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -32
  54. judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +0 -28
  55. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -38
  56. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -27
  57. judgeval/scorers/prompt_scorer.py +0 -296
  58. judgeval-0.0.51.dist-info/RECORD +0 -69
  59. {judgeval-0.0.51.dist-info → judgeval-0.0.53.dist-info}/WHEEL +0 -0
  60. {judgeval-0.0.51.dist-info → judgeval-0.0.53.dist-info}/licenses/LICENSE.md +0 -0
judgeval/judges/mixture_of_judges.py CHANGED
@@ -13,7 +13,7 @@ from judgeval.common.utils import (
     aget_completion_multiple_models,
     aget_chat_completion,
 )
-from judgeval.common.logger import debug, error
+from judgeval.common.logger import judgeval_logger
 
 
 def build_dynamic_mixture_prompt(
@@ -85,14 +85,13 @@ def build_dynamic_mixture_prompt(
     # If a custom system prompt is provided, validate and use it
     if custom_system_prompt is not None:
         if not isinstance(custom_system_prompt, str):
-            error(
+            judgeval_logger.error(
                 f"TypeError: Custom system prompt must be a string. Received: {type(custom_system_prompt)}."
             )
             raise TypeError(
                 f"Custom system prompt must be a string. Received: {type(custom_system_prompt)}."
             )
         if not custom_system_prompt:
-            error("ValueError: Custom system prompt cannot be empty")
             raise ValueError("Custom system prompt cannot be empty")
         # Override the default system prompt, but also add special instructions for handling JSON
         default_conversation[0]["content"] = (
@@ -105,31 +104,21 @@
     # Validate custom conversation history format
     for message in custom_conversation_history:
         if not isinstance(message, dict):
-            error(
-                f"TypeError: Custom conversation history must be a list of dictionaries. Received: {message}."
-            )
             raise TypeError(
                 f"Custom conversation history must be a list of dictionaries. Received: {message}."
             )
 
         if "role" not in message or "content" not in message:
-            error("ValueError: Each message must have 'role' and 'content' keys")
             raise ValueError("Each message must have 'role' and 'content' keys")
 
         if not isinstance(message["role"], str) or not isinstance(
             message["content"], str
         ):
-            error(
-                f"TypeError: Message role and content must be strings. Received: {type(message['role'])}, {type(message['content'])}."
-            )
             raise TypeError(
                 f"Message role and content must be strings. Received: {type(message['role'])}, {type(message['content'])}."
             )
 
         if message["role"] not in ["system", "user", "assistant"]:
-            error(
-                f"ValueError: Message role must be one of: 'system', 'user', 'assistant'. Received: {message['role']}."
-            )
             raise ValueError(
                 f"Message role must be one of: 'system', 'user', 'assistant'. Received: {message['role']}."
             )
@@ -200,7 +189,6 @@ class MixtureOfJudges(JudgevalJudge):
             aggregation_schema (pydantic.BaseModel): Response schema for the aggregator model.
             kwargs: Additional keyword arguments.
         """
-        debug(f"Generating response for input type: {type(input)}")
 
         # Convert input to conversation format if needed
         if isinstance(input, str):
@@ -208,7 +196,7 @@ class MixtureOfJudges(JudgevalJudge):
         elif isinstance(input, list):
             convo = input
         else:
-            error(f"Invalid input type received: {type(input)}")
+            judgeval_logger.error(f"Invalid input type received: {type(input)}")
             raise TypeError(
                 f"Input must be a string or a list of dictionaries. Input type of: {type(input)}"
             )
@@ -219,8 +207,7 @@ class MixtureOfJudges(JudgevalJudge):
                 messages=[convo] * len(self.models),
                 response_formats=[response_schema] * len(self.models),
             )
-        except Exception as e:
-            error(f"Error getting completions from multiple models: {str(e)}")
+        except Exception:
             raise
 
         compiled_mixture_prompt = build_dynamic_mixture_prompt(
@@ -235,8 +222,7 @@ class MixtureOfJudges(JudgevalJudge):
                 messages=compiled_mixture_prompt,
                 response_format=aggregation_schema,
             )
-        except Exception as e:
-            error(f"Error getting chat completion from aggregator: {str(e)}")
+        except Exception:
             raise
 
         return mixed_response
@@ -255,7 +241,6 @@ class MixtureOfJudges(JudgevalJudge):
             aggregation_schema (pydantic.BaseModel): Response schema for the aggregator model.
             kwargs: Additional keyword arguments.
         """
-        debug(f"Generating response for input type: {type(input)}")
 
         # Convert input to conversation format if needed
         if isinstance(input, str):
@@ -263,7 +248,7 @@ class MixtureOfJudges(JudgevalJudge):
         elif isinstance(input, list):
             convo = input
         else:
-            error(f"Invalid input type received: {type(input)}")
+            judgeval_logger.error(f"Invalid input type received: {type(input)}")
             raise TypeError(
                 f"Input must be a string or a list of dictionaries. Input type of: {type(input)}"
             )
@@ -274,8 +259,7 @@ class MixtureOfJudges(JudgevalJudge):
                 messages=[convo] * len(self.models),
                 response_formats=[response_schema] * len(self.models),
             )
-        except Exception as e:
-            error(f"Error getting async completions from multiple models: {str(e)}")
+        except Exception:
             raise
 
         compiled_mixture_prompt = build_dynamic_mixture_prompt(
@@ -290,8 +274,7 @@ class MixtureOfJudges(JudgevalJudge):
                 messages=compiled_mixture_prompt,
                 response_format=aggregation_schema,
             )
-        except Exception as e:
-            error(f"Error getting async chat completion from aggregator: {str(e)}")
+        except Exception:
             raise
 
         return mixed_response
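
Across both the sync and async paths above, 0.0.53 swaps the module-level `debug`/`error` helpers for a shared `judgeval_logger`. A minimal sketch of the new call pattern, assuming `judgeval_logger` behaves like a standard `logging.Logger` (its configuration lives in `judgeval/common/logger.py` and is not shown in this diff):

```python
# Sketch only: mirrors the 0.0.53 logging style shown in the hunks above.
from judgeval.common.logger import judgeval_logger


def normalize_input(input):
    """Illustrative helper (not part of judgeval) using the new logger."""
    if isinstance(input, str):
        return [{"role": "user", "content": input}]
    elif isinstance(input, list):
        return input
    # 0.0.51 called error(...); 0.0.53 routes the same message through judgeval_logger.
    judgeval_logger.error(f"Invalid input type received: {type(input)}")
    raise TypeError("Input must be a string or a list of dictionaries.")
```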
judgeval/judges/together_judge.py CHANGED
@@ -4,13 +4,13 @@ Implementation of using TogetherAI inference for judges.
 
 from pydantic import BaseModel
 from typing import List, Union
-from judgeval.common.logger import debug, error
 
 from judgeval.judges import JudgevalJudge
 from judgeval.common.utils import (
     fetch_together_api_response,
     afetch_together_api_response,
 )
+from judgeval.common.logger import judgeval_logger
 
 BASE_CONVERSATION = [
     {"role": "system", "content": "You are a helpful assistant."},
@@ -19,14 +19,12 @@ BASE_CONVERSATION = [
 
 class TogetherJudge(JudgevalJudge):
     def __init__(self, model: str = "Qwen/Qwen2.5-72B-Instruct-Turbo", **kwargs):
-        debug(f"Initializing TogetherJudge with model={model}")
         self.model = model
         self.kwargs = kwargs
         super().__init__(model_name=model)
 
     # TODO: Fix cost for generate and a_generate
     def generate(self, input: Union[str, List[dict]], schema: BaseModel = None) -> str:
-        debug(f"Generating response for input type: {type(input)}")
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
             return fetch_together_api_response(
@@ -38,13 +36,12 @@ class TogetherJudge(JudgevalJudge):
                 self.model, convo, response_format=schema
             )
         else:
-            error(f"Invalid input type received: {type(input)}")
+            judgeval_logger.error(f"Invalid input type received: {type(input)}")
             raise TypeError("Input must be a string or a list of dictionaries.")
 
     async def a_generate(
         self, input: Union[str, List[dict]], schema: BaseModel = None
     ) -> str:
-        debug(f"Async generating response for input type: {type(input)}")
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
             res = await afetch_together_api_response(
@@ -58,7 +55,7 @@ class TogetherJudge(JudgevalJudge):
             )
             return res
         else:
-            error(f"Invalid input type received: {type(input)}")
+            judgeval_logger.error(f"Invalid input type received: {type(input)}")
             raise TypeError("Input must be a string or a list of dictionaries.")
 
     def load_model(self) -> str:
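
For context, a minimal usage sketch of `TogetherJudge` after this change; the module path comes from the file list above, and the prompt and call pattern are illustrative assumptions:

```python
# Sketch only: exercises the generate() signature shown in the hunks above.
from judgeval.judges.together_judge import TogetherJudge

judge = TogetherJudge()  # defaults to "Qwen/Qwen2.5-72B-Instruct-Turbo" per __init__ above

# generate() accepts a plain string or a list of {"role": ..., "content": ...} dicts;
# any other type now logs via judgeval_logger.error before raising TypeError.
answer = judge.generate("Give one sentence on why evaluating LLM outputs matters.")
print(answer)
```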
judgeval/judgment_client.py CHANGED
@@ -14,12 +14,11 @@ from judgeval.data.datasets import EvalDataset, EvalDatasetClient
 from judgeval.data import (
     ScoringResult,
     Example,
-    CustomExample,
     Trace,
 )
 from judgeval.scorers import (
-    APIJudgmentScorer,
-    JudgevalScorer,
+    APIScorerConfig,
+    BaseScorer,
     ClassifierScorer,
 )
 from judgeval.evaluation_run import EvaluationRun
@@ -41,6 +40,7 @@ from judgeval.common.tracer import Tracer
 from judgeval.common.utils import validate_api_key
 from pydantic import BaseModel
 from judgeval.run_evaluation import SpinnerWrappedTask
+from judgeval.common.logger import judgeval_logger
 
 
 class EvalRunRequestBody(BaseModel):
@@ -68,37 +68,35 @@ class SingletonMeta(type):
 class JudgmentClient(metaclass=SingletonMeta):
     def __init__(
         self,
-        judgment_api_key: Optional[str] = os.getenv("JUDGMENT_API_KEY"),
+        api_key: Optional[str] = os.getenv("JUDGMENT_API_KEY"),
         organization_id: Optional[str] = os.getenv("JUDGMENT_ORG_ID"),
     ):
-        # Check if API key is None
-        if judgment_api_key is None:
+        if not api_key:
             raise ValueError(
-                "JUDGMENT_API_KEY cannot be None. Please provide a valid API key or set the JUDGMENT_API_KEY environment variable."
+                "api_key parameter must be provided. Please provide a valid API key value or set the JUDGMENT_API_KEY environment variable."
             )
 
-        # Check if organization ID is None
-        if organization_id is None:
+        if not organization_id:
             raise ValueError(
-                "JUDGMENT_ORG_ID cannot be None. Please provide a valid organization ID or set the JUDGMENT_ORG_ID environment variable."
+                "organization_id parameter must be provided. Please provide a valid organization ID value or set the JUDGMENT_ORG_ID environment variable."
             )
 
-        self.judgment_api_key = judgment_api_key
+        self.judgment_api_key = api_key
         self.organization_id = organization_id
-        self.eval_dataset_client = EvalDatasetClient(judgment_api_key, organization_id)
+        self.eval_dataset_client = EvalDatasetClient(api_key, organization_id)
 
         # Verify API key is valid
-        result, response = validate_api_key(judgment_api_key)
+        result, response = validate_api_key(api_key)
         if not result:
             # May be bad to output their invalid API key...
             raise JudgmentAPIError(f"Issue with passed in Judgment API key: {response}")
         else:
-            print("Successfully initialized JudgmentClient!")
+            judgeval_logger.info("Successfully initialized JudgmentClient!")
 
     def a_run_evaluation(
         self,
         examples: List[Example],
-        scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
+        scorers: List[Union[APIScorerConfig, BaseScorer]],
         model: Optional[str] = "gpt-4.1",
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
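
The constructor parameter is renamed from `judgment_api_key` to `api_key`, empty strings are now rejected alongside `None`, and success is reported through `judgeval_logger.info` rather than `print`. A minimal initialization sketch under those signatures (callers that passed `judgment_api_key=` by keyword would need to update):

```python
import os

from judgeval.judgment_client import JudgmentClient  # module path from the file list above

# Either export JUDGMENT_API_KEY / JUDGMENT_ORG_ID or pass the values explicitly.
client = JudgmentClient(
    api_key=os.getenv("JUDGMENT_API_KEY"),        # was judgment_api_key in 0.0.51
    organization_id=os.getenv("JUDGMENT_ORG_ID"),
)
```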
@@ -120,7 +118,7 @@ class JudgmentClient(metaclass=SingletonMeta):
 
     def run_trace_evaluation(
         self,
-        scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
+        scorers: List[Union[APIScorerConfig, BaseScorer]],
         examples: Optional[List[Example]] = None,
         function: Optional[Callable] = None,
         tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
@@ -163,8 +161,8 @@ class JudgmentClient(metaclass=SingletonMeta):
 
     def run_evaluation(
         self,
-        examples: Union[List[Example], List[CustomExample]],
-        scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
+        examples: List[Example],
+        scorers: List[Union[APIScorerConfig, BaseScorer]],
         model: Optional[str] = "gpt-4.1",
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
@@ -176,8 +174,8 @@ class JudgmentClient(metaclass=SingletonMeta):
         Executes an evaluation of `Example`s using one or more `Scorer`s
 
         Args:
-            examples (Union[List[Example], List[CustomExample]]): The examples to evaluate
-            scorers (List[Union[APIJudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
+            examples (List[Example]): The examples to evaluate
+            scorers (List[Union[APIScorerConfig, BaseScorer]]): A list of scorers to use for evaluation
             model (str): The model used as a judge when using LLM as a Judge
             project_name (str): The name of the project the evaluation results belong to
             eval_run_name (str): A name for this evaluation run
@@ -450,7 +448,7 @@ class JudgmentClient(metaclass=SingletonMeta):
     def assert_test(
         self,
         examples: List[Example],
-        scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
+        scorers: List[Union[APIScorerConfig, BaseScorer]],
         model: Optional[str] = "gpt-4.1",
         project_name: str = "default_test",
         eval_run_name: str = str(uuid4()),
@@ -463,7 +461,7 @@ class JudgmentClient(metaclass=SingletonMeta):
 
         Args:
             examples (List[Example]): The examples to evaluate.
-            scorers (List[Union[APIJudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
+            scorers (List[Union[APIScorerConfig, BaseScorer]]): A list of scorers to use for evaluation
             model (str): The model used as a judge when using LLM as a Judge
             project_name (str): The name of the project the evaluation results belong to
             eval_run_name (str): A name for this evaluation run
@@ -498,7 +496,7 @@ class JudgmentClient(metaclass=SingletonMeta):
 
     def assert_trace_test(
         self,
-        scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
+        scorers: List[Union[APIScorerConfig, BaseScorer]],
         examples: Optional[List[Example]] = None,
         function: Optional[Callable] = None,
         tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
@@ -516,7 +514,7 @@ class JudgmentClient(metaclass=SingletonMeta):
 
         Args:
             examples (List[Example]): The examples to evaluate.
-            scorers (List[Union[APIJudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
+            scorers (List[Union[APIScorerConfig, BaseScorer]]): A list of scorers to use for evaluation
             model (str): The model used as a judge when using LLM as a Judge
             project_name (str): The name of the project the evaluation results belong to
             eval_run_name (str): A name for this evaluation run
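
All of these method signatures now take `List[Union[APIScorerConfig, BaseScorer]]` for scorers and plain `List[Example]` for examples (`CustomExample` is removed). A hedged sketch of a `run_evaluation` call under the new types; the `FaithfulnessScorer` import location and the `Example` fields are assumptions based on the files listed above, not shown in this diff:

```python
from judgeval.judgment_client import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import FaithfulnessScorer  # assumed re-export; defined under api_scorers/faithfulness.py

client = JudgmentClient()  # reads JUDGMENT_API_KEY / JUDGMENT_ORG_ID from the environment

# Example fields below are illustrative assumptions, not taken from this diff.
example = Example(
    input="What is the capital of France?",
    actual_output="Paris is the capital of France.",
    retrieval_context=["Paris is the capital and largest city of France."],
)

results = client.run_evaluation(
    examples=[example],                           # List[Example]; CustomExample was removed
    scorers=[FaithfulnessScorer(threshold=0.7)],  # APIScorerConfig / BaseScorer instances
    model="gpt-4.1",
    project_name="default_project",
    eval_run_name="faithfulness_smoke_test",
)
```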
judgeval/rules.py CHANGED
@@ -3,12 +3,12 @@ Rules system for Judgeval that enables alerts based on metric thresholds.
 """
 
 from typing import Dict, List, Optional, Union, Any, Tuple
-from pydantic import BaseModel, Field, field_validator, ConfigDict
+from pydantic import BaseModel, Field, ConfigDict
 import asyncio
 from concurrent.futures import ThreadPoolExecutor
 import uuid
 
-from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
+from judgeval.scorers import APIScorerConfig, BaseScorer
 from judgeval.utils.alerts import AlertStatus, AlertResult
 
 
@@ -18,7 +18,7 @@ class Condition(BaseModel):
 
     Example:
     {
-        "metric": FaithfulnessScorer(threshold=0.7) # Must be a scorer object: APIJudgmentScorer, JudgevalScorer
+        "metric": FaithfulnessScorer(threshold=0.7) # Must be a scorer object: APIScorerConfig, BaseScorer
     }
 
     The Condition class uses the scorer's threshold and success function internally.
@@ -26,13 +26,13 @@ class Condition(BaseModel):
 
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
-    metric: Union[APIJudgmentScorer, JudgevalScorer]
+    metric: Union[APIScorerConfig, BaseScorer]
 
     @property
     def metric_name(self) -> str:
         """Get the name of the metric for lookups in scores dictionary."""
         if hasattr(self.metric, "score_type"):
-            # Handle APIJudgmentScorer and JudgevalScorer which have score_type
+            # Handle APIScorerConfig and BaseScorer which have score_type
             return self.metric.score_type
         elif hasattr(self.metric, "__name__"):
             # Handle cases where metric has a __name__ attribute
@@ -58,8 +58,8 @@ class Condition(BaseModel):
         # Use the scorer's success check function if available
         if hasattr(self.metric, "success_check"):
             return self.metric.success_check()
-        elif hasattr(self.metric, "_success_check"):
-            return self.metric._success_check()
+        elif hasattr(self.metric, "success_check"):
+            return self.metric.success_check()
         else:
             # Fallback to default comparison (greater than or equal)
             return value >= self.threshold if self.threshold is not None else False
@@ -241,18 +241,6 @@ class Rule(BaseModel):
 
         return data
 
-    @field_validator("conditions")
-    def validate_conditions_not_empty(cls, v):
-        if not v:
-            raise ValueError("Conditions list cannot be empty")
-        return v
-
-    @field_validator("combine_type")
-    def validate_combine_type(cls, v):
-        if v not in ["all", "any"]:
-            raise ValueError(f"combine_type must be 'all' or 'any', got: {v}")
-        return v
-
 
 class RulesEngine:
     """