judgeval 0.0.16__py3-none-any.whl → 0.0.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. judgeval/__init__.py +1 -3
  2. judgeval/clients.py +0 -6
  3. judgeval/common/logger.py +0 -1
  4. judgeval/common/tracer.py +270 -62
  5. judgeval/common/utils.py +9 -5
  6. judgeval/constants.py +7 -2
  7. judgeval/data/__init__.py +2 -0
  8. judgeval/data/api_example.py +2 -2
  9. judgeval/data/datasets/__init__.py +1 -2
  10. judgeval/data/datasets/dataset.py +4 -5
  11. judgeval/data/datasets/eval_dataset_client.py +11 -7
  12. judgeval/data/datasets/utils.py +1 -2
  13. judgeval/data/example.py +72 -17
  14. judgeval/data/scorer_data.py +1 -1
  15. judgeval/evaluation_run.py +2 -2
  16. judgeval/judges/__init__.py +0 -1
  17. judgeval/judges/base_judge.py +1 -1
  18. judgeval/judges/mixture_of_judges.py +7 -2
  19. judgeval/judgment_client.py +16 -8
  20. judgeval/rules.py +2 -4
  21. judgeval/run_evaluation.py +8 -8
  22. judgeval/scorers/__init__.py +6 -0
  23. judgeval/scorers/api_scorer.py +12 -6
  24. judgeval/scorers/base_scorer.py +12 -6
  25. judgeval/scorers/judgeval_scorer.py +7 -3
  26. judgeval/scorers/judgeval_scorers/__init__.py +24 -3
  27. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +6 -0
  28. judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +35 -0
  29. judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +19 -0
  30. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +19 -0
  31. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +4 -1
  32. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -1
  33. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +2 -2
  34. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +7 -6
  35. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +2 -2
  36. judgeval/scorers/judgeval_scorers/local_implementations/comparison/__init__.py +0 -0
  37. judgeval/scorers/judgeval_scorers/local_implementations/comparison/comparison_scorer.py +161 -0
  38. judgeval/scorers/judgeval_scorers/local_implementations/comparison/prompts.py +222 -0
  39. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +2 -2
  40. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +2 -2
  41. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +2 -2
  42. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +1 -8
  43. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +7 -6
  44. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +2 -2
  45. judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/instruction_adherence.py +232 -0
  46. judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/prompt.py +102 -0
  47. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +7 -7
  48. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +7 -6
  49. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +1 -2
  50. judgeval/scorers/prompt_scorer.py +7 -5
  51. judgeval/scorers/utils.py +1 -1
  52. {judgeval-0.0.16.dist-info → judgeval-0.0.18.dist-info}/METADATA +1 -1
  53. {judgeval-0.0.16.dist-info → judgeval-0.0.18.dist-info}/RECORD +56 -48
  54. /judgeval/data/{datasets/ground_truth.py → ground_truth.py} +0 -0
  55. {judgeval-0.0.16.dist-info → judgeval-0.0.18.dist-info}/WHEEL +0 -0
  56. {judgeval-0.0.16.dist-info → judgeval-0.0.18.dist-info}/licenses/LICENSE.md +0 -0
judgeval/data/example.py CHANGED
@@ -2,17 +2,12 @@
  Classes for representing examples in a dataset.
  """

-
- from typing import TypeVar, Optional, Any, Dict, List
+ from typing import Optional, Any, Dict, List
  from uuid import uuid4
- from pydantic import BaseModel, Field, field_validator
+ from pydantic import BaseModel, Field
  from enum import Enum
  from datetime import datetime
- import time
-

- Input = TypeVar('Input')
- Output = TypeVar('Output')

  class ExampleParams(Enum):
      INPUT = "input"
@@ -23,11 +18,12 @@ class ExampleParams(Enum):
      TOOLS_CALLED = "tools_called"
      EXPECTED_TOOLS = "expected_tools"
      REASONING = "reasoning"
+     ADDITIONAL_METADATA = "additional_metadata"


  class Example(BaseModel):
-     input: Input
-     actual_output: Output
+     input: str
+     actual_output: str
      expected_output: Optional[str] = None
      context: Optional[List[str]] = None
      retrieval_context: Optional[List[str]] = None
@@ -39,22 +35,81 @@ class Example(BaseModel):
      example_index: Optional[int] = None
      timestamp: Optional[str] = None
      trace_id: Optional[str] = None
-
-     @field_validator('input', 'actual_output', mode='before')
-     def convert_to_str(cls, value):
-         try:
-             return str(value)
-         except Exception:
-             return repr(value)

      def __init__(self, **data):
+         # Check that required fields are provided
+         if 'input' not in data:
+             raise ValueError("Example must be initialized with 'input' field.")
+         if 'actual_output' not in data:
+             raise ValueError("Example must be initialized with 'actual_output' field.")
+
          if 'example_id' not in data:
              data['example_id'] = str(uuid4())
          # Set timestamp if not provided
          if 'timestamp' not in data:
              data['timestamp'] = datetime.now().strftime("%Y%m%d_%H%M%S")
          super().__init__(**data)
-
+
+     @field_validator('input', mode='before')
+     @classmethod
+     def validate_input(cls, v):
+         if not v or not isinstance(v, str):
+             raise ValueError(f"Input must be a non-empty string but got '{v}' of type {type(v)}")
+         return v
+
+     @field_validator('actual_output', mode='before')
+     @classmethod
+     def validate_actual_output(cls, v):
+         if not isinstance(v, str):
+             raise ValueError(f"Actual output must be a string but got '{v}' of type {type(v)}")
+         return v
+
+     @field_validator('expected_output', mode='before')
+     @classmethod
+     def validate_expected_output(cls, v):
+         if v is not None and not isinstance(v, str):
+             raise ValueError(f"Expected output must be a string or None but got {v} of type {type(v)}")
+         return v
+
+     @field_validator('context', 'retrieval_context', 'tools_called', 'expected_tools', mode='before')
+     @classmethod
+     def validate_string_lists(cls, v, info):
+         field_name = info.field_name
+         if v is not None:
+             if not isinstance(v, list):
+                 raise ValueError(f"{field_name} must be a list of strings or None but got {v} of type {type(v)}")
+             for i, item in enumerate(v):
+                 if not isinstance(item, str):
+                     raise ValueError(f"All items in {field_name} must be strings but item at index {i} is {item} of type {type(item)}")
+         return v
+
+     @field_validator('additional_metadata', mode='before')
+     @classmethod
+     def validate_additional_metadata(cls, v):
+         if v is not None and not isinstance(v, dict):
+             raise ValueError(f"Additional metadata must be a dictionary or None but got {v} of type {type(v)}")
+         return v
+
+     @field_validator('example_index', mode='before')
+     @classmethod
+     def validate_example_index(cls, v):
+         if v is not None and not isinstance(v, int):
+             raise ValueError(f"Example index must be an integer or None but got {v} of type {type(v)}")
+         return v
+
+     @field_validator('timestamp', mode='before')
+     @classmethod
+     def validate_timestamp(cls, v):
+         if v is not None and not isinstance(v, str):
+             raise ValueError(f"Timestamp must be a string or None but got {v} of type {type(v)}")
+         return v
+
+     @field_validator('trace_id', mode='before')
+     @classmethod
+     def validate_trace_id(cls, v):
+         if v is not None and not isinstance(v, str):
+             raise ValueError(f"Trace ID must be a string or None but got {v} of type {type(v)}")
+         return v

      def to_dict(self):
          return {
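
In 0.0.18, `Example.input` and `Example.actual_output` are typed as plain strings and the old coerce-to-`str` validator is gone, so construction fails fast on missing or mistyped fields. Below is a minimal sketch of the intended behavior, assuming the validators shown above are active in the released package (the question/answer strings are invented; note the same import hunk drops `field_validator` from the pydantic import, so treat this as a sketch of intent rather than guaranteed runtime output):

    from judgeval.data import Example

    # Valid: 'input' and 'actual_output' are now required and must be strings.
    example = Example(
        input="What is the capital of France?",           # made-up question
        actual_output="Paris",                             # made-up answer
        retrieval_context=["France's capital is Paris."],  # list fields must contain only strings
    )

    # A missing 'actual_output' now fails in __init__ before Pydantic runs.
    try:
        Example(input="What is the capital of France?")
    except ValueError as e:
        print(e)  # Example must be initialized with 'actual_output' field.

    # Non-string values are rejected instead of being coerced with str().
    try:
        Example(input=42, actual_output="Paris")
    except ValueError as e:
        print(e)  # message includes: Input must be a non-empty string but got '42' ...
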
judgeval/data/scorer_data.py CHANGED
@@ -5,7 +5,7 @@ ScorerData holds the information related to a single, completed Scorer evaluatio
  """

  from typing import List, Union, Optional, Dict
- from pydantic import BaseModel, Field
+ from pydantic import BaseModel

  from judgeval.scorers import JudgevalScorer

judgeval/evaluation_run.py CHANGED
@@ -111,7 +111,7 @@ class EvaluationRun(BaseModel):
          # Check if model is string or list of strings
          if isinstance(v, str):
              if v not in ACCEPTABLE_MODELS:
-                 raise ValueError(f"Model name {v} not recognized.")
+                 raise ValueError(f"Model name {v} not recognized. Please select a valid model name.)")
              return v

          if isinstance(v, list):
@@ -119,7 +119,7 @@ class EvaluationRun(BaseModel):
                  raise ValueError("When providing a list of models, all elements must be strings")
              for m in v:
                  if m not in ACCEPTABLE_MODELS:
-                     raise ValueError(f"Model name {m} not recognized.")
+                     raise ValueError(f"Model name {m} not recognized. Please select a valid model name.")
              return v
          raise ValueError(f"Model must be one of: string, list of strings, or JudgevalJudge instance. Received type {type(v)}.")

judgeval/judges/__init__.py CHANGED
@@ -1,4 +1,3 @@
- from pydantic import BaseModel
  from judgeval.judges.base_judge import JudgevalJudge
  from judgeval.judges.litellm_judge import LiteLLMJudge
  from judgeval.judges.together_judge import TogetherJudge
judgeval/judges/base_judge.py CHANGED
@@ -3,7 +3,7 @@ Implements the base class for all Judgeval Judge models.
  """

  from abc import ABC, abstractmethod
- from typing import Optional, List
+ from typing import Optional


  class JudgevalJudge(ABC):
judgeval/judges/mixture_of_judges.py CHANGED
@@ -5,9 +5,14 @@ Enables client to use multiple models to generate responses and then aggregate t
  """
  from judgeval import *
  import pydantic
- from typing import List, Union, Mapping, Dict
+ from typing import List, Union, Mapping
  from judgeval.judges import JudgevalJudge
- from judgeval.common.utils import get_completion_multiple_models, get_chat_completion, aget_completion_multiple_models, aget_chat_completion
+ from judgeval.common.utils import (
+     get_completion_multiple_models,
+     get_chat_completion,
+     aget_completion_multiple_models,
+     aget_chat_completion
+ )
  from judgeval.common.logger import debug, error

  def build_dynamic_mixture_prompt(
judgeval/judgment_client.py CHANGED
@@ -6,17 +6,17 @@ from typing import Optional, List, Dict, Any, Union
  import requests

  from judgeval.constants import ROOT_API
- from judgeval.data.datasets import EvalDataset, EvalDatasetClient, GroundTruthExample
+ from judgeval.data.datasets import EvalDataset, EvalDatasetClient
  from judgeval.data import (
      ScoringResult,
-     Example
+     Example,
+     GroundTruthExample
  )
  from judgeval.scorers import (
      APIJudgmentScorer,
      JudgevalScorer,
      ClassifierScorer,
      ScorerWrapper,
-     score,
  )
  from judgeval.evaluation_run import EvaluationRun
  from judgeval.run_evaluation import (
@@ -24,7 +24,11 @@ from judgeval.run_evaluation import (
      assert_test
  )
  from judgeval.judges import JudgevalJudge
- from judgeval.constants import JUDGMENT_EVAL_FETCH_API_URL, JUDGMENT_EVAL_DELETE_API_URL, JUDGMENT_EVAL_DELETE_PROJECT_API_URL
+ from judgeval.constants import (
+     JUDGMENT_EVAL_FETCH_API_URL,
+     JUDGMENT_EVAL_DELETE_API_URL,
+     JUDGMENT_EVAL_DELETE_PROJECT_API_URL
+ )
  from judgeval.common.exceptions import JudgmentAPIError
  from pydantic import BaseModel
  from judgeval.rules import Rule
@@ -306,7 +310,8 @@ class JudgmentClient:
                  "Authorization": f"Bearer {self.judgment_api_key}",
                  "X-Organization-Id": self.organization_id
              },
-             json=eval_run_request_body.model_dump())
+             json=eval_run_request_body.model_dump(),
+             verify=True)
          if eval_run.status_code != requests.codes.ok:
              raise ValueError(f"Error fetching eval results: {eval_run.json()}")

@@ -378,7 +383,8 @@ class JudgmentClient:
                  "Content-Type": "application/json",
                  "Authorization": f"Bearer {self.judgment_api_key}",
              },
-             json={}  # Empty body now
+             json={},  # Empty body now
+             verify=True
          )
          if response.status_code == 200:
              return True, response.json()
@@ -409,7 +415,8 @@ class JudgmentClient:
                  "Content-Type": "application/json",
                  "Authorization": f"Bearer {self.judgment_api_key}",
                  "X-Organization-Id": self.organization_id
-             }
+             },
+             verify=True
          )

          if response.status_code == 500:
@@ -452,7 +459,8 @@ class JudgmentClient:
                  "Content-Type": "application/json",
                  "Authorization": f"Bearer {self.judgment_api_key}",
                  "X-Organization-Id": self.organization_id
-             }
+             },
+             verify=True
          )

          if response.status_code == 500:
judgeval/rules.py CHANGED
@@ -5,14 +5,12 @@ Rules system for Judgeval that enables alerts based on metric thresholds.
  from typing import Dict, List, Optional, Union, Any, Set, Tuple
  from pydantic import BaseModel, Field, field_validator, ConfigDict
  from enum import Enum
- from datetime import datetime
  import asyncio
  from concurrent.futures import ThreadPoolExecutor
  import time
- import uuid  # Add import for uuid module
+ import uuid

- from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
- from judgeval.scorers.judgeval_scorers import ScorerWrapper  # Import from the correct module
+ from judgeval.scorers import APIJudgmentScorer, JudgevalScorer, ScorerWrapper

  class AlertStatus(str, Enum):
      """Status of an alert evaluation."""
judgeval/run_evaluation.py CHANGED
@@ -5,7 +5,6 @@ from datetime import datetime
  from rich import print as rprint

  from judgeval.data import (
-     Example,
      ScorerData,
      ScoringResult
  )
@@ -25,13 +24,11 @@ from judgeval.constants import (
  from judgeval.common.exceptions import JudgmentAPIError
  from judgeval.evaluation_run import EvaluationRun
  from judgeval.common.logger import (
-     enable_logging,
      debug,
      info,
      error,
      example_logging_context
  )
- from judgeval.rules import RulesEngine, Rule, AlertResult, AlertStatus


  def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
@@ -55,7 +52,8 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
                  "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
                  "X-Organization-Id": evaluation_run.organization_id
              },
-             json=payload)
+             json=payload,
+             verify=True)
          response_data = response.json()
      except Exception as e:
          error(f"Error: {e}")
@@ -168,12 +166,13 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
              "eval_name": eval_name,
              "project_name": project_name,
              "judgment_api_key": judgment_api_key,
-         }
+         },
+         verify=True
      )

      if response.status_code == 409:
-         error(f"Evaluation run name '{eval_name}' already exists for this project")
-         raise ValueError(f"Evaluation run name '{eval_name}' already exists for this project")
+         error(f"Eval run name '{eval_name}' already exists for this project. Please choose a different name or set the `override` flag to true.")
+         raise ValueError(f"Eval run name '{eval_name}' already exists for this project. Please choose a different name or set the `override` flag to true.")

      if not response.ok:
          response_data = response.json()
@@ -210,7 +209,8 @@ def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run:
              "results": [result.to_dict() for result in merged_results],
              "project_name": evaluation_run.project_name,
              "eval_name": evaluation_run.eval_name,
-         }
+         },
+         verify=True
      )

      if not res.ok:
judgeval/scorers/__init__.py CHANGED
@@ -14,6 +14,9 @@ from judgeval.scorers.judgeval_scorers import (
      ScorerWrapper,
      AnswerCorrectnessScorer,
      Text2SQLScorer,
+     ComparisonScorer,
+     InstructionAdherenceScorer,
+     GroundednessScorer,
  )

  __all__ = [
@@ -33,4 +36,7 @@ __all__ = [
      "ScorerWrapper",
      "AnswerCorrectnessScorer",
      "Text2SQLScorer",
+     "ComparisonScorer",
+     "InstructionAdherenceScorer",
+     "GroundednessScorer",
  ]
judgeval/scorers/api_scorer.py CHANGED
@@ -7,7 +7,7 @@ Scores `Example`s using ready-made Judgment evaluators.
  from pydantic import BaseModel, field_validator
  from judgeval.common.logger import debug, info, warning, error

- from judgeval.constants import APIScorer
+ from judgeval.constants import APIScorer, UNBOUNDED_SCORERS


  class APIJudgmentScorer(BaseModel):
@@ -18,17 +18,23 @@ class APIJudgmentScorer(BaseModel):
          score_type (APIScorer): The Judgment metric to use for scoring `Example`s
          threshold (float): A value between 0 and 1 that determines the scoring threshold
      """
-     threshold: float
      score_type: APIScorer
+     threshold: float

      @field_validator('threshold')
-     def validate_threshold(cls, v):
+     def validate_threshold(cls, v, info):
          """
          Validates that the threshold is between 0 and 1 inclusive.
          """
-         if not 0 <= v <= 1:
-             error(f"Threshold must be between 0 and 1, got: {v}")
-             raise ValueError(f"Threshold must be between 0 and 1, got: {v}")
+         score_type = info.data.get('score_type')
+         if score_type in UNBOUNDED_SCORERS:
+             if v < 0:
+                 error(f"Threshold for {score_type} must be greater than 0, got: {v}")
+                 raise ValueError(f"Threshold for {score_type} must be greater than 0, got: {v}")
+         else:
+             if not 0 <= v <= 1:
+                 error(f"Threshold for {score_type} must be between 0 and 1, got: {v}")
+                 raise ValueError(f"Threshold for {score_type} must be between 0 and 1, got: {v}")
          return v

      @field_validator('score_type')
judgeval/scorers/base_scorer.py CHANGED
@@ -7,7 +7,7 @@ Scores `Example`s using ready-made Judgment evaluators.
  from pydantic import BaseModel, field_validator
  from judgeval.common.logger import debug, info, warning, error

- from judgeval.constants import APIScorer
+ from judgeval.constants import APIScorer, UNBOUNDED_SCORERS


  class APIJudgmentScorer(BaseModel):
@@ -18,17 +18,23 @@ class APIJudgmentScorer(BaseModel):
          score_type (APIScorer): The Judgment metric to use for scoring `Example`s
          threshold (float): A value between 0 and 1 that determines the scoring threshold
      """
-     threshold: float
      score_type: APIScorer
+     threshold: float

      @field_validator('threshold')
-     def validate_threshold(cls, v):
+     def validate_threshold(cls, v, info):
          """
          Validates that the threshold is between 0 and 1 inclusive.
          """
-         if not 0 <= v <= 1:
-             error(f"Threshold must be between 0 and 1, got: {v}")
-             raise ValueError(f"Threshold must be between 0 and 1, got: {v}")
+         score_type = info.data.get('score_type')
+         if score_type in UNBOUNDED_SCORERS:
+             if v < 0:
+                 error(f"Threshold for {score_type} must be greater than 0, got: {v}")
+                 raise ValueError(f"Threshold for {score_type} must be greater than 0, got: {v}")
+         else:
+             if not 0 <= v <= 1:
+                 error(f"Threshold for {score_type} must be between 0 and 1, got: {v}")
+                 raise ValueError(f"Threshold for {score_type} must be between 0 and 1, got: {v}")
          return v

      @field_validator('score_type')
judgeval/scorers/judgeval_scorer.py CHANGED
@@ -11,7 +11,7 @@ from abc import abstractmethod
  from judgeval.common.logger import debug, info, warning, error
  from judgeval.judges import JudgevalJudge
  from judgeval.judges.utils import create_judge
-
+ from judgeval.constants import UNBOUNDED_SCORERS

  class JudgevalScorer:
      """
@@ -58,8 +58,12 @@ class JudgevalScorer:
          additional_metadata: Optional[Dict] = None
      ):
          debug(f"Initializing JudgevalScorer with score_type={score_type}, threshold={threshold}")
-         if not 0 <= threshold <= 1:
-             raise ValueError("Threshold must be between 0 and 1")
+         if score_type in UNBOUNDED_SCORERS:
+             if threshold < 0:
+                 raise ValueError(f"Threshold for {score_type} must be greater than 0, got: {threshold}")
+         else:
+             if not 0 <= threshold <= 1:
+                 raise ValueError(f"Threshold for {score_type} must be between 0 and 1, got: {threshold}")
          if strict_mode:
              warning("Strict mode enabled - scoring will be more rigorous")
          info(f"JudgevalScorer initialized with evaluation_model: {evaluation_model}")
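
Both `APIJudgmentScorer` and `JudgevalScorer` now branch on `UNBOUNDED_SCORERS` from `judgeval.constants` (the set itself is not shown in this diff), and `score_type` is declared before `threshold` so that `info.data` already holds the score type when the threshold validator runs (Pydantic validates fields in declaration order). A rough sketch of the resulting rules; which `APIScorer` members are actually unbounded is decided in constants.py and is an assumption here:

    from judgeval.constants import APIScorer, UNBOUNDED_SCORERS
    from judgeval.scorers.api_scorer import APIJudgmentScorer

    # A threshold of 0.8 is valid under either rule (it is in [0, 1] and non-negative).
    scorer = APIJudgmentScorer(score_type=APIScorer.GROUNDEDNESS, threshold=0.8)

    # Thresholds above 1 (e.g. a count of tolerated mistakes rather than a fraction)
    # are only accepted for score types listed in UNBOUNDED_SCORERS.
    if APIScorer.COMPARISON in UNBOUNDED_SCORERS:  # assumed membership, defined in constants.py
        unbounded = APIJudgmentScorer(score_type=APIScorer.COMPARISON, threshold=3)
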
judgeval/scorers/judgeval_scorers/__init__.py CHANGED
@@ -1,5 +1,4 @@
  from typing import Type, Optional, Any
- from functools import wraps

  # Import implementations
  from judgeval.scorers.judgeval_scorers.api_scorers import (
@@ -12,7 +11,10 @@ from judgeval.scorers.judgeval_scorers.api_scorers import (
      ContextualPrecisionScorer as APIContextualPrecisionScorer,
      ContextualRecallScorer as APIContextualRecallScorer,
      AnswerRelevancyScorer as APIAnswerRelevancyScorer,
-     AnswerCorrectnessScorer as APIAnswerCorrectnessScorer,
+     AnswerCorrectnessScorer as APIAnswerCorrectnessScorer,
+     ComparisonScorer as APIComparisonScorer,
+     InstructionAdherenceScorer as APIInstructionAdherenceScorer,
+     GroundednessScorer as APIGroundednessScorer,
  )

  from judgeval.scorers.judgeval_scorers.local_implementations import (
@@ -25,7 +27,9 @@ from judgeval.scorers.judgeval_scorers.local_implementations import (
      ToolCorrectnessScorer as LocalToolCorrectnessScorer,
      HallucinationScorer as LocalHallucinationScorer,
      SummarizationScorer as LocalSummarizationScorer,
-     AnswerCorrectnessScorer as LocalAnswerCorrectnessScorer
+     AnswerCorrectnessScorer as LocalAnswerCorrectnessScorer,
+     ComparisonScorer as LocalComparisonScorer,
+     InstructionAdherenceScorer as LocalInstructionAdherenceScorer,
  )

  from judgeval.scorers.judgeval_scorers.classifiers import Text2SQLScorer
@@ -134,6 +138,21 @@ ContextualRecallScorer = ScorerWrapper(
      local_implementation=LocalContextualRecallScorer
  )

+ InstructionAdherenceScorer = ScorerWrapper(
+     api_implementation=APIInstructionAdherenceScorer,
+     local_implementation=LocalInstructionAdherenceScorer
+ )
+
+ def ComparisonScorer(threshold: float, criteria: str, description: str):
+     return ScorerWrapper(
+         api_implementation=APIComparisonScorer,
+         local_implementation=LocalComparisonScorer
+     )(threshold=threshold, criteria=criteria, description=description)
+
+ GroundednessScorer = ScorerWrapper(
+     api_implementation=APIGroundednessScorer,
+ )
+
  __all__ = [
      "ToolCorrectnessScorer",
      "JSONCorrectnessScorer",
@@ -145,4 +164,6 @@ __all__ = [
      "ContextualRecallScorer",
      "AnswerRelevancyScorer",
      "Text2SQLScorer",
+     "ComparisonScorer",
+     "GroundednessScorer",
  ]
judgeval/scorers/judgeval_scorers/api_scorers/__init__.py CHANGED
@@ -8,6 +8,9 @@ from judgeval.scorers.judgeval_scorers.api_scorers.contextual_precision import C
  from judgeval.scorers.judgeval_scorers.api_scorers.contextual_recall import ContextualRecallScorer
  from judgeval.scorers.judgeval_scorers.api_scorers.answer_relevancy import AnswerRelevancyScorer
  from judgeval.scorers.judgeval_scorers.api_scorers.answer_correctness import AnswerCorrectnessScorer
+ from judgeval.scorers.judgeval_scorers.api_scorers.comparison import ComparisonScorer
+ from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import InstructionAdherenceScorer
+ from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import GroundednessScorer

  __all__ = [
      "ToolCorrectnessScorer",
@@ -20,4 +23,7 @@ __all__ = [
      "ContextualRecallScorer",
      "AnswerRelevancyScorer",
      "AnswerCorrectnessScorer",
+     "ComparisonScorer",
+     "InstructionAdherenceScorer",
+     "GroundednessScorer",
  ]
judgeval/scorers/judgeval_scorers/api_scorers/comparison.py ADDED
@@ -0,0 +1,35 @@
+ """
+ `judgeval` comparison scorer
+
+ TODO add link to docs page for this scorer
+
+ """
+
+ # Internal imports
+ from judgeval.scorers.api_scorer import APIJudgmentScorer
+ from judgeval.constants import APIScorer
+ from typing import Optional, Dict
+
+ class ComparisonScorer(APIJudgmentScorer):
+     kwargs: Optional[Dict] = None
+
+     def __init__(self, threshold: float, criteria: str, description: str):
+         super().__init__(threshold=threshold, score_type=APIScorer.COMPARISON)
+         self.kwargs = {"criteria": criteria, "description": description}
+
+     @property
+     def __name__(self):
+         return f"Comparison-{self.kwargs['criteria']}"
+
+     def to_dict(self) -> dict:
+         """
+         Converts the scorer configuration to a dictionary format.
+
+         Returns:
+             dict: A dictionary containing the scorer's configuration
+         """
+         return {
+             "score_type": self.score_type,
+             "threshold": self.threshold,
+             "kwargs": self.kwargs
+         }
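
The comparison scorer is the only new API scorer that carries extra configuration: `criteria` and `description` are stashed in a `kwargs` dict and serialized by `to_dict()`. A hypothetical usage sketch based only on the constructor shown above (the criteria and description strings are invented; the import path is the one added to api_scorers/__init__.py earlier in this diff):

    from judgeval.scorers.judgeval_scorers.api_scorers import ComparisonScorer

    scorer = ComparisonScorer(
        threshold=1,  # kept inside [0, 1] so it passes either threshold rule
        criteria="tone",  # invented example criteria
        description="The answer should match the expected output's tone and formality.",
    )

    print(scorer.__name__)   # Comparison-tone
    print(scorer.to_dict())  # {'score_type': ..., 'threshold': 1.0, 'kwargs': {'criteria': ..., 'description': ...}}
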
judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py ADDED
@@ -0,0 +1,19 @@
+ """
+ `judgeval` Groundedness scorer
+
+ TODO add link to docs page for this scorer
+
+ """
+
+ # Internal imports
+ from judgeval.scorers.api_scorer import APIJudgmentScorer
+ from judgeval.constants import APIScorer
+
+
+ class GroundednessScorer(APIJudgmentScorer):
+     def __init__(self, threshold: float):
+         super().__init__(threshold=threshold, score_type=APIScorer.GROUNDEDNESS)
+
+     @property
+     def __name__(self):
+         return "Groundedness"
judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py ADDED
@@ -0,0 +1,19 @@
+ """
+ `judgeval` instruction adherence scorer
+
+ TODO add link to docs page for this scorer
+
+ """
+
+ # Internal imports
+ from judgeval.scorers.api_scorer import APIJudgmentScorer
+ from judgeval.constants import APIScorer
+
+
+ class InstructionAdherenceScorer(APIJudgmentScorer):
+     def __init__(self, threshold: float):
+         super().__init__(threshold=threshold, score_type=APIScorer.INSTRUCTION_ADHERENCE)
+
+     @property
+     def __name__(self):
+         return "Instruction Adherence"
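
Groundedness and instruction adherence follow the simpler pattern: a threshold-only constructor over `APIJudgmentScorer`. A minimal sketch of constructing them (the thresholds are arbitrary example values; wiring the scorers into an actual evaluation run is not shown in this diff):

    from judgeval.scorers.judgeval_scorers.api_scorers import (
        GroundednessScorer,
        InstructionAdherenceScorer,
    )

    groundedness = GroundednessScorer(threshold=0.8)      # example threshold
    adherence = InstructionAdherenceScorer(threshold=0.9)  # example threshold

    print(groundedness.__name__)  # Groundedness
    print(adherence.__name__)     # Instruction Adherence
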
judgeval/scorers/judgeval_scorers/local_implementations/__init__.py CHANGED
@@ -8,11 +8,13 @@ from judgeval.scorers.judgeval_scorers.local_implementations.tool_correctness.to
  from judgeval.scorers.judgeval_scorers.local_implementations.hallucination.hallucination_scorer import HallucinationScorer
  from judgeval.scorers.judgeval_scorers.local_implementations.summarization.summarization_scorer import SummarizationScorer
  from judgeval.scorers.judgeval_scorers.local_implementations.answer_correctness.answer_correctness_scorer import AnswerCorrectnessScorer
-
+ from judgeval.scorers.judgeval_scorers.local_implementations.comparison.comparison_scorer import ComparisonScorer
+ from judgeval.scorers.judgeval_scorers.local_implementations.instruction_adherence.instruction_adherence import InstructionAdherenceScorer

  __all__ = [
      "AnswerCorrectnessScorer",
      "AnswerRelevancyScorer",
+     "ComparisonScorer",
      "ContextualPrecisionScorer",
      "ContextualRecallScorer",
      "ContextualRelevancyScorer",
@@ -21,4 +23,5 @@ __all__ = [
      "ToolCorrectnessScorer",
      "HallucinationScorer",
      "SummarizationScorer",
+     "InstructionAdherenceScorer",
  ]
judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py CHANGED
@@ -1,5 +1,4 @@
  from typing import Optional, List, Union, Tuple
- from pydantic import BaseModel

  from judgeval.constants import APIScorer
  from judgeval.judges import JudgevalJudge
judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py CHANGED
@@ -2,8 +2,8 @@
  Util prompts for AnswerCorrectnessScorer
  """

- from typing import List, Optional, Tuple
- from pydantic import BaseModel, Field
+ from typing import List, Tuple
+ from pydantic import BaseModel


  # BaseModels to enforce formatting in LLM JSON response