judgeval 0.0.17__py3-none-any.whl → 0.0.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. judgeval/__init__.py +1 -3
  2. judgeval/clients.py +0 -7
  3. judgeval/common/logger.py +0 -1
  4. judgeval/common/tracer.py +250 -42
  5. judgeval/common/utils.py +9 -5
  6. judgeval/constants.py +6 -1
  7. judgeval/data/__init__.py +2 -0
  8. judgeval/data/api_example.py +2 -2
  9. judgeval/data/datasets/__init__.py +1 -2
  10. judgeval/data/datasets/dataset.py +4 -5
  11. judgeval/data/datasets/eval_dataset_client.py +1 -2
  12. judgeval/data/datasets/utils.py +1 -2
  13. judgeval/data/example.py +71 -16
  14. judgeval/data/scorer_data.py +1 -1
  15. judgeval/evaluation_run.py +2 -2
  16. judgeval/judges/__init__.py +0 -1
  17. judgeval/judges/base_judge.py +1 -1
  18. judgeval/judges/mixture_of_judges.py +7 -2
  19. judgeval/judgment_client.py +8 -4
  20. judgeval/rules.py +2 -4
  21. judgeval/run_evaluation.py +2 -5
  22. judgeval/scorers/__init__.py +6 -0
  23. judgeval/scorers/api_scorer.py +12 -6
  24. judgeval/scorers/base_scorer.py +12 -6
  25. judgeval/scorers/judgeval_scorer.py +7 -3
  26. judgeval/scorers/judgeval_scorers/__init__.py +24 -3
  27. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +6 -0
  28. judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +35 -0
  29. judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +19 -0
  30. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +19 -0
  31. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +4 -1
  32. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -1
  33. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +2 -2
  34. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +7 -6
  35. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +2 -2
  36. judgeval/scorers/judgeval_scorers/local_implementations/comparison/__init__.py +0 -0
  37. judgeval/scorers/judgeval_scorers/local_implementations/comparison/comparison_scorer.py +161 -0
  38. judgeval/scorers/judgeval_scorers/local_implementations/comparison/prompts.py +222 -0
  39. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +2 -2
  40. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +2 -2
  41. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +2 -2
  42. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +1 -8
  43. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +7 -6
  44. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +2 -2
  45. judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/instruction_adherence.py +232 -0
  46. judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/prompt.py +102 -0
  47. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +7 -7
  48. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +7 -6
  49. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +1 -2
  50. judgeval/scorers/prompt_scorer.py +7 -5
  51. judgeval/scorers/utils.py +1 -1
  52. {judgeval-0.0.17.dist-info → judgeval-0.0.19.dist-info}/METADATA +1 -1
  53. {judgeval-0.0.17.dist-info → judgeval-0.0.19.dist-info}/RECORD +56 -48
  54. /judgeval/data/{datasets/ground_truth.py → ground_truth.py} +0 -0
  55. {judgeval-0.0.17.dist-info → judgeval-0.0.19.dist-info}/WHEEL +0 -0
  56. {judgeval-0.0.17.dist-info → judgeval-0.0.19.dist-info}/licenses/LICENSE.md +0 -0
judgeval/data/scorer_data.py CHANGED
@@ -5,7 +5,7 @@ ScorerData holds the information related to a single, completed Scorer evaluatio
  """

  from typing import List, Union, Optional, Dict
- from pydantic import BaseModel, Field
+ from pydantic import BaseModel

  from judgeval.scorers import JudgevalScorer

judgeval/evaluation_run.py CHANGED
@@ -111,7 +111,7 @@ class EvaluationRun(BaseModel):
          # Check if model is string or list of strings
          if isinstance(v, str):
              if v not in ACCEPTABLE_MODELS:
-                 raise ValueError(f"Model name {v} not recognized.")
+                 raise ValueError(f"Model name {v} not recognized. Please select a valid model name.)")
              return v

          if isinstance(v, list):
@@ -119,7 +119,7 @@ class EvaluationRun(BaseModel):
                  raise ValueError("When providing a list of models, all elements must be strings")
              for m in v:
                  if m not in ACCEPTABLE_MODELS:
-                     raise ValueError(f"Model name {m} not recognized.")
+                     raise ValueError(f"Model name {m} not recognized. Please select a valid model name.")
              return v

          raise ValueError(f"Model must be one of: string, list of strings, or JudgevalJudge instance. Received type {type(v)}.")
judgeval/judges/__init__.py CHANGED
@@ -1,4 +1,3 @@
- from pydantic import BaseModel
  from judgeval.judges.base_judge import JudgevalJudge
  from judgeval.judges.litellm_judge import LiteLLMJudge
  from judgeval.judges.together_judge import TogetherJudge
judgeval/judges/base_judge.py CHANGED
@@ -3,7 +3,7 @@ Implements the base class for all Judgeval Judge models.
  """

  from abc import ABC, abstractmethod
- from typing import Optional, List
+ from typing import Optional


  class JudgevalJudge(ABC):
judgeval/judges/mixture_of_judges.py CHANGED
@@ -5,9 +5,14 @@ Enables client to use multiple models to generate responses and then aggregate t
  """
  from judgeval import *
  import pydantic
- from typing import List, Union, Mapping, Dict
+ from typing import List, Union, Mapping
  from judgeval.judges import JudgevalJudge
- from judgeval.common.utils import get_completion_multiple_models, get_chat_completion, aget_completion_multiple_models, aget_chat_completion
+ from judgeval.common.utils import (
+     get_completion_multiple_models,
+     get_chat_completion,
+     aget_completion_multiple_models,
+     aget_chat_completion
+ )
  from judgeval.common.logger import debug, error

  def build_dynamic_mixture_prompt(
judgeval/judgment_client.py CHANGED
@@ -6,17 +6,17 @@ from typing import Optional, List, Dict, Any, Union
  import requests

  from judgeval.constants import ROOT_API
- from judgeval.data.datasets import EvalDataset, EvalDatasetClient, GroundTruthExample
+ from judgeval.data.datasets import EvalDataset, EvalDatasetClient
  from judgeval.data import (
      ScoringResult,
-     Example
+     Example,
+     GroundTruthExample
  )
  from judgeval.scorers import (
      APIJudgmentScorer,
      JudgevalScorer,
      ClassifierScorer,
      ScorerWrapper,
-     score,
  )
  from judgeval.evaluation_run import EvaluationRun
  from judgeval.run_evaluation import (
@@ -24,7 +24,11 @@ from judgeval.run_evaluation import (
      assert_test
  )
  from judgeval.judges import JudgevalJudge
- from judgeval.constants import JUDGMENT_EVAL_FETCH_API_URL, JUDGMENT_EVAL_DELETE_API_URL, JUDGMENT_EVAL_DELETE_PROJECT_API_URL
+ from judgeval.constants import (
+     JUDGMENT_EVAL_FETCH_API_URL,
+     JUDGMENT_EVAL_DELETE_API_URL,
+     JUDGMENT_EVAL_DELETE_PROJECT_API_URL
+ )
  from judgeval.common.exceptions import JudgmentAPIError
  from pydantic import BaseModel
  from judgeval.rules import Rule
judgeval/rules.py CHANGED
@@ -5,14 +5,12 @@ Rules system for Judgeval that enables alerts based on metric thresholds.
  from typing import Dict, List, Optional, Union, Any, Set, Tuple
  from pydantic import BaseModel, Field, field_validator, ConfigDict
  from enum import Enum
- from datetime import datetime
  import asyncio
  from concurrent.futures import ThreadPoolExecutor
  import time
- import uuid # Add import for uuid module
+ import uuid

- from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
- from judgeval.scorers.judgeval_scorers import ScorerWrapper # Import from the correct module
+ from judgeval.scorers import APIJudgmentScorer, JudgevalScorer, ScorerWrapper

  class AlertStatus(str, Enum):
      """Status of an alert evaluation."""
judgeval/run_evaluation.py CHANGED
@@ -5,7 +5,6 @@ from datetime import datetime
  from rich import print as rprint

  from judgeval.data import (
-     Example,
      ScorerData,
      ScoringResult
  )
@@ -25,13 +24,11 @@ from judgeval.constants import (
  from judgeval.common.exceptions import JudgmentAPIError
  from judgeval.evaluation_run import EvaluationRun
  from judgeval.common.logger import (
-     enable_logging,
      debug,
      info,
      error,
      example_logging_context
  )
- from judgeval.rules import RulesEngine, Rule, AlertResult, AlertStatus


  def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
@@ -174,8 +171,8 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
      )

      if response.status_code == 409:
-         error(f"Evaluation run name '{eval_name}' already exists for this project")
-         raise ValueError(f"Evaluation run name '{eval_name}' already exists for this project")
+         error(f"Eval run name '{eval_name}' already exists for this project. Please choose a different name or set the `override` flag to true.")
+         raise ValueError(f"Eval run name '{eval_name}' already exists for this project. Please choose a different name or set the `override` flag to true.")

      if not response.ok:
          response_data = response.json()
judgeval/scorers/__init__.py CHANGED
@@ -14,6 +14,9 @@ from judgeval.scorers.judgeval_scorers import (
      ScorerWrapper,
      AnswerCorrectnessScorer,
      Text2SQLScorer,
+     ComparisonScorer,
+     InstructionAdherenceScorer,
+     GroundednessScorer,
  )

  __all__ = [
@@ -33,4 +36,7 @@ __all__ = [
      "ScorerWrapper",
      "AnswerCorrectnessScorer",
      "Text2SQLScorer",
+     "ComparisonScorer",
+     "InstructionAdherenceScorer",
+     "GroundednessScorer",
  ]
judgeval/scorers/api_scorer.py CHANGED
@@ -7,7 +7,7 @@ Scores `Example`s using ready-made Judgment evaluators.
  from pydantic import BaseModel, field_validator
  from judgeval.common.logger import debug, info, warning, error

- from judgeval.constants import APIScorer
+ from judgeval.constants import APIScorer, UNBOUNDED_SCORERS


  class APIJudgmentScorer(BaseModel):
@@ -18,17 +18,23 @@ class APIJudgmentScorer(BaseModel):
          score_type (APIScorer): The Judgment metric to use for scoring `Example`s
          threshold (float): A value between 0 and 1 that determines the scoring threshold
      """
-     threshold: float
      score_type: APIScorer
+     threshold: float

      @field_validator('threshold')
-     def validate_threshold(cls, v):
+     def validate_threshold(cls, v, info):
          """
          Validates that the threshold is between 0 and 1 inclusive.
          """
-         if not 0 <= v <= 1:
-             error(f"Threshold must be between 0 and 1, got: {v}")
-             raise ValueError(f"Threshold must be between 0 and 1, got: {v}")
+         score_type = info.data.get('score_type')
+         if score_type in UNBOUNDED_SCORERS:
+             if v < 0:
+                 error(f"Threshold for {score_type} must be greater than 0, got: {v}")
+                 raise ValueError(f"Threshold for {score_type} must be greater than 0, got: {v}")
+         else:
+             if not 0 <= v <= 1:
+                 error(f"Threshold for {score_type} must be between 0 and 1, got: {v}")
+                 raise ValueError(f"Threshold for {score_type} must be between 0 and 1, got: {v}")
          return v

      @field_validator('score_type')
judgeval/scorers/base_scorer.py CHANGED
@@ -7,7 +7,7 @@ Scores `Example`s using ready-made Judgment evaluators.
  from pydantic import BaseModel, field_validator
  from judgeval.common.logger import debug, info, warning, error

- from judgeval.constants import APIScorer
+ from judgeval.constants import APIScorer, UNBOUNDED_SCORERS


  class APIJudgmentScorer(BaseModel):
@@ -18,17 +18,23 @@ class APIJudgmentScorer(BaseModel):
          score_type (APIScorer): The Judgment metric to use for scoring `Example`s
          threshold (float): A value between 0 and 1 that determines the scoring threshold
      """
-     threshold: float
      score_type: APIScorer
+     threshold: float

      @field_validator('threshold')
-     def validate_threshold(cls, v):
+     def validate_threshold(cls, v, info):
          """
          Validates that the threshold is between 0 and 1 inclusive.
          """
-         if not 0 <= v <= 1:
-             error(f"Threshold must be between 0 and 1, got: {v}")
-             raise ValueError(f"Threshold must be between 0 and 1, got: {v}")
+         score_type = info.data.get('score_type')
+         if score_type in UNBOUNDED_SCORERS:
+             if v < 0:
+                 error(f"Threshold for {score_type} must be greater than 0, got: {v}")
+                 raise ValueError(f"Threshold for {score_type} must be greater than 0, got: {v}")
+         else:
+             if not 0 <= v <= 1:
+                 error(f"Threshold for {score_type} must be between 0 and 1, got: {v}")
+                 raise ValueError(f"Threshold for {score_type} must be between 0 and 1, got: {v}")
          return v

      @field_validator('score_type')
judgeval/scorers/judgeval_scorer.py CHANGED
@@ -11,7 +11,7 @@ from abc import abstractmethod
  from judgeval.common.logger import debug, info, warning, error
  from judgeval.judges import JudgevalJudge
  from judgeval.judges.utils import create_judge
-
+ from judgeval.constants import UNBOUNDED_SCORERS

  class JudgevalScorer:
      """
@@ -58,8 +58,12 @@ class JudgevalScorer:
          additional_metadata: Optional[Dict] = None
      ):
          debug(f"Initializing JudgevalScorer with score_type={score_type}, threshold={threshold}")
-         if not 0 <= threshold <= 1:
-             raise ValueError("Threshold must be between 0 and 1")
+         if score_type in UNBOUNDED_SCORERS:
+             if threshold < 0:
+                 raise ValueError(f"Threshold for {score_type} must be greater than 0, got: {threshold}")
+         else:
+             if not 0 <= threshold <= 1:
+                 raise ValueError(f"Threshold for {score_type} must be between 0 and 1, got: {threshold}")
          if strict_mode:
              warning("Strict mode enabled - scoring will be more rigorous")
          info(f"JudgevalScorer initialized with evaluation_model: {evaluation_model}")
judgeval/scorers/judgeval_scorers/__init__.py CHANGED
@@ -1,5 +1,4 @@
  from typing import Type, Optional, Any
- from functools import wraps

  # Import implementations
  from judgeval.scorers.judgeval_scorers.api_scorers import (
@@ -12,7 +11,10 @@ from judgeval.scorers.judgeval_scorers.api_scorers import (
      ContextualPrecisionScorer as APIContextualPrecisionScorer,
      ContextualRecallScorer as APIContextualRecallScorer,
      AnswerRelevancyScorer as APIAnswerRelevancyScorer,
-     AnswerCorrectnessScorer as APIAnswerCorrectnessScorer,
+     AnswerCorrectnessScorer as APIAnswerCorrectnessScorer,
+     ComparisonScorer as APIComparisonScorer,
+     InstructionAdherenceScorer as APIInstructionAdherenceScorer,
+     GroundednessScorer as APIGroundednessScorer,
  )

  from judgeval.scorers.judgeval_scorers.local_implementations import (
@@ -25,7 +27,9 @@ from judgeval.scorers.judgeval_scorers.local_implementations import (
      ToolCorrectnessScorer as LocalToolCorrectnessScorer,
      HallucinationScorer as LocalHallucinationScorer,
      SummarizationScorer as LocalSummarizationScorer,
-     AnswerCorrectnessScorer as LocalAnswerCorrectnessScorer
+     AnswerCorrectnessScorer as LocalAnswerCorrectnessScorer,
+     ComparisonScorer as LocalComparisonScorer,
+     InstructionAdherenceScorer as LocalInstructionAdherenceScorer,
  )

  from judgeval.scorers.judgeval_scorers.classifiers import Text2SQLScorer
@@ -134,6 +138,21 @@ ContextualRecallScorer = ScorerWrapper(
      local_implementation=LocalContextualRecallScorer
  )

+ InstructionAdherenceScorer = ScorerWrapper(
+     api_implementation=APIInstructionAdherenceScorer,
+     local_implementation=LocalInstructionAdherenceScorer
+ )
+
+ def ComparisonScorer(threshold: float, criteria: str, description: str):
+     return ScorerWrapper(
+         api_implementation=APIComparisonScorer,
+         local_implementation=LocalComparisonScorer
+     )(threshold=threshold, criteria=criteria, description=description)
+
+ GroundednessScorer = ScorerWrapper(
+     api_implementation=APIGroundednessScorer,
+ )
+
  __all__ = [
      "ToolCorrectnessScorer",
      "JSONCorrectnessScorer",
@@ -145,4 +164,6 @@ __all__ = [
      "ContextualRecallScorer",
      "AnswerRelevancyScorer",
      "Text2SQLScorer",
+     "ComparisonScorer",
+     "GroundednessScorer",
  ]
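
Based only on the wrapper definitions above, the new scorers would be used from judgeval.scorers roughly as follows. This is a hedged sketch: how ScorerWrapper resolves to the API or local implementation is not shown in this diff, and the call pattern for the non-factory wrappers is inferred from the ComparisonScorer factory above.

from judgeval.scorers import ComparisonScorer, GroundednessScorer, InstructionAdherenceScorer

# ComparisonScorer is a factory function: it builds a ScorerWrapper and immediately
# calls it with the comparison-specific arguments.
comparison = ComparisonScorer(
    threshold=2,
    criteria="tone",
    description="The response should match the tone of the expected output.",
)

# GroundednessScorer registers no local implementation, so it presumably always
# resolves to the API scorer; InstructionAdherenceScorer registers both.
groundedness = GroundednessScorer(threshold=0.8)      # assumed call pattern via ScorerWrapper
adherence = InstructionAdherenceScorer(threshold=0.9)  # assumed call pattern via ScorerWrapper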
judgeval/scorers/judgeval_scorers/api_scorers/__init__.py CHANGED
@@ -8,6 +8,9 @@ from judgeval.scorers.judgeval_scorers.api_scorers.contextual_precision import C
  from judgeval.scorers.judgeval_scorers.api_scorers.contextual_recall import ContextualRecallScorer
  from judgeval.scorers.judgeval_scorers.api_scorers.answer_relevancy import AnswerRelevancyScorer
  from judgeval.scorers.judgeval_scorers.api_scorers.answer_correctness import AnswerCorrectnessScorer
+ from judgeval.scorers.judgeval_scorers.api_scorers.comparison import ComparisonScorer
+ from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import InstructionAdherenceScorer
+ from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import GroundednessScorer

  __all__ = [
      "ToolCorrectnessScorer",
@@ -20,4 +23,7 @@ __all__ = [
      "ContextualRecallScorer",
      "AnswerRelevancyScorer",
      "AnswerCorrectnessScorer",
+     "ComparisonScorer",
+     "InstructionAdherenceScorer",
+     "GroundednessScorer",
  ]
judgeval/scorers/judgeval_scorers/api_scorers/comparison.py ADDED
@@ -0,0 +1,35 @@
+ """
+ `judgeval` comparison scorer
+
+ TODO add link to docs page for this scorer
+
+ """
+
+ # Internal imports
+ from judgeval.scorers.api_scorer import APIJudgmentScorer
+ from judgeval.constants import APIScorer
+ from typing import Optional, Dict
+
+ class ComparisonScorer(APIJudgmentScorer):
+     kwargs: Optional[Dict] = None
+
+     def __init__(self, threshold: float, criteria: str, description: str):
+         super().__init__(threshold=threshold, score_type=APIScorer.COMPARISON)
+         self.kwargs = {"criteria": criteria, "description": description}
+
+     @property
+     def __name__(self):
+         return f"Comparison-{self.kwargs['criteria']}"
+
+     def to_dict(self) -> dict:
+         """
+         Converts the scorer configuration to a dictionary format.
+
+         Returns:
+             dict: A dictionary containing the scorer's configuration
+         """
+         return {
+             "score_type": self.score_type,
+             "threshold": self.threshold,
+             "kwargs": self.kwargs
+         }
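
A short sketch of what this API-side ComparisonScorer carries, using only the constructor and to_dict() defined above. The threshold of 2 assumes APIScorer.COMPARISON is in UNBOUNDED_SCORERS, which this diff implies but does not show.

from judgeval.scorers.judgeval_scorers.api_scorers.comparison import ComparisonScorer

scorer = ComparisonScorer(
    threshold=2,
    criteria="tone",
    description="The response should match the expected output's tone.",
)
print(scorer.__name__)   # "Comparison-tone"
print(scorer.to_dict())  # roughly: {"score_type": APIScorer.COMPARISON, "threshold": 2.0, "kwargs": {...}}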
judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py ADDED
@@ -0,0 +1,19 @@
+ """
+ `judgeval` Groundedness scorer
+
+ TODO add link to docs page for this scorer
+
+ """
+
+ # Internal imports
+ from judgeval.scorers.api_scorer import APIJudgmentScorer
+ from judgeval.constants import APIScorer
+
+
+ class GroundednessScorer(APIJudgmentScorer):
+     def __init__(self, threshold: float):
+         super().__init__(threshold=threshold, score_type=APIScorer.GROUNDEDNESS)
+
+     @property
+     def __name__(self):
+         return "Groundedness"
judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py ADDED
@@ -0,0 +1,19 @@
+ """
+ `judgeval` instruction adherence scorer
+
+ TODO add link to docs page for this scorer
+
+ """
+
+ # Internal imports
+ from judgeval.scorers.api_scorer import APIJudgmentScorer
+ from judgeval.constants import APIScorer
+
+
+ class InstructionAdherenceScorer(APIJudgmentScorer):
+     def __init__(self, threshold: float):
+         super().__init__(threshold=threshold, score_type=APIScorer.INSTRUCTION_ADHERENCE)
+
+     @property
+     def __name__(self):
+         return "Instruction Adherence"
judgeval/scorers/judgeval_scorers/local_implementations/__init__.py CHANGED
@@ -8,11 +8,13 @@ from judgeval.scorers.judgeval_scorers.local_implementations.tool_correctness.to
  from judgeval.scorers.judgeval_scorers.local_implementations.hallucination.hallucination_scorer import HallucinationScorer
  from judgeval.scorers.judgeval_scorers.local_implementations.summarization.summarization_scorer import SummarizationScorer
  from judgeval.scorers.judgeval_scorers.local_implementations.answer_correctness.answer_correctness_scorer import AnswerCorrectnessScorer
-
+ from judgeval.scorers.judgeval_scorers.local_implementations.comparison.comparison_scorer import ComparisonScorer
+ from judgeval.scorers.judgeval_scorers.local_implementations.instruction_adherence.instruction_adherence import InstructionAdherenceScorer

  __all__ = [
      "AnswerCorrectnessScorer",
      "AnswerRelevancyScorer",
+     "ComparisonScorer",
      "ContextualPrecisionScorer",
      "ContextualRecallScorer",
      "ContextualRelevancyScorer",
@@ -21,4 +23,5 @@ __all__ = [
      "ToolCorrectnessScorer",
      "HallucinationScorer",
      "SummarizationScorer",
+     "InstructionAdherenceScorer",
  ]
judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py CHANGED
@@ -1,5 +1,4 @@
  from typing import Optional, List, Union, Tuple
- from pydantic import BaseModel

  from judgeval.constants import APIScorer
  from judgeval.judges import JudgevalJudge
judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py CHANGED
@@ -2,8 +2,8 @@
  Util prompts for AnswerCorrectnessScorer
  """

- from typing import List, Optional, Tuple
- from pydantic import BaseModel, Field
+ from typing import List, Tuple
+ from pydantic import BaseModel


  # BaseModels to enforce formatting in LLM JSON response
judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py CHANGED
@@ -1,12 +1,13 @@
  from typing import Optional, List, Union, Tuple

  from judgeval.constants import APIScorer
- from judgeval.scorers.utils import (get_or_create_event_loop,
-                                     scorer_progress_meter,
-                                     create_verbose_logs,
-                                     parse_response_json,
-                                     check_example_params
-                                     )
+ from judgeval.scorers.utils import (
+     get_or_create_event_loop,
+     scorer_progress_meter,
+     create_verbose_logs,
+     parse_response_json,
+     check_example_params
+ )
  from judgeval.scorers import JudgevalScorer
  from judgeval.judges import JudgevalJudge
  from judgeval.judges.utils import create_judge
judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py CHANGED
@@ -2,8 +2,8 @@
  Util prompts for AnswerRelevancyScorer
  """

- from typing import List, Optional, Tuple
- from pydantic import BaseModel, Field
+ from typing import List, Tuple
+ from pydantic import BaseModel


  # BaseModels to enforce formatting in LLM JSON response
judgeval/scorers/judgeval_scorers/local_implementations/comparison/comparison_scorer.py ADDED
@@ -0,0 +1,161 @@
+ from typing import Optional, Union, List
+ from pydantic import BaseModel
+
+ from judgeval.constants import APIScorer
+ from judgeval.scorers import JudgevalScorer
+ from judgeval.judges import JudgevalJudge
+ from judgeval.judges.utils import create_judge
+ from judgeval.data import Example, ExampleParams
+ from judgeval.scorers.utils import (
+     get_or_create_event_loop,
+     scorer_progress_meter,
+     create_verbose_logs,
+     parse_response_json,
+     check_example_params
+ )
+ from .prompts import ComparisonTemplate
+
+ required_params = [
+     ExampleParams.INPUT,
+     ExampleParams.ACTUAL_OUTPUT,
+     ExampleParams.EXPECTED_OUTPUT,
+ ]
+
+ class ComparisonDifference(BaseModel):
+     actual_output_sentence: str
+     expected_output_sentence: str
+     reason: str
+
+ class ComparisonDifferences(BaseModel):
+     differences: List[ComparisonDifference]
+
+ class ComparisonScorer(JudgevalScorer):
+     def __init__(
+         self,
+         criteria: str,
+         description: str,
+         threshold: float = 1,
+         model: Optional[Union[str, JudgevalJudge]] = None,
+         include_reason: bool = True,
+         async_mode: bool = True,
+         verbose_mode: bool = False,
+     ):
+         super().__init__(
+             score_type=APIScorer.COMPARISON,
+             threshold=threshold,
+             evaluation_model=None,
+             include_reason=include_reason,
+             async_mode=async_mode,
+             verbose_mode=verbose_mode
+         )
+         self.model, self.using_native_model = create_judge(model)
+         self.evaluation_model = self.model.get_model_name()
+         self.criteria = criteria
+         self.description = description
+
+     def score_example(
+         self,
+         example: Example,
+         _show_indicator: bool = True,
+     ) -> float:
+         check_example_params(example, required_params, self)
+
+         with scorer_progress_meter(self, display_meter=_show_indicator):
+             if self.async_mode:
+                 loop = get_or_create_event_loop()
+                 loop.run_until_complete(
+                     self.a_score_example(
+                         example,
+                         _show_indicator=False
+                     )
+                 )
+             else:
+                 self.differences = self._find_differences(example)
+                 self.score = len(self.differences)
+                 self.reason = str(self.differences)
+                 self.success = self.score <= self.threshold
+                 self.verbose_logs = create_verbose_logs(
+                     self,
+                     steps=[
+                         f"Score: {self.score}\nReason: {self.reason}",
+                     ],
+                 )
+
+             return len(self.differences)
+
+     async def a_score_example(
+         self,
+         example: Example,
+         _show_indicator: bool = True
+     ) -> float:
+         check_example_params(example, required_params, self)
+
+         with scorer_progress_meter(
+             self, async_mode=True, display_meter=_show_indicator
+         ):
+             self.differences = self.a_find_differences(example)
+             self.score = len(self.differences)
+             self.reason = str(self.differences)
+             self.success = self.score <= self.threshold
+             self.verbose_logs = create_verbose_logs(
+                 self,
+                 steps=[
+                     f"Score: {self.score}\nReason: {self.reason}",
+                 ],
+             )
+
+             return self.score
+
+     def _find_differences(self, example: Example) -> float:
+         prompt = ComparisonTemplate.find_differences(
+             criteria=self.criteria,
+             description=self.description,
+             actual_output=example.actual_output,
+             expected_output=example.expected_output
+         )
+         if self.using_native_model:
+             res = self.model.generate(prompt)
+             data = parse_response_json(res, self)
+             return data["differences"]
+         else:
+             try:
+                 res: ComparisonDifferences = self.model.generate(prompt, schema=ComparisonDifferences)
+                 return res.differences
+             except TypeError:
+                 res = self.model.generate(prompt)
+                 data = parse_response_json(res, self)
+                 return data["differences"]
+
+     async def a_find_differences(self, example: Example) -> float:
+         prompt = ComparisonTemplate.find_differences(
+             criteria=self.criteria,
+             description=self.description,
+             actual_output=example.actual_output,
+             expected_output=example.expected_output
+         )
+         if self.using_native_model:
+             res = await self.model.a_generate(prompt)
+             data = parse_response_json(res, self)
+             return data["differences"]
+         else:
+             try:
+                 res: ComparisonDifferences = await self.model.a_generate(prompt, schema=ComparisonDifferences)
+                 return res.differences
+             except TypeError:
+                 res = await self.model.a_generate(prompt)
+                 data = parse_response_json(res, self)
+                 return data["differences"]
+
+     def _success_check(self) -> bool:
+         if self.error is not None:
+             self.success = False
+         else:
+             try:
+                 self.success = self.score <= self.threshold
+             except:
+                 self.success = False
+         return self.success
+
+     @property
+     def __name__(self):
+         return f"Comparison - {self.criteria}"