judgeval 0.0.11__py3-none-any.whl → 0.22.2__py3-none-any.whl

This diff shows the differences between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Potentially problematic release.


This version of judgeval might be problematic. Click here for more details.

Files changed (171)
  1. judgeval/__init__.py +177 -12
  2. judgeval/api/__init__.py +519 -0
  3. judgeval/api/api_types.py +407 -0
  4. judgeval/cli.py +79 -0
  5. judgeval/constants.py +76 -47
  6. judgeval/data/__init__.py +3 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +15 -56
  9. judgeval/data/judgment_types.py +450 -0
  10. judgeval/data/result.py +29 -73
  11. judgeval/data/scorer_data.py +29 -62
  12. judgeval/data/scripts/fix_default_factory.py +23 -0
  13. judgeval/data/scripts/openapi_transform.py +123 -0
  14. judgeval/data/trace.py +121 -0
  15. judgeval/dataset/__init__.py +264 -0
  16. judgeval/env.py +52 -0
  17. judgeval/evaluation/__init__.py +344 -0
  18. judgeval/exceptions.py +27 -0
  19. judgeval/integrations/langgraph/__init__.py +13 -0
  20. judgeval/integrations/openlit/__init__.py +50 -0
  21. judgeval/judges/__init__.py +2 -3
  22. judgeval/judges/base_judge.py +2 -3
  23. judgeval/judges/litellm_judge.py +100 -20
  24. judgeval/judges/together_judge.py +101 -20
  25. judgeval/judges/utils.py +20 -24
  26. judgeval/logger.py +62 -0
  27. judgeval/prompt/__init__.py +330 -0
  28. judgeval/scorers/__init__.py +18 -25
  29. judgeval/scorers/agent_scorer.py +17 -0
  30. judgeval/scorers/api_scorer.py +45 -41
  31. judgeval/scorers/base_scorer.py +83 -38
  32. judgeval/scorers/example_scorer.py +17 -0
  33. judgeval/scorers/exceptions.py +1 -0
  34. judgeval/scorers/judgeval_scorers/__init__.py +0 -148
  35. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +19 -17
  36. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +13 -19
  37. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +12 -19
  38. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +13 -19
  39. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +15 -0
  40. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +327 -0
  41. judgeval/scorers/score.py +77 -306
  42. judgeval/scorers/utils.py +4 -199
  43. judgeval/tracer/__init__.py +1122 -2
  44. judgeval/tracer/constants.py +1 -0
  45. judgeval/tracer/exporters/__init__.py +40 -0
  46. judgeval/tracer/exporters/s3.py +119 -0
  47. judgeval/tracer/exporters/store.py +59 -0
  48. judgeval/tracer/exporters/utils.py +32 -0
  49. judgeval/tracer/keys.py +63 -0
  50. judgeval/tracer/llm/__init__.py +7 -0
  51. judgeval/tracer/llm/config.py +78 -0
  52. judgeval/tracer/llm/constants.py +9 -0
  53. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  54. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  55. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  56. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  57. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  58. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  59. judgeval/tracer/llm/llm_google/config.py +6 -0
  60. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  61. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  62. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  63. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  64. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  65. judgeval/tracer/llm/llm_openai/config.py +6 -0
  66. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  67. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  68. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  69. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  70. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  71. judgeval/tracer/llm/llm_together/config.py +6 -0
  72. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  73. judgeval/tracer/llm/providers.py +19 -0
  74. judgeval/tracer/managers.py +167 -0
  75. judgeval/tracer/processors/__init__.py +220 -0
  76. judgeval/tracer/utils.py +19 -0
  77. judgeval/trainer/__init__.py +14 -0
  78. judgeval/trainer/base_trainer.py +122 -0
  79. judgeval/trainer/config.py +128 -0
  80. judgeval/trainer/console.py +144 -0
  81. judgeval/trainer/fireworks_trainer.py +396 -0
  82. judgeval/trainer/trainable_model.py +243 -0
  83. judgeval/trainer/trainer.py +70 -0
  84. judgeval/utils/async_utils.py +39 -0
  85. judgeval/utils/decorators/__init__.py +0 -0
  86. judgeval/utils/decorators/dont_throw.py +37 -0
  87. judgeval/utils/decorators/use_once.py +13 -0
  88. judgeval/utils/file_utils.py +97 -0
  89. judgeval/utils/guards.py +36 -0
  90. judgeval/utils/meta.py +27 -0
  91. judgeval/utils/project.py +15 -0
  92. judgeval/utils/serialize.py +253 -0
  93. judgeval/utils/testing.py +70 -0
  94. judgeval/utils/url.py +10 -0
  95. judgeval/utils/version_check.py +28 -0
  96. judgeval/utils/wrappers/README.md +3 -0
  97. judgeval/utils/wrappers/__init__.py +15 -0
  98. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  99. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  100. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  101. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  102. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  103. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  104. judgeval/utils/wrappers/py.typed +0 -0
  105. judgeval/utils/wrappers/utils.py +35 -0
  106. judgeval/version.py +5 -0
  107. judgeval/warnings.py +4 -0
  108. judgeval-0.22.2.dist-info/METADATA +265 -0
  109. judgeval-0.22.2.dist-info/RECORD +112 -0
  110. judgeval-0.22.2.dist-info/entry_points.txt +2 -0
  111. judgeval/clients.py +0 -39
  112. judgeval/common/__init__.py +0 -8
  113. judgeval/common/exceptions.py +0 -28
  114. judgeval/common/logger.py +0 -189
  115. judgeval/common/tracer.py +0 -798
  116. judgeval/common/utils.py +0 -763
  117. judgeval/data/api_example.py +0 -111
  118. judgeval/data/datasets/__init__.py +0 -5
  119. judgeval/data/datasets/dataset.py +0 -286
  120. judgeval/data/datasets/eval_dataset_client.py +0 -193
  121. judgeval/data/datasets/ground_truth.py +0 -54
  122. judgeval/data/datasets/utils.py +0 -74
  123. judgeval/evaluation_run.py +0 -132
  124. judgeval/judges/mixture_of_judges.py +0 -248
  125. judgeval/judgment_client.py +0 -354
  126. judgeval/run_evaluation.py +0 -439
  127. judgeval/scorers/judgeval_scorer.py +0 -140
  128. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
  129. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
  130. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -22
  131. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -19
  132. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -32
  133. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
  134. judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
  135. judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
  136. judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
  137. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -54
  138. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -24
  139. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
  140. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -277
  141. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
  142. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
  143. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
  144. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
  145. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
  146. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
  147. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
  148. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
  149. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
  150. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
  151. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
  152. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
  153. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
  154. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
  155. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -325
  156. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
  157. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
  158. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -263
  159. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
  160. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
  161. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
  162. judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
  163. judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
  164. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -550
  165. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
  166. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -157
  167. judgeval/scorers/prompt_scorer.py +0 -439
  168. judgeval-0.0.11.dist-info/METADATA +0 -36
  169. judgeval-0.0.11.dist-info/RECORD +0 -84
  170. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
  171. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
@@ -1,148 +0,0 @@
1
- from typing import Type, Optional, Any
2
- from functools import wraps
3
-
4
- # Import implementations
5
- from judgeval.scorers.judgeval_scorers.api_scorers import (
6
- ToolCorrectnessScorer as APIToolCorrectnessScorer,
7
- JSONCorrectnessScorer as APIJSONCorrectnessScorer,
8
- SummarizationScorer as APISummarizationScorer,
9
- HallucinationScorer as APIHallucinationScorer,
10
- FaithfulnessScorer as APIFaithfulnessScorer,
11
- ContextualRelevancyScorer as APIContextualRelevancyScorer,
12
- ContextualPrecisionScorer as APIContextualPrecisionScorer,
13
- ContextualRecallScorer as APIContextualRecallScorer,
14
- AnswerRelevancyScorer as APIAnswerRelevancyScorer,
15
- AnswerCorrectnessScorer as APIAnswerCorrectnessScorer,
16
- )
17
-
18
- from judgeval.scorers.judgeval_scorers.local_implementations import (
19
- AnswerRelevancyScorer as LocalAnswerRelevancyScorer,
20
- ContextualPrecisionScorer as LocalContextualPrecisionScorer,
21
- ContextualRecallScorer as LocalContextualRecallScorer,
22
- ContextualRelevancyScorer as LocalContextualRelevancyScorer,
23
- FaithfulnessScorer as LocalFaithfulnessScorer,
24
- JsonCorrectnessScorer as LocalJsonCorrectnessScorer,
25
- ToolCorrectnessScorer as LocalToolCorrectnessScorer,
26
- HallucinationScorer as LocalHallucinationScorer,
27
- SummarizationScorer as LocalSummarizationScorer,
28
- AnswerCorrectnessScorer as LocalAnswerCorrectnessScorer
29
- )
30
-
31
- from judgeval.scorers.judgeval_scorers.classifiers import Text2SQLScorer
32
-
33
-
34
- class ScorerWrapper:
35
- """
36
- Wrapper class that can dynamically load either API or local implementation of a scorer.
37
- """
38
- def __init__(self, api_implementation: Type, local_implementation: Optional[Type] = None):
39
- self.api_implementation = api_implementation
40
- self.local_implementation = local_implementation
41
- self._instance = None
42
- self._init_args = None
43
- self._init_kwargs = None
44
-
45
- def __call__(self, *args, **kwargs):
46
- """Store initialization arguments for later use when implementation is loaded"""
47
- self._init_args = args
48
- self._init_kwargs = kwargs
49
- return self
50
-
51
- def load_implementation(self, use_judgment: bool = True) -> Any:
52
- """
53
- Load the appropriate implementation based on the use_judgment flag.
54
-
55
- Args:
56
- use_judgment (bool): If True, use API implementation. If False, use local implementation.
57
-
58
- Returns:
59
- Instance of the appropriate implementation
60
-
61
- Raises:
62
- ValueError: If local implementation is requested but not available
63
- """
64
- if self._instance is not None:
65
- return self._instance
66
-
67
- if use_judgment:
68
- implementation = self.api_implementation
69
- else:
70
- if self.local_implementation is None:
71
- raise ValueError("No local implementation available for this scorer")
72
- implementation = self.local_implementation
73
-
74
- args = self._init_args or ()
75
- kwargs = self._init_kwargs or {}
76
- self._instance = implementation(*args, **kwargs)
77
- return self._instance
78
-
79
- def __getattr__(self, name):
80
- """Defer all attribute access to the loaded implementation"""
81
- if self._instance is None:
82
- raise RuntimeError("Implementation not loaded. Call load_implementation() first")
83
- return getattr(self._instance, name)
84
-
85
- # Create wrapped versions of all scorers
86
-
87
- AnswerCorrectnessScorer = ScorerWrapper(
88
- api_implementation=APIAnswerCorrectnessScorer,
89
- local_implementation=LocalAnswerCorrectnessScorer
90
- )
91
-
92
- AnswerRelevancyScorer = ScorerWrapper(
93
- api_implementation=APIAnswerRelevancyScorer,
94
- local_implementation=LocalAnswerRelevancyScorer
95
- )
96
-
97
- ToolCorrectnessScorer = ScorerWrapper(
98
- api_implementation=APIToolCorrectnessScorer,
99
- local_implementation=LocalToolCorrectnessScorer
100
- )
101
-
102
- JSONCorrectnessScorer = ScorerWrapper(
103
- api_implementation=APIJSONCorrectnessScorer,
104
- local_implementation=LocalJsonCorrectnessScorer
105
- )
106
-
107
- SummarizationScorer = ScorerWrapper(
108
- api_implementation=APISummarizationScorer,
109
- local_implementation=LocalSummarizationScorer
110
- )
111
-
112
- HallucinationScorer = ScorerWrapper(
113
- api_implementation=APIHallucinationScorer,
114
- local_implementation=LocalHallucinationScorer
115
- )
116
-
117
- FaithfulnessScorer = ScorerWrapper(
118
- api_implementation=APIFaithfulnessScorer,
119
- local_implementation=LocalFaithfulnessScorer
120
- )
121
-
122
- ContextualRelevancyScorer = ScorerWrapper(
123
- api_implementation=APIContextualRelevancyScorer,
124
- local_implementation=LocalContextualRelevancyScorer
125
- )
126
-
127
- ContextualPrecisionScorer = ScorerWrapper(
128
- api_implementation=APIContextualPrecisionScorer,
129
- local_implementation=LocalContextualPrecisionScorer
130
- )
131
-
132
- ContextualRecallScorer = ScorerWrapper(
133
- api_implementation=APIContextualRecallScorer,
134
- local_implementation=LocalContextualRecallScorer
135
- )
136
-
137
- __all__ = [
138
- "ToolCorrectnessScorer",
139
- "JSONCorrectnessScorer",
140
- "SummarizationScorer",
141
- "HallucinationScorer",
142
- "FaithfulnessScorer",
143
- "ContextualRelevancyScorer",
144
- "ContextualPrecisionScorer",
145
- "ContextualRecallScorer",
146
- "AnswerRelevancyScorer",
147
- "Text2SQLScorer",
148
- ]
@@ -1,23 +1,25 @@
1
- from judgeval.scorers.judgeval_scorers.api_scorers.tool_correctness import ToolCorrectnessScorer
2
- from judgeval.scorers.judgeval_scorers.api_scorers.json_correctness import JSONCorrectnessScorer
3
- from judgeval.scorers.judgeval_scorers.api_scorers.summarization import SummarizationScorer
4
- from judgeval.scorers.judgeval_scorers.api_scorers.hallucination import HallucinationScorer
5
- from judgeval.scorers.judgeval_scorers.api_scorers.faithfulness import FaithfulnessScorer
6
- from judgeval.scorers.judgeval_scorers.api_scorers.contextual_relevancy import ContextualRelevancyScorer
7
- from judgeval.scorers.judgeval_scorers.api_scorers.contextual_precision import ContextualPrecisionScorer
8
- from judgeval.scorers.judgeval_scorers.api_scorers.contextual_recall import ContextualRecallScorer
9
- from judgeval.scorers.judgeval_scorers.api_scorers.answer_relevancy import AnswerRelevancyScorer
10
- from judgeval.scorers.judgeval_scorers.api_scorers.answer_correctness import AnswerCorrectnessScorer
1
+ from judgeval.scorers.judgeval_scorers.api_scorers.faithfulness import (
2
+ FaithfulnessScorer,
3
+ )
4
+ from judgeval.scorers.judgeval_scorers.api_scorers.answer_relevancy import (
5
+ AnswerRelevancyScorer,
6
+ )
7
+ from judgeval.scorers.judgeval_scorers.api_scorers.answer_correctness import (
8
+ AnswerCorrectnessScorer,
9
+ )
10
+ from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import (
11
+ InstructionAdherenceScorer,
12
+ )
13
+ from judgeval.scorers.judgeval_scorers.api_scorers.prompt_scorer import (
14
+ TracePromptScorer,
15
+ PromptScorer,
16
+ )
11
17
 
12
18
  __all__ = [
13
- "ToolCorrectnessScorer",
14
- "JSONCorrectnessScorer",
15
- "SummarizationScorer",
16
- "HallucinationScorer",
17
19
  "FaithfulnessScorer",
18
- "ContextualRelevancyScorer",
19
- "ContextualPrecisionScorer",
20
- "ContextualRecallScorer",
21
20
  "AnswerRelevancyScorer",
22
21
  "AnswerCorrectnessScorer",
22
+ "InstructionAdherenceScorer",
23
+ "TracePromptScorer",
24
+ "PromptScorer",
23
25
  ]
@@ -1,19 +1,13 @@
1
- """
2
- `judgeval` answer relevancy scorer
3
-
4
- TODO add link to docs page for this scorer
5
-
6
- """
7
-
8
- # Internal imports
9
- from judgeval.scorers.api_scorer import APIJudgmentScorer
10
- from judgeval.constants import APIScorer
11
-
12
-
13
- class AnswerCorrectnessScorer(APIJudgmentScorer):
14
- def __init__(self, threshold: float):
15
- super().__init__(threshold=threshold, score_type=APIScorer.ANSWER_CORRECTNESS)
16
-
17
- @property
18
- def __name__(self):
19
- return "Answer Correctness"
1
+ from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
2
+ from judgeval.constants import APIScorerType
3
+ from judgeval.data import ExampleParams
4
+ from typing import List
5
+
6
+
7
+ class AnswerCorrectnessScorer(ExampleAPIScorerConfig):
8
+ score_type: APIScorerType = APIScorerType.ANSWER_CORRECTNESS
9
+ required_params: List[ExampleParams] = [
10
+ ExampleParams.INPUT,
11
+ ExampleParams.ACTUAL_OUTPUT,
12
+ ExampleParams.EXPECTED_OUTPUT,
13
+ ]
@@ -1,19 +1,12 @@
1
- """
2
- `judgeval` answer relevancy scorer
3
-
4
- TODO add link to docs page for this scorer
5
-
6
- """
7
-
8
- # Internal imports
9
- from judgeval.scorers.api_scorer import APIJudgmentScorer
10
- from judgeval.constants import APIScorer
11
-
12
-
13
- class AnswerRelevancyScorer(APIJudgmentScorer):
14
- def __init__(self, threshold: float):
15
- super().__init__(threshold=threshold, score_type=APIScorer.ANSWER_RELEVANCY)
16
-
17
- @property
18
- def __name__(self):
19
- return "Answer Relevancy"
1
+ from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
2
+ from judgeval.constants import APIScorerType
3
+ from judgeval.data import ExampleParams
4
+ from typing import List
5
+
6
+
7
+ class AnswerRelevancyScorer(ExampleAPIScorerConfig):
8
+ score_type: APIScorerType = APIScorerType.ANSWER_RELEVANCY
9
+ required_params: List[ExampleParams] = [
10
+ ExampleParams.INPUT,
11
+ ExampleParams.ACTUAL_OUTPUT,
12
+ ]
@@ -1,19 +1,13 @@
1
- """
2
- `judgeval` faithfulness scorer
3
-
4
- TODO add link to docs page for this scorer
5
-
6
- """
7
-
8
- # Internal imports
9
- from judgeval.scorers.api_scorer import APIJudgmentScorer
10
- from judgeval.constants import APIScorer
11
-
12
-
13
- class FaithfulnessScorer(APIJudgmentScorer):
14
- def __init__(self, threshold: float):
15
- super().__init__(threshold=threshold, score_type=APIScorer.FAITHFULNESS)
16
-
17
- @property
18
- def __name__(self):
19
- return "Faithfulness"
1
+ from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
2
+ from judgeval.constants import APIScorerType
3
+ from judgeval.data import ExampleParams
4
+ from typing import List
5
+
6
+
7
+ class FaithfulnessScorer(ExampleAPIScorerConfig):
8
+ score_type: APIScorerType = APIScorerType.FAITHFULNESS
9
+ required_params: List[ExampleParams] = [
10
+ ExampleParams.INPUT,
11
+ ExampleParams.ACTUAL_OUTPUT,
12
+ ExampleParams.RETRIEVAL_CONTEXT,
13
+ ]
@@ -0,0 +1,15 @@
1
+ from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
2
+ from judgeval.constants import APIScorerType
3
+ from judgeval.data import ExampleParams
4
+
5
+
6
+ class InstructionAdherenceScorer(ExampleAPIScorerConfig):
7
+ def __init__(self, threshold: float):
8
+ super().__init__(
9
+ threshold=threshold,
10
+ score_type=APIScorerType.INSTRUCTION_ADHERENCE,
11
+ required_params=[
12
+ ExampleParams.INPUT,
13
+ ExampleParams.ACTUAL_OUTPUT,
14
+ ],
15
+ )
@@ -0,0 +1,327 @@
1
+ from judgeval.scorers.api_scorer import (
2
+ APIScorerConfig,
3
+ ExampleAPIScorerConfig,
4
+ TraceAPIScorerConfig,
5
+ )
6
+ from judgeval.constants import APIScorerType
7
+ from typing import Dict, Any, Optional
8
+ from judgeval.api import JudgmentSyncClient
9
+ from judgeval.exceptions import JudgmentAPIError
10
+ import os
11
+ from judgeval.logger import judgeval_logger
12
+ from abc import ABC
13
+ from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
14
+ from copy import copy
15
+ from judgeval.utils.decorators.dont_throw import dont_throw
16
+
17
+
18
+ def push_prompt_scorer(
19
+ name: str,
20
+ prompt: str,
21
+ threshold: float,
22
+ options: Optional[Dict[str, float]] = None,
23
+ model: str = JUDGMENT_DEFAULT_GPT_MODEL,
24
+ description: Optional[str] = None,
25
+ judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
26
+ organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
27
+ is_trace: bool = False,
28
+ ) -> str:
29
+ client = JudgmentSyncClient(judgment_api_key, organization_id)
30
+ try:
31
+ r = client.save_scorer(
32
+ payload={
33
+ "name": name,
34
+ "prompt": prompt,
35
+ "threshold": threshold,
36
+ "options": options,
37
+ "model": model,
38
+ "description": description,
39
+ "is_trace": is_trace,
40
+ }
41
+ )
42
+ except JudgmentAPIError as e:
43
+ raise JudgmentAPIError(
44
+ status_code=e.status_code,
45
+ detail=f"Failed to save prompt scorer: {e.detail}",
46
+ response=e.response,
47
+ )
48
+ return r["scorer_response"]["name"]
49
+
50
+
51
+ def fetch_prompt_scorer(
52
+ name: str,
53
+ judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
54
+ organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
55
+ ):
56
+ client = JudgmentSyncClient(judgment_api_key, organization_id)
57
+ try:
58
+ fetched_scorers = client.fetch_scorers({"names": [name]})
59
+ if len(fetched_scorers["scorers"]) == 0:
60
+ judgeval_logger.error(f"Prompt scorer '{name}' not found")
61
+ raise JudgmentAPIError(
62
+ status_code=404,
63
+ detail=f"Prompt scorer '{name}' not found",
64
+ response=None, # type: ignore
65
+ )
66
+ else:
67
+ scorer_config = fetched_scorers["scorers"][0]
68
+ scorer_config.pop("created_at")
69
+ scorer_config.pop("updated_at")
70
+ return scorer_config
71
+ except JudgmentAPIError as e:
72
+ raise JudgmentAPIError(
73
+ status_code=e.status_code,
74
+ detail=f"Failed to fetch prompt scorer '{name}': {e.detail}",
75
+ response=e.response,
76
+ )
77
+
78
+
79
+ def scorer_exists(
80
+ name: str,
81
+ judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
82
+ organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
83
+ ):
84
+ client = JudgmentSyncClient(judgment_api_key, organization_id)
85
+ try:
86
+ return client.scorer_exists({"name": name})["exists"]
87
+ except JudgmentAPIError as e:
88
+ if e.status_code == 500:
89
+ raise JudgmentAPIError(
90
+ status_code=e.status_code,
91
+ detail=f"The server is temporarily unavailable. Please try your request again in a few moments. Error details: {e.detail}",
92
+ response=e.response,
93
+ )
94
+ raise JudgmentAPIError(
95
+ status_code=e.status_code,
96
+ detail=f"Failed to check if scorer exists: {e.detail}",
97
+ response=e.response,
98
+ )
99
+
100
+
101
+ class BasePromptScorer(ABC, APIScorerConfig):
102
+ score_type: APIScorerType
103
+ prompt: str
104
+ options: Optional[Dict[str, float]] = None
105
+ description: Optional[str] = None
106
+ judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or ""
107
+ organization_id: str = os.getenv("JUDGMENT_ORG_ID") or ""
108
+
109
+ @classmethod
110
+ @dont_throw
111
+ def get(
112
+ cls,
113
+ name: str,
114
+ judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
115
+ organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
116
+ ):
117
+ scorer_config = fetch_prompt_scorer(name, judgment_api_key, organization_id)
118
+ if scorer_config["is_trace"] != issubclass(cls, TracePromptScorer):
119
+ raise JudgmentAPIError(
120
+ status_code=400,
121
+ detail=f"Scorer with name {name} is not a {cls.__name__}",
122
+ response=None, # type: ignore
123
+ )
124
+ if issubclass(cls, TracePromptScorer):
125
+ score_type = APIScorerType.TRACE_PROMPT_SCORER
126
+ else:
127
+ score_type = APIScorerType.PROMPT_SCORER
128
+ return cls(
129
+ score_type=score_type,
130
+ name=name,
131
+ prompt=scorer_config["prompt"],
132
+ threshold=scorer_config["threshold"],
133
+ options=scorer_config.get("options"),
134
+ model=scorer_config.get("model"),
135
+ description=scorer_config.get("description"),
136
+ judgment_api_key=judgment_api_key,
137
+ organization_id=organization_id,
138
+ )
139
+
140
+ @classmethod
141
+ def create(
142
+ cls,
143
+ name: str,
144
+ prompt: str,
145
+ threshold: float = 0.5,
146
+ options: Optional[Dict[str, float]] = None,
147
+ model: str = JUDGMENT_DEFAULT_GPT_MODEL,
148
+ description: Optional[str] = None,
149
+ judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
150
+ organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
151
+ ):
152
+ if not scorer_exists(name, judgment_api_key, organization_id):
153
+ if issubclass(cls, TracePromptScorer):
154
+ is_trace = True
155
+ score_type = APIScorerType.TRACE_PROMPT_SCORER
156
+ else:
157
+ is_trace = False
158
+ score_type = APIScorerType.PROMPT_SCORER
159
+ push_prompt_scorer(
160
+ name,
161
+ prompt,
162
+ threshold,
163
+ options,
164
+ model,
165
+ description,
166
+ judgment_api_key,
167
+ organization_id,
168
+ is_trace,
169
+ )
170
+ judgeval_logger.info(f"Successfully created PromptScorer: {name}")
171
+ return cls(
172
+ score_type=score_type,
173
+ name=name,
174
+ prompt=prompt,
175
+ threshold=threshold,
176
+ options=options,
177
+ model=model,
178
+ description=description,
179
+ judgment_api_key=judgment_api_key,
180
+ organization_id=organization_id,
181
+ )
182
+ else:
183
+ raise JudgmentAPIError(
184
+ status_code=400,
185
+ detail=f"Scorer with name {name} already exists. Either use the existing scorer with the get() method or use a new name.",
186
+ response=None, # type: ignore
187
+ )
188
+
189
+ # Setter functions. Each setter function pushes the scorer to the DB.
190
+ def set_threshold(self, threshold: float):
191
+ """
192
+ Updates the threshold of the scorer.
193
+ """
194
+ self.threshold = threshold
195
+ self.push_prompt_scorer()
196
+
197
+ def set_prompt(self, prompt: str):
198
+ """
199
+ Updates the prompt with the new prompt.
200
+
201
+ Sample prompt:
202
+ "Did the chatbot answer the user's question in a kind way?"
203
+ """
204
+ self.prompt = prompt
205
+ self.push_prompt_scorer()
206
+ judgeval_logger.info(f"Successfully updated prompt for {self.name}")
207
+
208
+ def set_model(self, model: str):
209
+ """
210
+ Updates the model of the scorer.
211
+ """
212
+ self.model = model
213
+ self.push_prompt_scorer()
214
+ judgeval_logger.info(f"Successfully updated model for {self.name}")
215
+
216
+ def set_options(self, options: Optional[Dict[str, float]]):
217
+ """
218
+ Updates the options of the scorer.
219
+ """
220
+ self.options = options
221
+ self.push_prompt_scorer()
222
+ judgeval_logger.info(f"Successfully updated options for {self.name}")
223
+
224
+ def set_description(self, description: Optional[str]):
225
+ """
226
+ Updates the description of the scorer.
227
+ """
228
+ self.description = description
229
+ self.push_prompt_scorer()
230
+ judgeval_logger.info(f"Successfully updated description for {self.name}")
231
+
232
+ def append_to_prompt(self, prompt_addition: str):
233
+ """
234
+ Appends a string to the prompt.
235
+ """
236
+ self.prompt += prompt_addition
237
+ self.push_prompt_scorer()
238
+ judgeval_logger.info(f"Successfully appended to prompt for {self.name}")
239
+
240
+ # Getters
241
+ def get_threshold(self) -> float:
242
+ """
243
+ Returns the threshold of the scorer.
244
+ """
245
+ return self.threshold
246
+
247
+ def get_prompt(self) -> str:
248
+ """
249
+ Returns the prompt of the scorer.
250
+ """
251
+ return self.prompt
252
+
253
+ def get_model(self) -> str:
254
+ """
255
+ Returns the model of the scorer.
256
+ """
257
+ return self.model
258
+
259
+ def get_options(self) -> Dict[str, float] | None:
260
+ """
261
+ Returns the options of the scorer.
262
+ """
263
+ return copy(self.options) if self.options is not None else None
264
+
265
+ def get_description(self) -> str | None:
266
+ """
267
+ Returns the description of the scorer.
268
+ """
269
+ return self.description
270
+
271
+ def get_name(self) -> str:
272
+ """
273
+ Returns the name of the scorer.
274
+ """
275
+ return self.name
276
+
277
+ def get_config(self) -> dict:
278
+ """
279
+ Returns a dictionary with all the fields in the scorer.
280
+ """
281
+ return {
282
+ "name": self.name,
283
+ "model": self.model,
284
+ "prompt": self.prompt,
285
+ "threshold": self.threshold,
286
+ "options": self.options,
287
+ "description": self.description,
288
+ }
289
+
290
+ def push_prompt_scorer(self):
291
+ """
292
+ Pushes the scorer to the DB.
293
+ """
294
+ push_prompt_scorer(
295
+ self.name,
296
+ self.prompt,
297
+ self.threshold,
298
+ self.options,
299
+ self.model,
300
+ self.description,
301
+ self.judgment_api_key,
302
+ self.organization_id,
303
+ isinstance(self, TracePromptScorer),
304
+ )
305
+
306
+ def __str__(self):
307
+ return f"PromptScorer(name={self.name}, model={self.model}, prompt={self.prompt}, threshold={self.threshold}, options={self.options}, description={self.description})"
308
+
309
+ def model_dump(self, *args, **kwargs) -> Dict[str, Any]:
310
+ base = super().model_dump(*args, **kwargs)
311
+ base_fields = set(APIScorerConfig.model_fields.keys())
312
+ all_fields = set(self.__class__.model_fields.keys())
313
+
314
+ extra_fields = all_fields - base_fields - {"kwargs"}
315
+
316
+ base["kwargs"] = {
317
+ k: getattr(self, k) for k in extra_fields if getattr(self, k) is not None
318
+ }
319
+ return base
320
+
321
+
322
+ class PromptScorer(BasePromptScorer, ExampleAPIScorerConfig):
323
+ pass
324
+
325
+
326
+ class TracePromptScorer(BasePromptScorer, TraceAPIScorerConfig):
327
+ pass