judgeval 0.0.11__py3-none-any.whl → 0.22.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of judgeval might be problematic.
- judgeval/__init__.py +177 -12
- judgeval/api/__init__.py +519 -0
- judgeval/api/api_types.py +407 -0
- judgeval/cli.py +79 -0
- judgeval/constants.py +76 -47
- judgeval/data/__init__.py +3 -3
- judgeval/data/evaluation_run.py +125 -0
- judgeval/data/example.py +15 -56
- judgeval/data/judgment_types.py +450 -0
- judgeval/data/result.py +29 -73
- judgeval/data/scorer_data.py +29 -62
- judgeval/data/scripts/fix_default_factory.py +23 -0
- judgeval/data/scripts/openapi_transform.py +123 -0
- judgeval/data/trace.py +121 -0
- judgeval/dataset/__init__.py +264 -0
- judgeval/env.py +52 -0
- judgeval/evaluation/__init__.py +344 -0
- judgeval/exceptions.py +27 -0
- judgeval/integrations/langgraph/__init__.py +13 -0
- judgeval/integrations/openlit/__init__.py +50 -0
- judgeval/judges/__init__.py +2 -3
- judgeval/judges/base_judge.py +2 -3
- judgeval/judges/litellm_judge.py +100 -20
- judgeval/judges/together_judge.py +101 -20
- judgeval/judges/utils.py +20 -24
- judgeval/logger.py +62 -0
- judgeval/prompt/__init__.py +330 -0
- judgeval/scorers/__init__.py +18 -25
- judgeval/scorers/agent_scorer.py +17 -0
- judgeval/scorers/api_scorer.py +45 -41
- judgeval/scorers/base_scorer.py +83 -38
- judgeval/scorers/example_scorer.py +17 -0
- judgeval/scorers/exceptions.py +1 -0
- judgeval/scorers/judgeval_scorers/__init__.py +0 -148
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +19 -17
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +13 -19
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +12 -19
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +13 -19
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +15 -0
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +327 -0
- judgeval/scorers/score.py +77 -306
- judgeval/scorers/utils.py +4 -199
- judgeval/tracer/__init__.py +1122 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +40 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +59 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +63 -0
- judgeval/tracer/llm/__init__.py +7 -0
- judgeval/tracer/llm/config.py +78 -0
- judgeval/tracer/llm/constants.py +9 -0
- judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
- judgeval/tracer/llm/llm_anthropic/config.py +6 -0
- judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
- judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
- judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
- judgeval/tracer/llm/llm_google/__init__.py +3 -0
- judgeval/tracer/llm/llm_google/config.py +6 -0
- judgeval/tracer/llm/llm_google/generate_content.py +127 -0
- judgeval/tracer/llm/llm_google/wrapper.py +30 -0
- judgeval/tracer/llm/llm_openai/__init__.py +3 -0
- judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
- judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
- judgeval/tracer/llm/llm_openai/config.py +6 -0
- judgeval/tracer/llm/llm_openai/responses.py +506 -0
- judgeval/tracer/llm/llm_openai/utils.py +42 -0
- judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
- judgeval/tracer/llm/llm_together/__init__.py +3 -0
- judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
- judgeval/tracer/llm/llm_together/config.py +6 -0
- judgeval/tracer/llm/llm_together/wrapper.py +52 -0
- judgeval/tracer/llm/providers.py +19 -0
- judgeval/tracer/managers.py +167 -0
- judgeval/tracer/processors/__init__.py +220 -0
- judgeval/tracer/utils.py +19 -0
- judgeval/trainer/__init__.py +14 -0
- judgeval/trainer/base_trainer.py +122 -0
- judgeval/trainer/config.py +128 -0
- judgeval/trainer/console.py +144 -0
- judgeval/trainer/fireworks_trainer.py +396 -0
- judgeval/trainer/trainable_model.py +243 -0
- judgeval/trainer/trainer.py +70 -0
- judgeval/utils/async_utils.py +39 -0
- judgeval/utils/decorators/__init__.py +0 -0
- judgeval/utils/decorators/dont_throw.py +37 -0
- judgeval/utils/decorators/use_once.py +13 -0
- judgeval/utils/file_utils.py +97 -0
- judgeval/utils/guards.py +36 -0
- judgeval/utils/meta.py +27 -0
- judgeval/utils/project.py +15 -0
- judgeval/utils/serialize.py +253 -0
- judgeval/utils/testing.py +70 -0
- judgeval/utils/url.py +10 -0
- judgeval/utils/version_check.py +28 -0
- judgeval/utils/wrappers/README.md +3 -0
- judgeval/utils/wrappers/__init__.py +15 -0
- judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
- judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
- judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
- judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
- judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
- judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
- judgeval/utils/wrappers/py.typed +0 -0
- judgeval/utils/wrappers/utils.py +35 -0
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- judgeval-0.22.2.dist-info/METADATA +265 -0
- judgeval-0.22.2.dist-info/RECORD +112 -0
- judgeval-0.22.2.dist-info/entry_points.txt +2 -0
- judgeval/clients.py +0 -39
- judgeval/common/__init__.py +0 -8
- judgeval/common/exceptions.py +0 -28
- judgeval/common/logger.py +0 -189
- judgeval/common/tracer.py +0 -798
- judgeval/common/utils.py +0 -763
- judgeval/data/api_example.py +0 -111
- judgeval/data/datasets/__init__.py +0 -5
- judgeval/data/datasets/dataset.py +0 -286
- judgeval/data/datasets/eval_dataset_client.py +0 -193
- judgeval/data/datasets/ground_truth.py +0 -54
- judgeval/data/datasets/utils.py +0 -74
- judgeval/evaluation_run.py +0 -132
- judgeval/judges/mixture_of_judges.py +0 -248
- judgeval/judgment_client.py +0 -354
- judgeval/run_evaluation.py +0 -439
- judgeval/scorers/judgeval_scorer.py +0 -140
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -22
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -32
- judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
- judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -54
- judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -24
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -277
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -325
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -263
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -550
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -157
- judgeval/scorers/prompt_scorer.py +0 -439
- judgeval-0.0.11.dist-info/METADATA +0 -36
- judgeval-0.0.11.dist-info/RECORD +0 -84
- {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
- {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
judgeval/scorers/judgeval_scorers/__init__.py
@@ -1,148 +0,0 @@
-from typing import Type, Optional, Any
-from functools import wraps
-
-# Import implementations
-from judgeval.scorers.judgeval_scorers.api_scorers import (
-    ToolCorrectnessScorer as APIToolCorrectnessScorer,
-    JSONCorrectnessScorer as APIJSONCorrectnessScorer,
-    SummarizationScorer as APISummarizationScorer,
-    HallucinationScorer as APIHallucinationScorer,
-    FaithfulnessScorer as APIFaithfulnessScorer,
-    ContextualRelevancyScorer as APIContextualRelevancyScorer,
-    ContextualPrecisionScorer as APIContextualPrecisionScorer,
-    ContextualRecallScorer as APIContextualRecallScorer,
-    AnswerRelevancyScorer as APIAnswerRelevancyScorer,
-    AnswerCorrectnessScorer as APIAnswerCorrectnessScorer,
-)
-
-from judgeval.scorers.judgeval_scorers.local_implementations import (
-    AnswerRelevancyScorer as LocalAnswerRelevancyScorer,
-    ContextualPrecisionScorer as LocalContextualPrecisionScorer,
-    ContextualRecallScorer as LocalContextualRecallScorer,
-    ContextualRelevancyScorer as LocalContextualRelevancyScorer,
-    FaithfulnessScorer as LocalFaithfulnessScorer,
-    JsonCorrectnessScorer as LocalJsonCorrectnessScorer,
-    ToolCorrectnessScorer as LocalToolCorrectnessScorer,
-    HallucinationScorer as LocalHallucinationScorer,
-    SummarizationScorer as LocalSummarizationScorer,
-    AnswerCorrectnessScorer as LocalAnswerCorrectnessScorer
-)
-
-from judgeval.scorers.judgeval_scorers.classifiers import Text2SQLScorer
-
-
-class ScorerWrapper:
-    """
-    Wrapper class that can dynamically load either API or local implementation of a scorer.
-    """
-    def __init__(self, api_implementation: Type, local_implementation: Optional[Type] = None):
-        self.api_implementation = api_implementation
-        self.local_implementation = local_implementation
-        self._instance = None
-        self._init_args = None
-        self._init_kwargs = None
-
-    def __call__(self, *args, **kwargs):
-        """Store initialization arguments for later use when implementation is loaded"""
-        self._init_args = args
-        self._init_kwargs = kwargs
-        return self
-
-    def load_implementation(self, use_judgment: bool = True) -> Any:
-        """
-        Load the appropriate implementation based on the use_judgment flag.
-
-        Args:
-            use_judgment (bool): If True, use API implementation. If False, use local implementation.
-
-        Returns:
-            Instance of the appropriate implementation
-
-        Raises:
-            ValueError: If local implementation is requested but not available
-        """
-        if self._instance is not None:
-            return self._instance
-
-        if use_judgment:
-            implementation = self.api_implementation
-        else:
-            if self.local_implementation is None:
-                raise ValueError("No local implementation available for this scorer")
-            implementation = self.local_implementation
-
-        args = self._init_args or ()
-        kwargs = self._init_kwargs or {}
-        self._instance = implementation(*args, **kwargs)
-        return self._instance
-
-    def __getattr__(self, name):
-        """Defer all attribute access to the loaded implementation"""
-        if self._instance is None:
-            raise RuntimeError("Implementation not loaded. Call load_implementation() first")
-        return getattr(self._instance, name)
-
-# Create wrapped versions of all scorers
-
-AnswerCorrectnessScorer = ScorerWrapper(
-    api_implementation=APIAnswerCorrectnessScorer,
-    local_implementation=LocalAnswerCorrectnessScorer
-)
-
-AnswerRelevancyScorer = ScorerWrapper(
-    api_implementation=APIAnswerRelevancyScorer,
-    local_implementation=LocalAnswerRelevancyScorer
-)
-
-ToolCorrectnessScorer = ScorerWrapper(
-    api_implementation=APIToolCorrectnessScorer,
-    local_implementation=LocalToolCorrectnessScorer
-)
-
-JSONCorrectnessScorer = ScorerWrapper(
-    api_implementation=APIJSONCorrectnessScorer,
-    local_implementation=LocalJsonCorrectnessScorer
-)
-
-SummarizationScorer = ScorerWrapper(
-    api_implementation=APISummarizationScorer,
-    local_implementation=LocalSummarizationScorer
-)
-
-HallucinationScorer = ScorerWrapper(
-    api_implementation=APIHallucinationScorer,
-    local_implementation=LocalHallucinationScorer
-)
-
-FaithfulnessScorer = ScorerWrapper(
-    api_implementation=APIFaithfulnessScorer,
-    local_implementation=LocalFaithfulnessScorer
-)
-
-ContextualRelevancyScorer = ScorerWrapper(
-    api_implementation=APIContextualRelevancyScorer,
-    local_implementation=LocalContextualRelevancyScorer
-)
-
-ContextualPrecisionScorer = ScorerWrapper(
-    api_implementation=APIContextualPrecisionScorer,
-    local_implementation=LocalContextualPrecisionScorer
-)
-
-ContextualRecallScorer = ScorerWrapper(
-    api_implementation=APIContextualRecallScorer,
-    local_implementation=LocalContextualRecallScorer
-)
-
-__all__ = [
-    "ToolCorrectnessScorer",
-    "JSONCorrectnessScorer",
-    "SummarizationScorer",
-    "HallucinationScorer",
-    "FaithfulnessScorer",
-    "ContextualRelevancyScorer",
-    "ContextualPrecisionScorer",
-    "ContextualRecallScorer",
-    "AnswerRelevancyScorer",
-    "Text2SQLScorer",
-]
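For orientation: the deleted module exported ScorerWrapper instances rather than scorer classes, so calling a name only cached the constructor arguments, and the real scorer was built lazily by load_implementation(). A minimal sketch of that old calling pattern, as hypothetical caller code inferred from the deleted lines above:

    from judgeval.scorers.judgeval_scorers import FaithfulnessScorer

    wrapper = FaithfulnessScorer(threshold=0.8)  # __call__ just stores the args and returns the wrapper
    scorer = wrapper.load_implementation(use_judgment=False)  # builds the local implementation
    # Any attribute access before load_implementation() raised RuntimeError via __getattr__.

In 0.22.2 this indirection is gone: the local_implementations package is removed outright (see the deleted files in the list above) and the API scorers are imported directly.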
judgeval/scorers/judgeval_scorers/api_scorers/__init__.py
@@ -1,23 +1,25 @@
-from judgeval.scorers.judgeval_scorers.api_scorers.
-
-
-from judgeval.scorers.judgeval_scorers.api_scorers.
-
-
-from judgeval.scorers.judgeval_scorers.api_scorers.
-
-
-from judgeval.scorers.judgeval_scorers.api_scorers.
+from judgeval.scorers.judgeval_scorers.api_scorers.faithfulness import (
+    FaithfulnessScorer,
+)
+from judgeval.scorers.judgeval_scorers.api_scorers.answer_relevancy import (
+    AnswerRelevancyScorer,
+)
+from judgeval.scorers.judgeval_scorers.api_scorers.answer_correctness import (
+    AnswerCorrectnessScorer,
+)
+from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import (
+    InstructionAdherenceScorer,
+)
+from judgeval.scorers.judgeval_scorers.api_scorers.prompt_scorer import (
+    TracePromptScorer,
+    PromptScorer,
+)
 
 __all__ = [
-    "ToolCorrectnessScorer",
-    "JSONCorrectnessScorer",
-    "SummarizationScorer",
-    "HallucinationScorer",
     "FaithfulnessScorer",
-    "ContextualRelevancyScorer",
-    "ContextualPrecisionScorer",
-    "ContextualRecallScorer",
     "AnswerRelevancyScorer",
     "AnswerCorrectnessScorer",
+    "InstructionAdherenceScorer",
+    "TracePromptScorer",
+    "PromptScorer",
 ]
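The net effect on the public surface: the removed import lines (their module paths are truncated in the source diff) and the trimmed __all__ shrink the re-exports from ten names to six. A quick smoke test of the new surface, assuming judgeval 0.22.2 is installed:

    from judgeval.scorers.judgeval_scorers.api_scorers import (
        FaithfulnessScorer,
        AnswerRelevancyScorer,
        AnswerCorrectnessScorer,
        InstructionAdherenceScorer,
        TracePromptScorer,
        PromptScorer,
    )

    # Names dropped from __all__ should no longer resolve, e.g.:
    # from judgeval.scorers.judgeval_scorers.api_scorers import HallucinationScorer  # ImportError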
judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py
@@ -1,19 +1,13 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-    def __init__(self, threshold: float):
-        super().__init__(threshold=threshold, score_type=APIScorer.ANSWER_CORRECTNESS)
-
-    @property
-    def __name__(self):
-        return "Answer Correctness"
+from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
+from judgeval.constants import APIScorerType
+from judgeval.data import ExampleParams
+from typing import List
+
+
+class AnswerCorrectnessScorer(ExampleAPIScorerConfig):
+    score_type: APIScorerType = APIScorerType.ANSWER_CORRECTNESS
+    required_params: List[ExampleParams] = [
+        ExampleParams.INPUT,
+        ExampleParams.ACTUAL_OUTPUT,
+        ExampleParams.EXPECTED_OUTPUT,
+    ]
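The rewrite turns each scorer from an __init__-based subclass into a declarative config: score_type and required_params are class-level field defaults, so construction reduces to passing tuning knobs. The answer_relevancy and faithfulness hunks below follow the same pattern, differing only in required_params. A sketch of the new instantiation, assuming ExampleAPIScorerConfig still exposes the threshold field the old constructor took as a keyword:

    from judgeval.constants import APIScorerType
    from judgeval.scorers.judgeval_scorers.api_scorers.answer_correctness import (
        AnswerCorrectnessScorer,
    )

    scorer = AnswerCorrectnessScorer(threshold=0.9)  # threshold keyword is an assumption here
    assert scorer.score_type == APIScorerType.ANSWER_CORRECTNESS  # fixed by the class default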
judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py
@@ -1,19 +1,12 @@
-
-
-
-
-
-
-
-
-
-
-
-
-class AnswerRelevancyScorer(APIJudgmentScorer):
-    def __init__(self, threshold: float):
-        super().__init__(threshold=threshold, score_type=APIScorer.ANSWER_RELEVANCY)
-
-    @property
-    def __name__(self):
-        return "Answer Relevancy"
+from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
+from judgeval.constants import APIScorerType
+from judgeval.data import ExampleParams
+from typing import List
+
+
+class AnswerRelevancyScorer(ExampleAPIScorerConfig):
+    score_type: APIScorerType = APIScorerType.ANSWER_RELEVANCY
+    required_params: List[ExampleParams] = [
+        ExampleParams.INPUT,
+        ExampleParams.ACTUAL_OUTPUT,
+    ]
judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py
@@ -1,19 +1,13 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-    def __init__(self, threshold: float):
-        super().__init__(threshold=threshold, score_type=APIScorer.FAITHFULNESS)
-
-    @property
-    def __name__(self):
-        return "Faithfulness"
+from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
+from judgeval.constants import APIScorerType
+from judgeval.data import ExampleParams
+from typing import List
+
+
+class FaithfulnessScorer(ExampleAPIScorerConfig):
+    score_type: APIScorerType = APIScorerType.FAITHFULNESS
+    required_params: List[ExampleParams] = [
+        ExampleParams.INPUT,
+        ExampleParams.ACTUAL_OUTPUT,
+        ExampleParams.RETRIEVAL_CONTEXT,
+    ]
judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py
@@ -0,0 +1,15 @@
+from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
+from judgeval.constants import APIScorerType
+from judgeval.data import ExampleParams
+
+
+class InstructionAdherenceScorer(ExampleAPIScorerConfig):
+    def __init__(self, threshold: float):
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorerType.INSTRUCTION_ADHERENCE,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+            ],
+        )
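Note the inconsistency: unlike the three rewritten scorers above, this new file keeps the constructor style and passes required_params through super().__init__ rather than declaring them as field defaults. From the caller's side usage looks the same (a sketch, under the same threshold assumption as above):

    from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import (
        InstructionAdherenceScorer,
    )

    scorer = InstructionAdherenceScorer(threshold=0.7)  # INPUT and ACTUAL_OUTPUT are wired in by __init__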
judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py
@@ -0,0 +1,327 @@
+from judgeval.scorers.api_scorer import (
+    APIScorerConfig,
+    ExampleAPIScorerConfig,
+    TraceAPIScorerConfig,
+)
+from judgeval.constants import APIScorerType
+from typing import Dict, Any, Optional
+from judgeval.api import JudgmentSyncClient
+from judgeval.exceptions import JudgmentAPIError
+import os
+from judgeval.logger import judgeval_logger
+from abc import ABC
+from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
+from copy import copy
+from judgeval.utils.decorators.dont_throw import dont_throw
+
+
+def push_prompt_scorer(
+    name: str,
+    prompt: str,
+    threshold: float,
+    options: Optional[Dict[str, float]] = None,
+    model: str = JUDGMENT_DEFAULT_GPT_MODEL,
+    description: Optional[str] = None,
+    judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
+    organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
+    is_trace: bool = False,
+) -> str:
+    client = JudgmentSyncClient(judgment_api_key, organization_id)
+    try:
+        r = client.save_scorer(
+            payload={
+                "name": name,
+                "prompt": prompt,
+                "threshold": threshold,
+                "options": options,
+                "model": model,
+                "description": description,
+                "is_trace": is_trace,
+            }
+        )
+    except JudgmentAPIError as e:
+        raise JudgmentAPIError(
+            status_code=e.status_code,
+            detail=f"Failed to save prompt scorer: {e.detail}",
+            response=e.response,
+        )
+    return r["scorer_response"]["name"]
+
+
+def fetch_prompt_scorer(
+    name: str,
+    judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
+    organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
+):
+    client = JudgmentSyncClient(judgment_api_key, organization_id)
+    try:
+        fetched_scorers = client.fetch_scorers({"names": [name]})
+        if len(fetched_scorers["scorers"]) == 0:
+            judgeval_logger.error(f"Prompt scorer '{name}' not found")
+            raise JudgmentAPIError(
+                status_code=404,
+                detail=f"Prompt scorer '{name}' not found",
+                response=None,  # type: ignore
+            )
+        else:
+            scorer_config = fetched_scorers["scorers"][0]
+            scorer_config.pop("created_at")
+            scorer_config.pop("updated_at")
+            return scorer_config
+    except JudgmentAPIError as e:
+        raise JudgmentAPIError(
+            status_code=e.status_code,
+            detail=f"Failed to fetch prompt scorer '{name}': {e.detail}",
+            response=e.response,
+        )
+
+
+def scorer_exists(
+    name: str,
+    judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
+    organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
+):
+    client = JudgmentSyncClient(judgment_api_key, organization_id)
+    try:
+        return client.scorer_exists({"name": name})["exists"]
+    except JudgmentAPIError as e:
+        if e.status_code == 500:
+            raise JudgmentAPIError(
+                status_code=e.status_code,
+                detail=f"The server is temporarily unavailable. Please try your request again in a few moments. Error details: {e.detail}",
+                response=e.response,
+            )
+        raise JudgmentAPIError(
+            status_code=e.status_code,
+            detail=f"Failed to check if scorer exists: {e.detail}",
+            response=e.response,
+        )
+
+
+class BasePromptScorer(ABC, APIScorerConfig):
+    score_type: APIScorerType
+    prompt: str
+    options: Optional[Dict[str, float]] = None
+    description: Optional[str] = None
+    judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or ""
+    organization_id: str = os.getenv("JUDGMENT_ORG_ID") or ""
+
+    @classmethod
+    @dont_throw
+    def get(
+        cls,
+        name: str,
+        judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
+        organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
+    ):
+        scorer_config = fetch_prompt_scorer(name, judgment_api_key, organization_id)
+        if scorer_config["is_trace"] != issubclass(cls, TracePromptScorer):
+            raise JudgmentAPIError(
+                status_code=400,
+                detail=f"Scorer with name {name} is not a {cls.__name__}",
+                response=None,  # type: ignore
+            )
+        if issubclass(cls, TracePromptScorer):
+            score_type = APIScorerType.TRACE_PROMPT_SCORER
+        else:
+            score_type = APIScorerType.PROMPT_SCORER
+        return cls(
+            score_type=score_type,
+            name=name,
+            prompt=scorer_config["prompt"],
+            threshold=scorer_config["threshold"],
+            options=scorer_config.get("options"),
+            model=scorer_config.get("model"),
+            description=scorer_config.get("description"),
+            judgment_api_key=judgment_api_key,
+            organization_id=organization_id,
+        )
+
+    @classmethod
+    def create(
+        cls,
+        name: str,
+        prompt: str,
+        threshold: float = 0.5,
+        options: Optional[Dict[str, float]] = None,
+        model: str = JUDGMENT_DEFAULT_GPT_MODEL,
+        description: Optional[str] = None,
+        judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
+        organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
+    ):
+        if not scorer_exists(name, judgment_api_key, organization_id):
+            if issubclass(cls, TracePromptScorer):
+                is_trace = True
+                score_type = APIScorerType.TRACE_PROMPT_SCORER
+            else:
+                is_trace = False
+                score_type = APIScorerType.PROMPT_SCORER
+            push_prompt_scorer(
+                name,
+                prompt,
+                threshold,
+                options,
+                model,
+                description,
+                judgment_api_key,
+                organization_id,
+                is_trace,
+            )
+            judgeval_logger.info(f"Successfully created PromptScorer: {name}")
+            return cls(
+                score_type=score_type,
+                name=name,
+                prompt=prompt,
+                threshold=threshold,
+                options=options,
+                model=model,
+                description=description,
+                judgment_api_key=judgment_api_key,
+                organization_id=organization_id,
+            )
+        else:
+            raise JudgmentAPIError(
+                status_code=400,
+                detail=f"Scorer with name {name} already exists. Either use the existing scorer with the get() method or use a new name.",
+                response=None,  # type: ignore
+            )
+
+    # Setter functions. Each setter function pushes the scorer to the DB.
+    def set_threshold(self, threshold: float):
+        """
+        Updates the threshold of the scorer.
+        """
+        self.threshold = threshold
+        self.push_prompt_scorer()
+
+    def set_prompt(self, prompt: str):
+        """
+        Updates the prompt with the new prompt.
+
+        Sample prompt:
+        "Did the chatbot answer the user's question in a kind way?"
+        """
+        self.prompt = prompt
+        self.push_prompt_scorer()
+        judgeval_logger.info(f"Successfully updated prompt for {self.name}")
+
+    def set_model(self, model: str):
+        """
+        Updates the model of the scorer.
+        """
+        self.model = model
+        self.push_prompt_scorer()
+        judgeval_logger.info(f"Successfully updated model for {self.name}")
+
+    def set_options(self, options: Optional[Dict[str, float]]):
+        """
+        Updates the options of the scorer.
+        """
+        self.options = options
+        self.push_prompt_scorer()
+        judgeval_logger.info(f"Successfully updated options for {self.name}")
+
+    def set_description(self, description: Optional[str]):
+        """
+        Updates the description of the scorer.
+        """
+        self.description = description
+        self.push_prompt_scorer()
+        judgeval_logger.info(f"Successfully updated description for {self.name}")
+
+    def append_to_prompt(self, prompt_addition: str):
+        """
+        Appends a string to the prompt.
+        """
+        self.prompt += prompt_addition
+        self.push_prompt_scorer()
+        judgeval_logger.info(f"Successfully appended to prompt for {self.name}")
+
+    # Getters
+    def get_threshold(self) -> float:
+        """
+        Returns the threshold of the scorer.
+        """
+        return self.threshold
+
+    def get_prompt(self) -> str:
+        """
+        Returns the prompt of the scorer.
+        """
+        return self.prompt
+
+    def get_model(self) -> str:
+        """
+        Returns the model of the scorer.
+        """
+        return self.model
+
+    def get_options(self) -> Dict[str, float] | None:
+        """
+        Returns the options of the scorer.
+        """
+        return copy(self.options) if self.options is not None else None
+
+    def get_description(self) -> str | None:
+        """
+        Returns the description of the scorer.
+        """
+        return self.description
+
+    def get_name(self) -> str:
+        """
+        Returns the name of the scorer.
+        """
+        return self.name
+
+    def get_config(self) -> dict:
+        """
+        Returns a dictionary with all the fields in the scorer.
+        """
+        return {
+            "name": self.name,
+            "model": self.model,
+            "prompt": self.prompt,
+            "threshold": self.threshold,
+            "options": self.options,
+            "description": self.description,
+        }
+
+    def push_prompt_scorer(self):
+        """
+        Pushes the scorer to the DB.
+        """
+        push_prompt_scorer(
+            self.name,
+            self.prompt,
+            self.threshold,
+            self.options,
+            self.model,
+            self.description,
+            self.judgment_api_key,
+            self.organization_id,
+            isinstance(self, TracePromptScorer),
+        )
+
+    def __str__(self):
+        return f"PromptScorer(name={self.name}, model={self.model}, prompt={self.prompt}, threshold={self.threshold}, options={self.options}, description={self.description})"
+
+    def model_dump(self, *args, **kwargs) -> Dict[str, Any]:
+        base = super().model_dump(*args, **kwargs)
+        base_fields = set(APIScorerConfig.model_fields.keys())
+        all_fields = set(self.__class__.model_fields.keys())
+
+        extra_fields = all_fields - base_fields - {"kwargs"}
+
+        base["kwargs"] = {
+            k: getattr(self, k) for k in extra_fields if getattr(self, k) is not None
+        }
+        return base
+
+
+class PromptScorer(BasePromptScorer, ExampleAPIScorerConfig):
+    pass
+
+
+class TracePromptScorer(BasePromptScorer, TraceAPIScorerConfig):
+    pass
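Taken together, the new module gives prompt scorers a full remote lifecycle: create() registers a scorer (and raises a 400 if the name is taken), get() rehydrates an existing one by name, and every setter immediately pushes the change back through push_prompt_scorer(). A usage sketch grounded in the classmethods above; the scorer name is hypothetical, and JUDGMENT_API_KEY and JUDGMENT_ORG_ID are assumed to be set in the environment:

    from judgeval.scorers.judgeval_scorers.api_scorers.prompt_scorer import PromptScorer

    # "kindness" is a hypothetical name; the prompt is the sample from the set_prompt docstring.
    scorer = PromptScorer.create(
        name="kindness",
        prompt="Did the chatbot answer the user's question in a kind way?",
        threshold=0.5,
    )
    scorer.set_threshold(0.8)                  # each setter round-trips to the DB
    same = PromptScorer.get(name="kindness")   # loads the stored config by name

One sharp edge worth flagging: get() is wrapped in @dont_throw, which (judging by the decorator's name) likely swallows the JudgmentAPIError paths and returns None on failure, whereas create() raises.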