judgeval 0.0.32__py3-none-any.whl → 0.0.33__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/s3_storage.py +93 -0
- judgeval/common/tracer.py +612 -123
- judgeval/data/sequence.py +4 -10
- judgeval/judgment_client.py +25 -86
- judgeval/rules.py +4 -7
- judgeval/run_evaluation.py +1 -1
- judgeval/scorers/__init__.py +4 -4
- judgeval/scorers/judgeval_scorers/__init__.py +0 -176
- {judgeval-0.0.32.dist-info → judgeval-0.0.33.dist-info}/METADATA +15 -2
- judgeval-0.0.33.dist-info/RECORD +63 -0
- judgeval/scorers/base_scorer.py +0 -58
- judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -27
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -276
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
- judgeval/scorers/judgeval_scorers/local_implementations/comparison/__init__.py +0 -0
- judgeval/scorers/judgeval_scorers/local_implementations/comparison/comparison_scorer.py +0 -161
- judgeval/scorers/judgeval_scorers/local_implementations/comparison/prompts.py +0 -222
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
- judgeval/scorers/judgeval_scorers/local_implementations/execution_order/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/execution_order/execution_order.py +0 -156
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -318
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -264
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
- judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/instruction_adherence.py +0 -232
- judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/prompt.py +0 -102
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -551
- judgeval-0.0.32.dist-info/RECORD +0 -97
- {judgeval-0.0.32.dist-info → judgeval-0.0.33.dist-info}/WHEEL +0 -0
- {judgeval-0.0.32.dist-info → judgeval-0.0.33.dist-info}/licenses/LICENSE.md +0 -0
judgeval/data/sequence.py
CHANGED
@@ -1,7 +1,7 @@
 from pydantic import BaseModel, Field, field_validator, model_validator
 from typing import List, Optional, Union, Any
 from judgeval.data.example import Example
-from judgeval.scorers import
+from judgeval.scorers import JudgevalScorer, APIJudgmentScorer
 from uuid import uuid4
 from datetime import datetime, timezone
@@ -22,16 +22,10 @@ class Sequence(BaseModel):
 
     @field_validator("scorers")
     def validate_scorer(cls, v):
-        loaded_scorers = []
         for scorer in v or []:
-
-
-
-            else:
-                loaded_scorers.append(scorer)
-            except Exception as e:
-                raise ValueError(f"Failed to load implementation for scorer {scorer}: {str(e)}")
-        return loaded_scorers
+            if not isinstance(scorer, APIJudgmentScorer) and not isinstance(scorer, JudgevalScorer):
+                raise ValueError(f"Invalid scorer type: {type(scorer)}")
+        return v
 
     @model_validator(mode="after")
     def populate_sequence_metadata(self) -> "Sequence":
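The net effect for callers: `Sequence.scorers` no longer loads anything lazily during validation, it only type-checks. A minimal sketch of the new behavior, assuming the other `Sequence` fields have defaults (the scorer choice and threshold are illustrative):

from judgeval.data.sequence import Sequence
from judgeval.scorers import FaithfulnessScorer

# Instantiated scorers pass through the validator untouched
seq = Sequence(scorers=[FaithfulnessScorer(threshold=0.7)])

# Anything that is not an APIJudgmentScorer/JudgevalScorer now fails fast
Sequence(scorers=["faithfulness"])  # ValueError: Invalid scorer type: <class 'str'>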
judgeval/judgment_client.py
CHANGED
@@ -17,7 +17,6 @@ from judgeval.scorers import (
     APIJudgmentScorer,
     JudgevalScorer,
     ClassifierScorer,
-    ScorerWrapper,
 )
 from judgeval.evaluation_run import EvaluationRun
 from judgeval.run_evaluation import (
@@ -74,7 +73,7 @@ class JudgmentClient(metaclass=SingletonMeta):
     def a_run_evaluation(
         self,
         examples: List[Example],
-        scorers: List[Union[
+        scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
         model: Union[str, List[str], JudgevalJudge],
         aggregator: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
@@ -83,21 +82,32 @@ class JudgmentClient(metaclass=SingletonMeta):
         eval_run_name: str = "default_eval_run",
         override: bool = False,
         append: bool = False,
-        use_judgment: bool = True,
         ignore_errors: bool = True,
         rules: Optional[List[Rule]] = None
     ) -> List[ScoringResult]:
-        return self.run_evaluation(
+        return self.run_evaluation(
+            examples=examples,
+            scorers=scorers,
+            model=model,
+            aggregator=aggregator,
+            metadata=metadata,
+            log_results=log_results,
+            project_name=project_name,
+            eval_run_name=eval_run_name,
+            override=override,
+            append=append,
+            ignore_errors=ignore_errors,
+            rules=rules
+        )
 
     def run_sequence_evaluation(
         self,
         sequences: List[Sequence],
         model: Union[str, List[str], JudgevalJudge],
-        scorers: List[Union[
+        scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
         aggregator: Optional[str] = None,
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_sequence",
-        use_judgment: bool = True,
         log_results: bool = True,
         append: bool = False,
         override: bool = False,
@@ -105,16 +115,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         rules: Optional[List[Rule]] = None
     ) -> List[ScoringResult]:
         try:
-            loaded_scorers = []
-            for scorer in scorers:
-                try:
-                    if isinstance(scorer, ScorerWrapper):
-                        loaded_scorers.append(scorer.load_implementation())
-                    else:
-                        loaded_scorers.append(scorer)
-                except Exception as e:
-                    raise ValueError(f"Failed to load implementation for scorer {scorer}: {str(e)}")
-
             def get_all_sequences(root: Sequence) -> List[Sequence]:
                 all_sequences = [root]
 
@@ -132,31 +132,7 @@ class JudgmentClient(metaclass=SingletonMeta):
 
             flattened_sequences = flatten_sequence_list(sequences)
             for sequence in flattened_sequences:
-                sequence.scorers =
-
-                if rules:
-                    loaded_rules = []
-                    for rule in rules:
-                        try:
-                            processed_conditions = []
-                            for condition in rule.conditions:
-                                # Convert metric if it's a ScorerWrapper
-                                if isinstance(condition.metric, ScorerWrapper):
-                                    try:
-                                        condition_copy = condition.model_copy()
-                                        condition_copy.metric = condition.metric.load_implementation(use_judgment=use_judgment)
-                                        processed_conditions.append(condition_copy)
-                                    except Exception as e:
-                                        raise ValueError(f"Failed to convert ScorerWrapper to implementation in rule '{rule.name}', condition metric '{condition.metric}': {str(e)}")
-                                else:
-                                    processed_conditions.append(condition)
-
-                            # Create new rule with processed conditions
-                            new_rule = rule.model_copy()
-                            new_rule.conditions = processed_conditions
-                            loaded_rules.append(new_rule)
-                        except Exception as e:
-                            raise ValueError(f"Failed to process rule '{rule.name}': {str(e)}")
+                sequence.scorers = scorers
 
             sequence_run = SequenceRun(
                 project_name=project_name,
@@ -169,7 +145,7 @@ class JudgmentClient(metaclass=SingletonMeta):
                 judgment_api_key=self.judgment_api_key,
                 organization_id=self.organization_id
             )
-            return run_sequence_eval(sequence_run, override, ignore_errors
+            return run_sequence_eval(sequence_run, override, ignore_errors)
         except ValueError as e:
             raise ValueError(f"Please check your SequenceRun object, one or more fields are invalid: \n{str(e)}")
         except Exception as e:
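`run_sequence_evaluation` is now a passthrough: scorers are assigned verbatim to every flattened sequence, and the wrapper/rule preprocessing is gone. A hedged usage sketch against the new signature (client construction and the sequence list are illustrative):

from judgeval.judgment_client import JudgmentClient
from judgeval.scorers import DerailmentScorer

client = JudgmentClient()  # assumed: API key and org ID picked up from the environment
results = client.run_sequence_evaluation(
    sequences=my_sequences,                     # hypothetical List[Sequence] built elsewhere
    model="gpt-4o",                             # illustrative judge model
    scorers=[DerailmentScorer(threshold=0.5)],  # assigned to each sequence as-is
    project_name="default_project",
    eval_run_name="default_eval_sequence",
)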
@@ -178,7 +154,7 @@ class JudgmentClient(metaclass=SingletonMeta):
     def run_evaluation(
         self,
         examples: Union[List[Example], List[CustomExample]],
-        scorers: List[Union[
+        scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
         model: Union[str, List[str], JudgevalJudge],
         aggregator: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
@@ -187,7 +163,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         eval_run_name: str = "default_eval_run",
         override: bool = False,
         append: bool = False,
-        use_judgment: bool = True,
         ignore_errors: bool = True,
         async_execution: bool = False,
         rules: Optional[List[Rule]] = None
@@ -197,7 +172,7 @@ class JudgmentClient(metaclass=SingletonMeta):
 
         Args:
             examples (Union[List[Example], List[CustomExample]]): The examples to evaluate
-            scorers (List[Union[
+            scorers (List[Union[APIJudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
             model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
             aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
             metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run
@@ -205,7 +180,6 @@ class JudgmentClient(metaclass=SingletonMeta):
             project_name (str): The name of the project the evaluation results belong to
             eval_run_name (str): A name for this evaluation run
             override (bool): Whether to override an existing evaluation run with the same name
-            use_judgment (bool): Whether to use Judgment API for evaluation
             ignore_errors (bool): Whether to ignore errors during evaluation (safely handled)
             rules (Optional[List[Rule]]): Rules to evaluate against scoring results
 
@@ -216,58 +190,21 @@ class JudgmentClient(metaclass=SingletonMeta):
             raise ValueError("Cannot set both override and append to True. Please choose one.")
 
         try:
-
-            loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = []
-            for scorer in scorers:
-                try:
-                    if isinstance(scorer, ScorerWrapper):
-                        loaded_scorers.append(scorer.load_implementation(use_judgment=use_judgment))
-                    else:
-                        loaded_scorers.append(scorer)
-                except Exception as e:
-                    raise ValueError(f"Failed to load implementation for scorer {scorer}: {str(e)}")
-
-            # Prevent using JudgevalScorer with rules - only APIJudgmentScorer allowed with rules
-            if rules and any(isinstance(scorer, JudgevalScorer) for scorer in loaded_scorers):
+            if rules and any(isinstance(scorer, JudgevalScorer) for scorer in scorers):
                 raise ValueError("Cannot use Judgeval scorers (only API scorers) when using rules. Please either remove rules or use only APIJudgmentScorer types.")
 
-            # Convert ScorerWrapper in rules to their implementations
-            loaded_rules = None
-            if rules:
-                loaded_rules = []
-                for rule in rules:
-                    try:
-                        processed_conditions = []
-                        for condition in rule.conditions:
-                            # Convert metric if it's a ScorerWrapper
-                            if isinstance(condition.metric, ScorerWrapper):
-                                try:
-                                    condition_copy = condition.model_copy()
-                                    condition_copy.metric = condition.metric.load_implementation(use_judgment=use_judgment)
-                                    processed_conditions.append(condition_copy)
-                                except Exception as e:
-                                    raise ValueError(f"Failed to convert ScorerWrapper to implementation in rule '{rule.name}', condition metric '{condition.metric}': {str(e)}")
-                            else:
-                                processed_conditions.append(condition)
-
-                        # Create new rule with processed conditions
-                        new_rule = rule.model_copy()
-                        new_rule.conditions = processed_conditions
-                        loaded_rules.append(new_rule)
-                    except Exception as e:
-                        raise ValueError(f"Failed to process rule '{rule.name}': {str(e)}")
             eval = EvaluationRun(
                 log_results=log_results,
                 append=append,
                 project_name=project_name,
                 eval_name=eval_run_name,
                 examples=examples,
-                scorers=
+                scorers=scorers,
                 model=model,
                 aggregator=aggregator,
                 metadata=metadata,
                 judgment_api_key=self.judgment_api_key,
-                rules=
+                rules=rules,
                 organization_id=self.organization_id
             )
             return run_eval(eval, override, ignore_errors=ignore_errors, async_execution=async_execution)
@@ -505,6 +442,8 @@ class JudgmentClient(metaclass=SingletonMeta):
             raise JudgmentAPIError(f"Failed to fetch classifier scorer '{slug}': {response.json().get('detail', '')}")
 
         scorer_config = response.json()
+        created_at = scorer_config.pop("created_at")
+        updated_at = scorer_config.pop("updated_at")
 
         try:
             return ClassifierScorer(**scorer_config)
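Two changes land here: `run_evaluation` loses the `use_judgment` indirection and passes `scorers` and `rules` straight into `EvaluationRun`, and the classifier fetch now strips the server's `created_at`/`updated_at` timestamps before hydrating `ClassifierScorer`, presumably because they are not fields on the model. A hedged sketch of the resulting call shape (example data, thresholds, and the model name are illustrative; the `Example` field names are assumptions):

from judgeval.judgment_client import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import FaithfulnessScorer, AnswerRelevancyScorer

client = JudgmentClient()
results = client.run_evaluation(
    examples=[Example(
        input="What is the capital of France?",
        actual_output="Paris",
        retrieval_context=["France's capital is Paris."],
    )],
    scorers=[FaithfulnessScorer(threshold=0.7), AnswerRelevancyScorer(threshold=0.5)],
    model="gpt-4o",
    project_name="default_project",
    eval_run_name="default_eval_run",
)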
judgeval/rules.py
CHANGED
@@ -10,7 +10,7 @@ from concurrent.futures import ThreadPoolExecutor
 import time
 import uuid
 
-from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
+from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
 
 class AlertStatus(str, Enum):
     """Status of an alert evaluation."""
@@ -23,22 +23,19 @@ class Condition(BaseModel):
 
     Example:
         {
-            "metric": FaithfulnessScorer(threshold=0.7) # Must be a scorer object: APIJudgmentScorer, JudgevalScorer
+            "metric": FaithfulnessScorer(threshold=0.7) # Must be a scorer object: APIJudgmentScorer, JudgevalScorer
         }
 
     The Condition class uses the scorer's threshold and success function internally.
     """
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
-    metric: Union[APIJudgmentScorer, JudgevalScorer
+    metric: Union[APIJudgmentScorer, JudgevalScorer]
 
     @property
     def metric_name(self) -> str:
         """Get the name of the metric for lookups in scores dictionary."""
-        if
-            # Handle ScorerWrapper case specifically
-            return self.metric.scorer.score_type if hasattr(self.metric.scorer, 'score_type') else str(self.metric.scorer)
-        elif hasattr(self.metric, 'score_type'):
+        if hasattr(self.metric, 'score_type'):
            # Handle APIJudgmentScorer and JudgevalScorer which have score_type
            return self.metric.score_type
        elif hasattr(self.metric, '__name__'):
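With the ScorerWrapper branch gone, a `Condition`'s metric must be a concrete scorer object and `metric_name` resolves straight off its `score_type`. A small sketch (the threshold is illustrative):

from judgeval.rules import Condition
from judgeval.scorers import FaithfulnessScorer

condition = Condition(metric=FaithfulnessScorer(threshold=0.7))
print(condition.metric_name)  # resolved via the scorer's score_type attribute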
judgeval/run_evaluation.py
CHANGED
@@ -334,7 +334,7 @@ def check_examples(examples: List[Example], scorers: List[APIJudgmentScorer]) ->
                # Example ID (usually random UUID) does not provide any helpful information for the user but printing the entire example is overdoing it
                print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
 
-def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_errors: bool = True
+def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_errors: bool = True) -> List[ScoringResult]:
    # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
    if not override and sequence_run.log_results and not sequence_run.append:
        check_eval_run_name_exists(
judgeval/scorers/__init__.py
CHANGED
@@ -1,7 +1,7 @@
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.scorers.judgeval_scorer import JudgevalScorer
 from judgeval.scorers.prompt_scorer import PromptScorer, ClassifierScorer
-from judgeval.scorers.judgeval_scorers import (
+from judgeval.scorers.judgeval_scorers.api_scorers import (
     ExecutionOrderScorer,
     JSONCorrectnessScorer,
     SummarizationScorer,
@@ -11,14 +11,15 @@ from judgeval.scorers.judgeval_scorers import (
     ContextualPrecisionScorer,
     ContextualRecallScorer,
     AnswerRelevancyScorer,
-    ScorerWrapper,
     AnswerCorrectnessScorer,
-    Text2SQLScorer,
     ComparisonScorer,
     InstructionAdherenceScorer,
     GroundednessScorer,
     DerailmentScorer,
 )
+from judgeval.scorers.judgeval_scorers.classifiers import (
+    Text2SQLScorer,
+)
 
 __all__ = [
     "APIJudgmentScorer",
@@ -34,7 +35,6 @@ __all__ = [
     "ContextualPrecisionScorer",
     "ContextualRecallScorer",
     "AnswerRelevancyScorer",
-    "ScorerWrapper",
     "AnswerCorrectnessScorer",
     "Text2SQLScorer",
     "ComparisonScorer",
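The public surface of `judgeval.scorers` is unchanged apart from dropping `ScorerWrapper`; the names are simply re-exported from the `api_scorers` and `classifiers` submodules now, so existing imports keep working:

# Same names as before, now sourced from api_scorers/ and classifiers/
from judgeval.scorers import FaithfulnessScorer, Text2SQLScorer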
judgeval/scorers/judgeval_scorers/__init__.py
DELETED
@@ -1,176 +0,0 @@
-from typing import Type, Optional, Any
-
-# Import implementations
-from judgeval.scorers.judgeval_scorers.api_scorers import (
-    ExecutionOrderScorer as APIExecutionOrderScorer,
-    JSONCorrectnessScorer as APIJSONCorrectnessScorer,
-    SummarizationScorer as APISummarizationScorer,
-    HallucinationScorer as APIHallucinationScorer,
-    FaithfulnessScorer as APIFaithfulnessScorer,
-    ContextualRelevancyScorer as APIContextualRelevancyScorer,
-    ContextualPrecisionScorer as APIContextualPrecisionScorer,
-    ContextualRecallScorer as APIContextualRecallScorer,
-    AnswerRelevancyScorer as APIAnswerRelevancyScorer,
-    AnswerCorrectnessScorer as APIAnswerCorrectnessScorer,
-    ComparisonScorer as APIComparisonScorer,
-    InstructionAdherenceScorer as APIInstructionAdherenceScorer,
-    GroundednessScorer as APIGroundednessScorer,
-    DerailmentScorer as APIDerailmentScorer,
-)
-
-from judgeval.scorers.judgeval_scorers.local_implementations import (
-    AnswerRelevancyScorer as LocalAnswerRelevancyScorer,
-    ContextualPrecisionScorer as LocalContextualPrecisionScorer,
-    ContextualRecallScorer as LocalContextualRecallScorer,
-    ContextualRelevancyScorer as LocalContextualRelevancyScorer,
-    FaithfulnessScorer as LocalFaithfulnessScorer,
-    JsonCorrectnessScorer as LocalJsonCorrectnessScorer,
-    ExecutionOrderScorer as LocalExecutionOrderScorer,
-    HallucinationScorer as LocalHallucinationScorer,
-    SummarizationScorer as LocalSummarizationScorer,
-    AnswerCorrectnessScorer as LocalAnswerCorrectnessScorer,
-    ComparisonScorer as LocalComparisonScorer,
-    InstructionAdherenceScorer as LocalInstructionAdherenceScorer,
-)
-
-from judgeval.scorers.judgeval_scorers.classifiers import Text2SQLScorer
-
-
-class ScorerWrapper:
-    """
-    Wrapper class that can dynamically load either API or local implementation of a scorer.
-    """
-    def __init__(self, api_implementation: Type, local_implementation: Optional[Type] = None):
-        self.api_implementation = api_implementation
-        self.local_implementation = local_implementation
-        self._instance = None
-        self._init_args = None
-        self._init_kwargs = None
-
-    def __call__(self, *args, **kwargs):
-        """Store initialization arguments for later use when implementation is loaded"""
-        self._init_args = args
-        self._init_kwargs = kwargs
-        return self
-
-    def load_implementation(self, use_judgment: bool = True) -> Any:
-        """
-        Load the appropriate implementation based on the use_judgment flag.
-
-        Args:
-            use_judgment (bool): If True, use API implementation. If False, use local implementation.
-
-        Returns:
-            Instance of the appropriate implementation
-
-        Raises:
-            ValueError: If local implementation is requested but not available
-        """
-        if self._instance is not None:
-            return self._instance
-
-        if use_judgment:
-            implementation = self.api_implementation
-        else:
-            if self.local_implementation is None:
-                raise ValueError("No local implementation available for this scorer")
-            implementation = self.local_implementation
-
-        args = self._init_args or ()
-        kwargs = self._init_kwargs or {}
-        self._instance = implementation(*args, **kwargs)
-        return self._instance
-
-    def __getattr__(self, name):
-        """Defer all attribute access to the loaded implementation"""
-        if self._instance is None:
-            raise RuntimeError("Implementation not loaded. Call load_implementation() first")
-        return getattr(self._instance, name)
-
-# Create wrapped versions of all scorers
-
-AnswerCorrectnessScorer = ScorerWrapper(
-    api_implementation=APIAnswerCorrectnessScorer,
-    local_implementation=LocalAnswerCorrectnessScorer
-)
-
-AnswerRelevancyScorer = ScorerWrapper(
-    api_implementation=APIAnswerRelevancyScorer,
-    local_implementation=LocalAnswerRelevancyScorer
-)
-
-ExecutionOrderScorer = ScorerWrapper(
-    api_implementation=APIExecutionOrderScorer,
-    local_implementation=LocalExecutionOrderScorer
-)
-
-JSONCorrectnessScorer = ScorerWrapper(
-    api_implementation=APIJSONCorrectnessScorer,
-    local_implementation=LocalJsonCorrectnessScorer
-)
-
-SummarizationScorer = ScorerWrapper(
-    api_implementation=APISummarizationScorer,
-    local_implementation=LocalSummarizationScorer
-)
-
-HallucinationScorer = ScorerWrapper(
-    api_implementation=APIHallucinationScorer,
-    local_implementation=LocalHallucinationScorer
-)
-
-FaithfulnessScorer = ScorerWrapper(
-    api_implementation=APIFaithfulnessScorer,
-    local_implementation=LocalFaithfulnessScorer
-)
-
-ContextualRelevancyScorer = ScorerWrapper(
-    api_implementation=APIContextualRelevancyScorer,
-    local_implementation=LocalContextualRelevancyScorer
-)
-
-ContextualPrecisionScorer = ScorerWrapper(
-    api_implementation=APIContextualPrecisionScorer,
-    local_implementation=LocalContextualPrecisionScorer
-)
-
-ContextualRecallScorer = ScorerWrapper(
-    api_implementation=APIContextualRecallScorer,
-    local_implementation=LocalContextualRecallScorer
-)
-
-InstructionAdherenceScorer = ScorerWrapper(
-    api_implementation=APIInstructionAdherenceScorer,
-    local_implementation=LocalInstructionAdherenceScorer
-)
-
-def ComparisonScorer(threshold: float, criteria: str, description: str):
-    return ScorerWrapper(
-        api_implementation=APIComparisonScorer,
-        local_implementation=LocalComparisonScorer
-    )(threshold=threshold, criteria=criteria, description=description)
-
-GroundednessScorer = ScorerWrapper(
-    api_implementation=APIGroundednessScorer,
-)
-
-DerailmentScorer = ScorerWrapper(
-    api_implementation=APIDerailmentScorer,
-    local_implementation=LocalInstructionAdherenceScorer # TODO: add local implementation
-)
-
-__all__ = [
-    "ExecutionOrderScorer",
-    "JSONCorrectnessScorer",
-    "SummarizationScorer",
-    "HallucinationScorer",
-    "FaithfulnessScorer",
-    "ContextualRelevancyScorer",
-    "ContextualPrecisionScorer",
-    "ContextualRecallScorer",
-    "AnswerRelevancyScorer",
-    "Text2SQLScorer",
-    "ComparisonScorer",
-    "GroundednessScorer",
-    "DerailmentScorer",
-]
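For callers, deleting this module removes the two-step wrapper dance. A before/after sketch, with the 0.0.32 lines reconstructed from the deleted code above:

from judgeval.scorers import FaithfulnessScorer

# 0.0.32: the import gave a ScorerWrapper; calling it only stored init args,
# and load_implementation() later picked the API or local class:
#   scorer = FaithfulnessScorer(threshold=0.7)
#   impl = scorer.load_implementation(use_judgment=True)

# 0.0.33: the import yields the API scorer directly
scorer = FaithfulnessScorer(threshold=0.7)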
{judgeval-0.0.32.dist-info → judgeval-0.0.33.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.0.32
+Version: 0.0.33
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -12,12 +12,13 @@ Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.11
 Requires-Dist: anthropic
 Requires-Dist: fastapi
+Requires-Dist: google-genai
 Requires-Dist: langchain
 Requires-Dist: langchain-anthropic
 Requires-Dist: langchain-core
 Requires-Dist: langchain-huggingface
 Requires-Dist: langchain-openai
-Requires-Dist: litellm
+Requires-Dist: litellm==1.38.12
 Requires-Dist: nest-asyncio
 Requires-Dist: openai
 Requires-Dist: openpyxl
@@ -94,9 +95,21 @@ Create a file named `traces.py` with the following code:
 from judgeval.common.tracer import Tracer, wrap
 from openai import OpenAI
 
+# Basic initialization
 client = wrap(OpenAI())
 judgment = Tracer(project_name="my_project")
 
+# Or with S3 storage enabled
+# NOTE: Make sure AWS creds correspond to an account with write access to the specified S3 bucket
+judgment = Tracer(
+    project_name="my_project",
+    use_s3=True,
+    s3_bucket_name="my-traces-bucket",  # Bucket created automatically if it doesn't exist
+    s3_aws_access_key_id="your-access-key",  # Optional: defaults to AWS_ACCESS_KEY_ID env var
+    s3_aws_secret_access_key="your-secret-key",  # Optional: defaults to AWS_SECRET_ACCESS_KEY env var
+    s3_region_name="us-west-1"  # Optional: defaults to AWS_REGION env var or "us-west-1"
+)
+
 @judgment.observe(span_type="tool")
 def my_tool():
     return "Hello world!"
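Since the three credential parameters are documented above as falling back to environment variables, the same S3 setup can presumably be written without inline secrets:

from judgeval.common.tracer import Tracer

# Assumes AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and AWS_REGION are
# exported in the environment, per the parameter comments above
judgment = Tracer(
    project_name="my_project",
    use_s3=True,
    s3_bucket_name="my-traces-bucket",
)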
judgeval-0.0.33.dist-info/RECORD
ADDED
@@ -0,0 +1,63 @@
+judgeval/__init__.py,sha256=x9HWt4waJwJMAqTuJSg2MezF9Zg-macEjeU-ajbly-8,330
+judgeval/clients.py,sha256=6VQmEqmfCngUdS2MuPBIpHvtDFqOENm8-_BmMvjLyRQ,944
+judgeval/constants.py,sha256=_XmVAkebMyGrDvvanAVlMgVd4p6MLHdEVsTQFI0kz1k,5411
+judgeval/evaluation_run.py,sha256=WGzx-Ug2qhSmunFo8NrmSstBRsOUc5KpKq0Lc51rqsM,6739
+judgeval/judgment_client.py,sha256=brRYmphZR-2IUre9kdOhfse1mYDilcIqUzzH21ROAdk,22208
+judgeval/rules.py,sha256=jkh1cXXcUf8oRY7xJUZfcQBYWn_rjUW4GvrhRt15PeU,20265
+judgeval/run_evaluation.py,sha256=elMpFHahyeukKKa09fmJM3c_afwJ00mbZRqm18l5f00,28481
+judgeval/version_check.py,sha256=bvJEidB7rAeXozoUbN9Yb97QOR_s2hgvpvj74jJ5HlY,943
+judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
+judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
+judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
+judgeval/common/s3_storage.py,sha256=W8wq9S7qJZdqdBR4sk3aEZ4K3-pz40DOoolOJrWs9Vo,3768
+judgeval/common/tracer.py,sha256=YsObK8VQXp1DDbU9xncU8NjuY-JUI54BqmG4olezrZc,92507
+judgeval/common/utils.py,sha256=LUQV5JfDr6wj7xHAJoNq-gofNZ6mjXbeKrGKzBME1KM,33533
+judgeval/data/__init__.py,sha256=xuKx_KCVHGp6CXvQuVmKl3v7pJp-qDaz0NccKxwjtO0,481
+judgeval/data/custom_example.py,sha256=QRBqiRiZS8UgVeTRHY0r1Jzm6yAYsyg6zmHxQGxdiQs,739
+judgeval/data/example.py,sha256=cJrmPGLel_P2sy1UaRvuVSAi35EnA9XMR11Lhp4aDLo,5930
+judgeval/data/result.py,sha256=Gb9tiSDsk1amXgh0cFG6JmlW_BMKxS2kuTwNA0rrHjA,3184
+judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,3273
+judgeval/data/sequence.py,sha256=FmKVdzQP5VTujRCHDWk097MKRR-rJgbsdrxyCKee6tA,1994
+judgeval/data/sequence_run.py,sha256=RmYjfWKMWg-pcF5PLeiWfrhuDkjDZi5VEmAIEXN3Ib0,2104
+judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
+judgeval/data/datasets/dataset.py,sha256=dhLo30hvpmmOK2R6O5wDs_neawUJ4lS8bb4S42SufNQ,13034
+judgeval/data/datasets/eval_dataset_client.py,sha256=xjj66BO9Es9IxXqzQe1RT_e0kpeKlt7OrhRoSuj4KHM,15085
+judgeval/integrations/langgraph.py,sha256=J-cQfFP52TjJewdSTe-fcsUC4HDvjNbXoxmbmF0SgiE,11743
+judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
+judgeval/judges/base_judge.py,sha256=ch_S7uBB7lyv44Lf1d7mIGFpveOO58zOkkpImKgd9_4,994
+judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6KTg,2424
+judgeval/judges/mixture_of_judges.py,sha256=IJoi4Twk8ze1CJWVEp69k6TSqTCTGrmVYQ0qdffer60,15549
+judgeval/judges/together_judge.py,sha256=l00hhPerAZXg3oYBd8cyMtWsOTNt_0FIqoxhKJKQe3k,2302
+judgeval/judges/utils.py,sha256=9lvUxziGV86ISvVFxYBWc09TWFyAQgUTyPf_a9mD5Rs,2686
+judgeval/scorers/__init__.py,sha256=Mk-mWUt_gNpJqY_WIEuQynD6fxc34fWSRSuobMSrj94,1238
+judgeval/scorers/api_scorer.py,sha256=NQ_CrrUPhSUk1k2Q8rKpCG_TU2FT32sFEqvb-Yi54B0,2688
+judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
+judgeval/scorers/judgeval_scorer.py,sha256=79-JJurqHP-qTaWNWInx4SjvQYwXc9lvfPPNgwsh2yA,6773
+judgeval/scorers/prompt_scorer.py,sha256=PaAs2qRolw1P3_I061Xvk9qzvF4O-JR8g_39RqXnHcM,17728
+judgeval/scorers/score.py,sha256=r9QiT4-LIvivcJ6XxByrbswKSO8eQTtAD1UlXT_lcmo,18741
+judgeval/scorers/utils.py,sha256=iHQVTlIANbmCTXz9kTeSdOytgUZ_T74Re61ajqsk_WQ,6827
+judgeval/scorers/judgeval_scorers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=_sDUBxSG536KGqXNi6dFpaYKghjEAadxBxaaxV9HuuE,1764
+judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=Fnd9CVIOZ73sWEWymsU5eBrrZqPFjMZ0BKpeW-PDyTg,711
+judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=oETeN9K0HSIRdL2SDqn82Vskpwh5SlKnZvs5VDm2OBU,658
+judgeval/scorers/judgeval_scorers/api_scorers/comparison.py,sha256=kuzf9OWvpY38yYSwlBgneLkUZwJNM4FQqvbS66keA90,1249
+judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py,sha256=tpSuzFAaW8X9xqA0aLLKwh7qmBK0Pc_bJZMIe_q412U,770
+judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py,sha256=pFVhk4pLtQ-FnNlbI-dFF-SIh69Jza7erHqiPkFWoBo,758
+judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py,sha256=RQ6DZwEhChfecd89Ey-T7ke--7qTaXZlRsNxwH8gaME,823
+judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py,sha256=V9WPuwNMm097V7IknKs8UkmAk0yjnBXTcJha_BHXxTA,475
+judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py,sha256=Pb3CiNF2Ca826B92wJCVAi_68lJjLhqqCKwQKaflSUg,1294
+judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py,sha256=-BwOapqjryYNKNydtdkUiKIij76dY0O1jBmdc6dKazQ,692
+judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py,sha256=ntEEeTANEOsGlcbiTAF_3r6BeSJEaVDns8po8T0L6Vg,692
+judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py,sha256=k5gDOki-8KXrZXydvdSqDt3NZqQ28hXoOCHQf6jNxr4,686
+judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py,sha256=XnSGEkQfwVqaqnHEGMCsxNiHVzrsrej48uDbLoWc8CQ,678
+judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py,sha256=mMKEuR87_yanEuZJ5YSGFMHDD_oLVZ6-rQuciFaDOMA,1095
+judgeval/scorers/judgeval_scorers/api_scorers/summarization.py,sha256=QmWB8bVbDYHY5FcF0rYZE_3c2XXgMLRmR6aXJWfdMC4,655
+judgeval/scorers/judgeval_scorers/classifiers/__init__.py,sha256=Qt81W5ZCwMvBAne0LfQDb8xvg5iOG1vEYP7WizgwAZo,67
+judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py,sha256=8iTzMvou1Dr8pybul6lZHKjc9Ye2-0_racRGYkhEdTY,74
+judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py,sha256=ly72Z7s_c8NID6-nQnuW8qEGEW2MqdvpJ-5WfXzbAQg,2579
+judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
+judgeval/utils/alerts.py,sha256=O19Xj7DA0YVjl8PWiuH4zfdZeu3yiLVvHfY8ah2wG0g,2759
+judgeval-0.0.33.dist-info/METADATA,sha256=KzTkGTHYE8Uplehvtk_7x30XrV0xe1bpd-tU5lt0mHg,6097
+judgeval-0.0.33.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.0.33.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.0.33.dist-info/RECORD,,
judgeval/scorers/base_scorer.py
DELETED
@@ -1,58 +0,0 @@
-"""
-Judgment Scorer class.
-
-Scores `Example`s using ready-made Judgment evaluators.
-"""
-
-from pydantic import BaseModel, field_validator
-from judgeval.common.logger import debug, info, warning, error
-
-from judgeval.constants import APIScorer, UNBOUNDED_SCORERS
-
-
-class APIJudgmentScorer(BaseModel):
-    """
-    Class for ready-made, "out-of-the-box" scorer that uses Judgment evaluators to score `Example`s.
-
-    Args:
-        score_type (APIScorer): The Judgment metric to use for scoring `Example`s
-        threshold (float): A value between 0 and 1 that determines the scoring threshold
-    """
-    score_type: APIScorer
-    threshold: float
-
-    @field_validator('threshold')
-    def validate_threshold(cls, v, info):
-        """
-        Validates that the threshold is between 0 and 1 inclusive.
-        """
-        score_type = info.data.get('score_type')
-        if score_type in UNBOUNDED_SCORERS:
-            if v < 0:
-                error(f"Threshold for {score_type} must be greater than 0, got: {v}")
-                raise ValueError(f"Threshold for {score_type} must be greater than 0, got: {v}")
-        else:
-            if not 0 <= v <= 1:
-                error(f"Threshold for {score_type} must be between 0 and 1, got: {v}")
-                raise ValueError(f"Threshold for {score_type} must be between 0 and 1, got: {v}")
-        return v
-
-    @field_validator('score_type')
-    def convert_to_enum_value(cls, v):
-        """
-        Validates that the `score_type` is a valid `JudgmentMetric` enum value.
-        Converts string values to `JudgmentMetric` enum values.
-        """
-        debug(f"Attempting to convert score_type value: {v}")
-        if isinstance(v, APIScorer):
-            info(f"Using existing JudgmentMetric: {v.value}")
-            return v.value
-        elif isinstance(v, str):
-            debug(f"Converting string value to JudgmentMetric enum: {v}")
-            return APIScorer[v.upper()].value
-        error(f"Invalid score_type value: {v}")
-        raise ValueError(f"Invalid value for score_type: {v}")
-
-    def __str__(self):
-        return f"JudgmentScorer(score_type={self.score_type.value}, threshold={self.threshold})"
-