judgeval 0.0.31__py3-none-any.whl → 0.0.34__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- judgeval/__init__.py +3 -1
- judgeval/common/s3_storage.py +93 -0
- judgeval/common/tracer.py +869 -183
- judgeval/constants.py +1 -1
- judgeval/data/datasets/dataset.py +5 -1
- judgeval/data/datasets/eval_dataset_client.py +2 -2
- judgeval/data/sequence.py +16 -26
- judgeval/data/sequence_run.py +2 -0
- judgeval/judgment_client.py +44 -166
- judgeval/rules.py +4 -7
- judgeval/run_evaluation.py +2 -2
- judgeval/scorers/__init__.py +4 -4
- judgeval/scorers/judgeval_scorers/__init__.py +0 -176
- judgeval/version_check.py +22 -0
- {judgeval-0.0.31.dist-info → judgeval-0.0.34.dist-info}/METADATA +15 -2
- judgeval-0.0.34.dist-info/RECORD +63 -0
- judgeval/scorers/base_scorer.py +0 -58
- judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -27
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -276
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
- judgeval/scorers/judgeval_scorers/local_implementations/comparison/__init__.py +0 -0
- judgeval/scorers/judgeval_scorers/local_implementations/comparison/comparison_scorer.py +0 -161
- judgeval/scorers/judgeval_scorers/local_implementations/comparison/prompts.py +0 -222
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
- judgeval/scorers/judgeval_scorers/local_implementations/execution_order/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/execution_order/execution_order.py +0 -156
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -318
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -264
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
- judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/instruction_adherence.py +0 -232
- judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/prompt.py +0 -102
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -551
- judgeval-0.0.31.dist-info/RECORD +0 -96
- {judgeval-0.0.31.dist-info → judgeval-0.0.34.dist-info}/WHEEL +0 -0
- {judgeval-0.0.31.dist-info → judgeval-0.0.34.dist-info}/licenses/LICENSE.md +0 -0
judgeval/scorers/judgeval_scorers/__init__.py

```diff
@@ -1,176 +0,0 @@
-from typing import Type, Optional, Any
-
-# Import implementations
-from judgeval.scorers.judgeval_scorers.api_scorers import (
-    ExecutionOrderScorer as APIExecutionOrderScorer,
-    JSONCorrectnessScorer as APIJSONCorrectnessScorer,
-    SummarizationScorer as APISummarizationScorer,
-    HallucinationScorer as APIHallucinationScorer,
-    FaithfulnessScorer as APIFaithfulnessScorer,
-    ContextualRelevancyScorer as APIContextualRelevancyScorer,
-    ContextualPrecisionScorer as APIContextualPrecisionScorer,
-    ContextualRecallScorer as APIContextualRecallScorer,
-    AnswerRelevancyScorer as APIAnswerRelevancyScorer,
-    AnswerCorrectnessScorer as APIAnswerCorrectnessScorer,
-    ComparisonScorer as APIComparisonScorer,
-    InstructionAdherenceScorer as APIInstructionAdherenceScorer,
-    GroundednessScorer as APIGroundednessScorer,
-    DerailmentScorer as APIDerailmentScorer,
-)
-
-from judgeval.scorers.judgeval_scorers.local_implementations import (
-    AnswerRelevancyScorer as LocalAnswerRelevancyScorer,
-    ContextualPrecisionScorer as LocalContextualPrecisionScorer,
-    ContextualRecallScorer as LocalContextualRecallScorer,
-    ContextualRelevancyScorer as LocalContextualRelevancyScorer,
-    FaithfulnessScorer as LocalFaithfulnessScorer,
-    JsonCorrectnessScorer as LocalJsonCorrectnessScorer,
-    ExecutionOrderScorer as LocalExecutionOrderScorer,
-    HallucinationScorer as LocalHallucinationScorer,
-    SummarizationScorer as LocalSummarizationScorer,
-    AnswerCorrectnessScorer as LocalAnswerCorrectnessScorer,
-    ComparisonScorer as LocalComparisonScorer,
-    InstructionAdherenceScorer as LocalInstructionAdherenceScorer,
-)
-
-from judgeval.scorers.judgeval_scorers.classifiers import Text2SQLScorer
-
-
-class ScorerWrapper:
-    """
-    Wrapper class that can dynamically load either API or local implementation of a scorer.
-    """
-    def __init__(self, api_implementation: Type, local_implementation: Optional[Type] = None):
-        self.api_implementation = api_implementation
-        self.local_implementation = local_implementation
-        self._instance = None
-        self._init_args = None
-        self._init_kwargs = None
-
-    def __call__(self, *args, **kwargs):
-        """Store initialization arguments for later use when implementation is loaded"""
-        self._init_args = args
-        self._init_kwargs = kwargs
-        return self
-
-    def load_implementation(self, use_judgment: bool = True) -> Any:
-        """
-        Load the appropriate implementation based on the use_judgment flag.
-
-        Args:
-            use_judgment (bool): If True, use API implementation. If False, use local implementation.
-
-        Returns:
-            Instance of the appropriate implementation
-
-        Raises:
-            ValueError: If local implementation is requested but not available
-        """
-        if self._instance is not None:
-            return self._instance
-
-        if use_judgment:
-            implementation = self.api_implementation
-        else:
-            if self.local_implementation is None:
-                raise ValueError("No local implementation available for this scorer")
-            implementation = self.local_implementation
-
-        args = self._init_args or ()
-        kwargs = self._init_kwargs or {}
-        self._instance = implementation(*args, **kwargs)
-        return self._instance
-
-    def __getattr__(self, name):
-        """Defer all attribute access to the loaded implementation"""
-        if self._instance is None:
-            raise RuntimeError("Implementation not loaded. Call load_implementation() first")
-        return getattr(self._instance, name)
-
-# Create wrapped versions of all scorers
-
-AnswerCorrectnessScorer = ScorerWrapper(
-    api_implementation=APIAnswerCorrectnessScorer,
-    local_implementation=LocalAnswerCorrectnessScorer
-)
-
-AnswerRelevancyScorer = ScorerWrapper(
-    api_implementation=APIAnswerRelevancyScorer,
-    local_implementation=LocalAnswerRelevancyScorer
-)
-
-ExecutionOrderScorer = ScorerWrapper(
-    api_implementation=APIExecutionOrderScorer,
-    local_implementation=LocalExecutionOrderScorer
-)
-
-JSONCorrectnessScorer = ScorerWrapper(
-    api_implementation=APIJSONCorrectnessScorer,
-    local_implementation=LocalJsonCorrectnessScorer
-)
-
-SummarizationScorer = ScorerWrapper(
-    api_implementation=APISummarizationScorer,
-    local_implementation=LocalSummarizationScorer
-)
-
-HallucinationScorer = ScorerWrapper(
-    api_implementation=APIHallucinationScorer,
-    local_implementation=LocalHallucinationScorer
-)
-
-FaithfulnessScorer = ScorerWrapper(
-    api_implementation=APIFaithfulnessScorer,
-    local_implementation=LocalFaithfulnessScorer
-)
-
-ContextualRelevancyScorer = ScorerWrapper(
-    api_implementation=APIContextualRelevancyScorer,
-    local_implementation=LocalContextualRelevancyScorer
-)
-
-ContextualPrecisionScorer = ScorerWrapper(
-    api_implementation=APIContextualPrecisionScorer,
-    local_implementation=LocalContextualPrecisionScorer
-)
-
-ContextualRecallScorer = ScorerWrapper(
-    api_implementation=APIContextualRecallScorer,
-    local_implementation=LocalContextualRecallScorer
-)
-
-InstructionAdherenceScorer = ScorerWrapper(
-    api_implementation=APIInstructionAdherenceScorer,
-    local_implementation=LocalInstructionAdherenceScorer
-)
-
-def ComparisonScorer(threshold: float, criteria: str, description: str):
-    return ScorerWrapper(
-        api_implementation=APIComparisonScorer,
-        local_implementation=LocalComparisonScorer
-    )(threshold=threshold, criteria=criteria, description=description)
-
-GroundednessScorer = ScorerWrapper(
-    api_implementation=APIGroundednessScorer,
-)
-
-DerailmentScorer = ScorerWrapper(
-    api_implementation=APIDerailmentScorer,
-    local_implementation=LocalInstructionAdherenceScorer # TODO: add local implementation
-)
-
-__all__ = [
-    "ExecutionOrderScorer",
-    "JSONCorrectnessScorer",
-    "SummarizationScorer",
-    "HallucinationScorer",
-    "FaithfulnessScorer",
-    "ContextualRelevancyScorer",
-    "ContextualPrecisionScorer",
-    "ContextualRecallScorer",
-    "AnswerRelevancyScorer",
-    "Text2SQLScorer",
-    "ComparisonScorer",
-    "GroundednessScorer",
-    "DerailmentScorer",
-]
```
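This removal empties the `ScorerWrapper` indirection entirely (the 0.0.34 RECORD below lists this `__init__.py` at zero bytes). Before the change, importing a scorer name gave you a lazy wrapper that only chose between the hosted and local implementations when `load_implementation()` ran. A minimal sketch of that pre-0.0.34 usage, reconstructed from the deleted code above (illustrative only, not the 0.0.34 API):

```python
# Pre-0.0.34 pattern, reconstructed from the deleted file above.
from judgeval.scorers.judgeval_scorers import FaithfulnessScorer  # a ScorerWrapper instance

wrapper = FaithfulnessScorer(threshold=0.7)               # __call__ only records the init args
scorer = wrapper.load_implementation(use_judgment=False)  # builds LocalFaithfulnessScorer(threshold=0.7)
# Subsequent attribute access on the wrapper proxies to the cached instance via __getattr__.
```

Note that each wrapper was a module-level singleton caching `_instance`, so two configurations of the same scorer in one process would silently share state; that is plausibly part of why the indirection was dropped in favor of importing scorer classes directly.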
judgeval/version_check.py ADDED

```diff
@@ -0,0 +1,22 @@
+import importlib.metadata
+import requests
+import threading
+
+def check_latest_version(package_name: str = "judgeval"):
+    def _check():
+        try:
+            current_version = importlib.metadata.version(package_name)
+            response = requests.get(f"https://pypi.org/pypi/{package_name}/json", timeout=2)
+            latest_version = response.json()["info"]["version"]
+
+            if current_version != latest_version:
+                print(
+                    f"\033[93mUPDATE AVAILABLE:\033[0m You are using '{package_name}=={current_version}', "
+                    f"but the latest version is '{latest_version}'. While this version is still supported, "
+                    f"we recommend upgrading to avoid potential issues or missing features: "
+                    f"`pip install --upgrade {package_name}`"
+                )
+        except Exception:
+            pass
+
+    threading.Thread(target=_check, daemon=True).start()
```
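The new module runs its PyPI lookup on a daemon thread with a 2-second timeout and swallows all failures, so the check can never block or crash the importing process, and offline installs import cleanly. Judging by the accompanying `judgeval/__init__.py` change (+3 -1, not shown in this section), the check is presumably fired at import time; a sketch of that wiring, under that assumption:

```python
# Hypothetical wiring, assuming the +3/-1 change to judgeval/__init__.py
# simply invokes the new module when the package is imported.
from judgeval.version_check import check_latest_version

check_latest_version()  # returns immediately; a daemon thread performs the HTTP request
```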
{judgeval-0.0.31.dist-info → judgeval-0.0.34.dist-info}/METADATA

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.0.31
+Version: 0.0.34
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
```
```diff
@@ -12,12 +12,13 @@ Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.11
 Requires-Dist: anthropic
 Requires-Dist: fastapi
+Requires-Dist: google-genai
 Requires-Dist: langchain
 Requires-Dist: langchain-anthropic
 Requires-Dist: langchain-core
 Requires-Dist: langchain-huggingface
 Requires-Dist: langchain-openai
-Requires-Dist: litellm
+Requires-Dist: litellm==1.38.12
 Requires-Dist: nest-asyncio
 Requires-Dist: openai
 Requires-Dist: openpyxl
```
```diff
@@ -94,9 +95,21 @@ Create a file named `traces.py` with the following code:
 from judgeval.common.tracer import Tracer, wrap
 from openai import OpenAI
 
+# Basic initialization
 client = wrap(OpenAI())
 judgment = Tracer(project_name="my_project")
 
+# Or with S3 storage enabled
+# NOTE: Make sure AWS creds correspond to an account with write access to the specified S3 bucket
+judgment = Tracer(
+    project_name="my_project",
+    use_s3=True,
+    s3_bucket_name="my-traces-bucket",  # Bucket created automatically if it doesn't exist
+    s3_aws_access_key_id="your-access-key",  # Optional: defaults to AWS_ACCESS_KEY_ID env var
+    s3_aws_secret_access_key="your-secret-key",  # Optional: defaults to AWS_SECRET_ACCESS_KEY env var
+    s3_region_name="us-west-1"  # Optional: defaults to AWS_REGION env var or "us-west-1"
+)
+
 @judgment.observe(span_type="tool")
 def my_tool():
     return "Hello world!"
```
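The README change documents S3 trace export, backed by the new `judgeval/common/s3_storage.py` (+93 lines, whose body is not shown in this diff). A hypothetical sketch of what such a helper plausibly does with boto3, given the documented options; the class and method names here are guesses, and only the boto3 calls are real APIs:

```python
# Hypothetical sketch of an S3 trace store; see the caveats in the lead-in.
import json

import boto3
from botocore.exceptions import ClientError


class S3Storage:  # name is a guess; the real module is not shown in this diff
    def __init__(self, bucket_name: str, aws_access_key_id: str | None = None,
                 aws_secret_access_key: str | None = None, region_name: str = "us-west-1"):
        self.bucket_name = bucket_name
        # When the key arguments are None, boto3 falls back to the environment
        # variables / credential chain, matching the "Optional: defaults to ..."
        # comments in the README example above.
        self.s3 = boto3.client(
            "s3",
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            region_name=region_name,
        )

    def _ensure_bucket(self) -> None:
        # "Bucket created automatically if it doesn't exist" per the README.
        try:
            self.s3.head_bucket(Bucket=self.bucket_name)
        except ClientError:
            self.s3.create_bucket(
                Bucket=self.bucket_name,
                # us-east-1 is the one region where this configuration must be omitted.
                CreateBucketConfiguration={"LocationConstraint": self.s3.meta.region_name},
            )

    def save_trace(self, trace_data: dict, trace_id: str) -> str:
        self._ensure_bucket()
        key = f"traces/{trace_id}.json"
        self.s3.put_object(Bucket=self.bucket_name, Key=key,
                           Body=json.dumps(trace_data).encode("utf-8"))
        return key
```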
judgeval-0.0.34.dist-info/RECORD ADDED

```diff
@@ -0,0 +1,63 @@
+judgeval/__init__.py,sha256=x9HWt4waJwJMAqTuJSg2MezF9Zg-macEjeU-ajbly-8,330
+judgeval/clients.py,sha256=6VQmEqmfCngUdS2MuPBIpHvtDFqOENm8-_BmMvjLyRQ,944
+judgeval/constants.py,sha256=_XmVAkebMyGrDvvanAVlMgVd4p6MLHdEVsTQFI0kz1k,5411
+judgeval/evaluation_run.py,sha256=WGzx-Ug2qhSmunFo8NrmSstBRsOUc5KpKq0Lc51rqsM,6739
+judgeval/judgment_client.py,sha256=brRYmphZR-2IUre9kdOhfse1mYDilcIqUzzH21ROAdk,22208
+judgeval/rules.py,sha256=jkh1cXXcUf8oRY7xJUZfcQBYWn_rjUW4GvrhRt15PeU,20265
+judgeval/run_evaluation.py,sha256=elMpFHahyeukKKa09fmJM3c_afwJ00mbZRqm18l5f00,28481
+judgeval/version_check.py,sha256=bvJEidB7rAeXozoUbN9Yb97QOR_s2hgvpvj74jJ5HlY,943
+judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
+judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
+judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
+judgeval/common/s3_storage.py,sha256=W8wq9S7qJZdqdBR4sk3aEZ4K3-pz40DOoolOJrWs9Vo,3768
+judgeval/common/tracer.py,sha256=YsObK8VQXp1DDbU9xncU8NjuY-JUI54BqmG4olezrZc,92507
+judgeval/common/utils.py,sha256=LUQV5JfDr6wj7xHAJoNq-gofNZ6mjXbeKrGKzBME1KM,33533
+judgeval/data/__init__.py,sha256=xuKx_KCVHGp6CXvQuVmKl3v7pJp-qDaz0NccKxwjtO0,481
+judgeval/data/custom_example.py,sha256=QRBqiRiZS8UgVeTRHY0r1Jzm6yAYsyg6zmHxQGxdiQs,739
+judgeval/data/example.py,sha256=cJrmPGLel_P2sy1UaRvuVSAi35EnA9XMR11Lhp4aDLo,5930
+judgeval/data/result.py,sha256=Gb9tiSDsk1amXgh0cFG6JmlW_BMKxS2kuTwNA0rrHjA,3184
+judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,3273
+judgeval/data/sequence.py,sha256=FmKVdzQP5VTujRCHDWk097MKRR-rJgbsdrxyCKee6tA,1994
+judgeval/data/sequence_run.py,sha256=RmYjfWKMWg-pcF5PLeiWfrhuDkjDZi5VEmAIEXN3Ib0,2104
+judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
+judgeval/data/datasets/dataset.py,sha256=dhLo30hvpmmOK2R6O5wDs_neawUJ4lS8bb4S42SufNQ,13034
+judgeval/data/datasets/eval_dataset_client.py,sha256=xjj66BO9Es9IxXqzQe1RT_e0kpeKlt7OrhRoSuj4KHM,15085
+judgeval/integrations/langgraph.py,sha256=J-cQfFP52TjJewdSTe-fcsUC4HDvjNbXoxmbmF0SgiE,11743
+judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
+judgeval/judges/base_judge.py,sha256=ch_S7uBB7lyv44Lf1d7mIGFpveOO58zOkkpImKgd9_4,994
+judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6KTg,2424
+judgeval/judges/mixture_of_judges.py,sha256=IJoi4Twk8ze1CJWVEp69k6TSqTCTGrmVYQ0qdffer60,15549
+judgeval/judges/together_judge.py,sha256=l00hhPerAZXg3oYBd8cyMtWsOTNt_0FIqoxhKJKQe3k,2302
+judgeval/judges/utils.py,sha256=9lvUxziGV86ISvVFxYBWc09TWFyAQgUTyPf_a9mD5Rs,2686
+judgeval/scorers/__init__.py,sha256=Mk-mWUt_gNpJqY_WIEuQynD6fxc34fWSRSuobMSrj94,1238
+judgeval/scorers/api_scorer.py,sha256=NQ_CrrUPhSUk1k2Q8rKpCG_TU2FT32sFEqvb-Yi54B0,2688
+judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
+judgeval/scorers/judgeval_scorer.py,sha256=79-JJurqHP-qTaWNWInx4SjvQYwXc9lvfPPNgwsh2yA,6773
+judgeval/scorers/prompt_scorer.py,sha256=PaAs2qRolw1P3_I061Xvk9qzvF4O-JR8g_39RqXnHcM,17728
+judgeval/scorers/score.py,sha256=r9QiT4-LIvivcJ6XxByrbswKSO8eQTtAD1UlXT_lcmo,18741
+judgeval/scorers/utils.py,sha256=iHQVTlIANbmCTXz9kTeSdOytgUZ_T74Re61ajqsk_WQ,6827
+judgeval/scorers/judgeval_scorers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=_sDUBxSG536KGqXNi6dFpaYKghjEAadxBxaaxV9HuuE,1764
+judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=Fnd9CVIOZ73sWEWymsU5eBrrZqPFjMZ0BKpeW-PDyTg,711
+judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=oETeN9K0HSIRdL2SDqn82Vskpwh5SlKnZvs5VDm2OBU,658
+judgeval/scorers/judgeval_scorers/api_scorers/comparison.py,sha256=kuzf9OWvpY38yYSwlBgneLkUZwJNM4FQqvbS66keA90,1249
+judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py,sha256=tpSuzFAaW8X9xqA0aLLKwh7qmBK0Pc_bJZMIe_q412U,770
+judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py,sha256=pFVhk4pLtQ-FnNlbI-dFF-SIh69Jza7erHqiPkFWoBo,758
+judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py,sha256=RQ6DZwEhChfecd89Ey-T7ke--7qTaXZlRsNxwH8gaME,823
+judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py,sha256=V9WPuwNMm097V7IknKs8UkmAk0yjnBXTcJha_BHXxTA,475
+judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py,sha256=Pb3CiNF2Ca826B92wJCVAi_68lJjLhqqCKwQKaflSUg,1294
+judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py,sha256=-BwOapqjryYNKNydtdkUiKIij76dY0O1jBmdc6dKazQ,692
+judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py,sha256=ntEEeTANEOsGlcbiTAF_3r6BeSJEaVDns8po8T0L6Vg,692
+judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py,sha256=k5gDOki-8KXrZXydvdSqDt3NZqQ28hXoOCHQf6jNxr4,686
+judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py,sha256=XnSGEkQfwVqaqnHEGMCsxNiHVzrsrej48uDbLoWc8CQ,678
+judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py,sha256=mMKEuR87_yanEuZJ5YSGFMHDD_oLVZ6-rQuciFaDOMA,1095
+judgeval/scorers/judgeval_scorers/api_scorers/summarization.py,sha256=QmWB8bVbDYHY5FcF0rYZE_3c2XXgMLRmR6aXJWfdMC4,655
+judgeval/scorers/judgeval_scorers/classifiers/__init__.py,sha256=Qt81W5ZCwMvBAne0LfQDb8xvg5iOG1vEYP7WizgwAZo,67
+judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py,sha256=8iTzMvou1Dr8pybul6lZHKjc9Ye2-0_racRGYkhEdTY,74
+judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py,sha256=ly72Z7s_c8NID6-nQnuW8qEGEW2MqdvpJ-5WfXzbAQg,2579
+judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
+judgeval/utils/alerts.py,sha256=O19Xj7DA0YVjl8PWiuH4zfdZeu3yiLVvHfY8ah2wG0g,2759
+judgeval-0.0.34.dist-info/METADATA,sha256=VikpIwOQ2vV8jeUK1J9s_VliNnOnekQvEmPyRp9AYsc,6097
+judgeval-0.0.34.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.0.34.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.0.34.dist-info/RECORD,,
```
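Each RECORD row is `path,sha256=<hash>,<size-in-bytes>`, where the hash is the urlsafe-base64 SHA-256 digest with `=` padding stripped (per the wheel spec); the empty trailing fields on the final line are deliberate, since RECORD cannot contain its own hash. A small sketch for checking an installed file against its row:

```python
# Recompute a RECORD-style hash for one installed file.
import base64
import hashlib

def record_hash(path: str) -> str:
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# For the 943-byte judgeval/version_check.py listed above, this should return
# "sha256=bvJEidB7rAeXozoUbN9Yb97QOR_s2hgvpvj74jJ5HlY".
```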
judgeval/scorers/base_scorer.py DELETED

```diff
@@ -1,58 +0,0 @@
-"""
-Judgment Scorer class.
-
-Scores `Example`s using ready-made Judgment evaluators.
-"""
-
-from pydantic import BaseModel, field_validator
-from judgeval.common.logger import debug, info, warning, error
-
-from judgeval.constants import APIScorer, UNBOUNDED_SCORERS
-
-
-class APIJudgmentScorer(BaseModel):
-    """
-    Class for ready-made, "out-of-the-box" scorer that uses Judgment evaluators to score `Example`s.
-
-    Args:
-        score_type (APIScorer): The Judgment metric to use for scoring `Example`s
-        threshold (float): A value between 0 and 1 that determines the scoring threshold
-    """
-    score_type: APIScorer
-    threshold: float
-
-    @field_validator('threshold')
-    def validate_threshold(cls, v, info):
-        """
-        Validates that the threshold is between 0 and 1 inclusive.
-        """
-        score_type = info.data.get('score_type')
-        if score_type in UNBOUNDED_SCORERS:
-            if v < 0:
-                error(f"Threshold for {score_type} must be greater than 0, got: {v}")
-                raise ValueError(f"Threshold for {score_type} must be greater than 0, got: {v}")
-        else:
-            if not 0 <= v <= 1:
-                error(f"Threshold for {score_type} must be between 0 and 1, got: {v}")
-                raise ValueError(f"Threshold for {score_type} must be between 0 and 1, got: {v}")
-        return v
-
-    @field_validator('score_type')
-    def convert_to_enum_value(cls, v):
-        """
-        Validates that the `score_type` is a valid `JudgmentMetric` enum value.
-        Converts string values to `JudgmentMetric` enum values.
-        """
-        debug(f"Attempting to convert score_type value: {v}")
-        if isinstance(v, APIScorer):
-            info(f"Using existing JudgmentMetric: {v.value}")
-            return v.value
-        elif isinstance(v, str):
-            debug(f"Converting string value to JudgmentMetric enum: {v}")
-            return APIScorer[v.upper()].value
-        error(f"Invalid score_type value: {v}")
-        raise ValueError(f"Invalid value for score_type: {v}")
-
-    def __str__(self):
-        return f"JudgmentScorer(score_type={self.score_type.value}, threshold={self.threshold})"
-
```
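Note a quirk in the deleted validator: for scorers in `UNBOUNDED_SCORERS` the error message says the threshold "must be greater than 0", while the check actually rejects only negative values. The 0.0.34 RECORD above still lists `judgeval/scorers/api_scorer.py`, which presumably carries the equivalent class now. A short illustration of the removed validation behavior, using names from the deleted file (the string-to-enum coercion is the `APIScorer[v.upper()]` lookup shown above):

```python
# Pre-0.0.34 behavior, reconstructed from the deleted base_scorer.py.
from judgeval.scorers.base_scorer import APIJudgmentScorer  # old import path

APIJudgmentScorer(score_type="faithfulness", threshold=0.5)  # ok; string coerced to the APIScorer value
APIJudgmentScorer(score_type="faithfulness", threshold=1.5)  # ValueError: must be between 0 and 1
# Scorers listed in UNBOUNDED_SCORERS skip the upper bound and only reject negative thresholds.
```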
judgeval/scorers/judgeval_scorers/local_implementations/__init__.py DELETED

```diff
@@ -1,27 +0,0 @@
-from judgeval.scorers.judgeval_scorers.local_implementations.answer_relevancy.answer_relevancy_scorer import AnswerRelevancyScorer
-from judgeval.scorers.judgeval_scorers.local_implementations.contextual_precision.contextual_precision_scorer import ContextualPrecisionScorer
-from judgeval.scorers.judgeval_scorers.local_implementations.contextual_recall.contextual_recall_scorer import ContextualRecallScorer
-from judgeval.scorers.judgeval_scorers.local_implementations.contextual_relevancy.contextual_relevancy_scorer import ContextualRelevancyScorer
-from judgeval.scorers.judgeval_scorers.local_implementations.faithfulness.faithfulness_scorer import FaithfulnessScorer
-from judgeval.scorers.judgeval_scorers.local_implementations.json_correctness.json_correctness_scorer import JsonCorrectnessScorer
-from judgeval.scorers.judgeval_scorers.local_implementations.execution_order.execution_order import ExecutionOrderScorer
-from judgeval.scorers.judgeval_scorers.local_implementations.hallucination.hallucination_scorer import HallucinationScorer
-from judgeval.scorers.judgeval_scorers.local_implementations.summarization.summarization_scorer import SummarizationScorer
-from judgeval.scorers.judgeval_scorers.local_implementations.answer_correctness.answer_correctness_scorer import AnswerCorrectnessScorer
-from judgeval.scorers.judgeval_scorers.local_implementations.comparison.comparison_scorer import ComparisonScorer
-from judgeval.scorers.judgeval_scorers.local_implementations.instruction_adherence.instruction_adherence import InstructionAdherenceScorer
-
-__all__ = [
-    "AnswerCorrectnessScorer",
-    "AnswerRelevancyScorer",
-    "ComparisonScorer",
-    "ContextualPrecisionScorer",
-    "ContextualRecallScorer",
-    "ContextualRelevancyScorer",
-    "FaithfulnessScorer",
-    "JsonCorrectnessScorer",
-    "ExecutionOrderScorer",
-    "HallucinationScorer",
-    "SummarizationScorer",
-    "InstructionAdherenceScorer",
-]
```
judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py DELETED

```diff
@@ -1,276 +0,0 @@
-from typing import Optional, List, Union, Tuple
-
-from judgeval.constants import APIScorer
-from judgeval.judges import JudgevalJudge
-from judgeval.judges.utils import create_judge
-from judgeval.data import Example, ExampleParams
-from judgeval.scorers import JudgevalScorer
-from judgeval.scorers.utils import (
-    get_or_create_event_loop,
-    parse_response_json,
-    scorer_progress_meter,
-    create_verbose_logs,
-    check_example_params,
-)
-from .prompts import (
-    ACVerdict,
-    AnswerCorrectnessTemplate,
-    Statements,
-    Verdicts,
-    Reason,
-)
-
-
-required_params = [
-    ExampleParams.INPUT,
-    ExampleParams.ACTUAL_OUTPUT,
-    ExampleParams.EXPECTED_OUTPUT,
-]
-
-
-class AnswerCorrectnessScorer(JudgevalScorer):
-    def __init__(
-        self,
-        threshold: float = 0.5,
-        model: Optional[Union[str, JudgevalJudge]] = None,
-        include_reason: bool = True,
-        async_mode: bool = True,
-        strict_mode: bool = False,
-        verbose_mode: bool = False
-    ):
-        super().__init__(
-            score_type=APIScorer.ANSWER_CORRECTNESS,
-            threshold=1 if strict_mode else threshold,
-            evaluation_model=None,
-            include_reason=include_reason,
-            async_mode=async_mode,
-            strict_mode=strict_mode,
-            verbose_mode=verbose_mode
-        )
-        self.model, self.using_native_model = create_judge(model)
-        self.evaluation_model = self.model.get_model_name()
-
-    async def _a_get_statements(self, expected_output: str) -> List[str]:
-        prompt = AnswerCorrectnessTemplate.deduce_statements(
-            expected_output=expected_output,
-        )
-        if self.using_native_model:
-            res = await self.model.a_generate(prompt)
-            data = parse_response_json(res, self)
-            return data["statements"]
-        else:
-            try:
-                res: Statements = await self.model.a_generate(
-                    prompt, schema=Statements
-                )
-                return res.statements
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = parse_response_json(res, self)
-                return data["statements"]
-
-    def _get_statements(self, expected_output: str) -> List[str]:
-        prompt = AnswerCorrectnessTemplate.deduce_statements(
-            expected_output=expected_output,
-        )
-        if self.using_native_model:
-            res = self.model.generate(prompt)
-            data = parse_response_json(res, self)
-            return data["statements"]
-        else:
-            try:
-                res: Statements = self.model.generate(
-                    prompt, schema=Statements
-                )
-                return res.statements
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = parse_response_json(res, self)
-                return data["statements"]
-
-    async def _a_get_verdicts(self, actual_output: str) -> List[ACVerdict]:
-        if len(self.statements) == 0:
-            return []
-
-        prompt = AnswerCorrectnessTemplate.generate_verdicts(
-            actual_output=actual_output,
-            statements=self.statements,
-        )
-
-        if self.using_native_model:
-            res = await self.model.a_generate(prompt)
-            data = parse_response_json(res, self)
-            return [ACVerdict(**item) for item in data["verdicts"]]
-        else:
-            try:
-                res: Verdicts = await self.model.a_generate(prompt, schema=Verdicts)
-                return [item for item in res.verdicts]
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = parse_response_json(res, self)
-                return [ACVerdict(**item) for item in data["verdicts"]]
-
-    def _get_verdicts(self, actual_output: str) -> List[ACVerdict]:
-        if len(self.statements) == 0:
-            return []
-
-        prompt = AnswerCorrectnessTemplate.generate_verdicts(
-            actual_output=actual_output,
-            statements=self.statements,
-        )
-
-        if self.using_native_model:
-            res = self.model.generate(prompt)
-            data = parse_response_json(res, self)
-            return [ACVerdict(**item) for item in data["verdicts"]]
-        else:
-            try:
-                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
-                return [item for item in res.verdicts]
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = parse_response_json(res, self)
-                return [ACVerdict(**item) for item in data["verdicts"]]
-
-    async def _a_get_reason(self) -> str:
-        if self.include_reason is False:
-            return None
-
-        incorrect_statements: List[Tuple[str, str]] = []
-        for idx, verdict in enumerate(self.verdicts):
-            if verdict.verdict.strip().lower() == "no":
-                incorrect_statements.append((self.statements[idx], verdict.reason))
-
-        prompt = AnswerCorrectnessTemplate.generate_reason(
-            incorrect_statements=incorrect_statements,
-            score=format(self.score, ".2f"),
-        )
-        if self.using_native_model:
-            res = await self.model.a_generate(prompt)
-            data = parse_response_json(res, self)
-            return data["reason"]
-        else:
-            try:
-                res: Reason = await self.model.a_generate(
-                    prompt=prompt, schema=Reason
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = parse_response_json(res, self)
-                return data["reason"]
-
-    def _get_reason(self) -> str:
-        if self.include_reason is False:
-            return None
-
-        incorrect_statements: List[Tuple[str, str]] = []
-        for idx, verdict in enumerate(self.verdicts):
-            if verdict.verdict.strip().lower() == "no":
-                incorrect_statements.append((self.statements[idx], verdict.reason))
-
-        prompt = AnswerCorrectnessTemplate.generate_reason(
-            incorrect_statements=incorrect_statements,
-            score=format(self.score, ".2f"),
-        )
-        if self.using_native_model:
-            res = self.model.generate(prompt)
-            data = parse_response_json(res, self)
-            return data["reason"]
-        else:
-            try:
-                res: Reason = self.model.generate(
-                    prompt=prompt, schema=Reason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = parse_response_json(res, self)
-                return data["reason"]
-
-    def _compute_score(self) -> float:
-        number_of_verdicts = len(self.verdicts)
-        if number_of_verdicts == 0:
-            return 1
-
-        correct_count = 0
-        for verdict in self.verdicts:
-            if verdict.verdict.strip().lower() == "yes":
-                correct_count += 1
-
-        score = correct_count / number_of_verdicts
-        return 0 if self.strict_mode and score < self.threshold else score
-
-    def score_example(
-        self,
-        example: Example,
-        _show_indicator: bool = True,
-    ) -> float:
-        check_example_params(example, required_params, self)
-
-        with scorer_progress_meter(self, display_meter=_show_indicator):
-            try:
-                if self.async_mode:
-                    loop = get_or_create_event_loop()
-                    loop.run_until_complete(
-                        self.a_score_example(example, _show_indicator=False)
-                    )
-                else:
-                    self.statements = self._get_statements(example.expected_output)
-                    self.verdicts = self._get_verdicts(example.actual_output)
-                    self.score = self._compute_score()
-                    self.reason = self._get_reason()
-                    self.success = self.score >= self.threshold
-                    self.verbose_logs = create_verbose_logs(
-                        self,
-                        steps=[
-                            f"Statements:\n{self.statements}",
-                            f"Verdicts:\n{[v.model_dump() for v in self.verdicts]}",
-                            f"Score: {self.score}\nReason: {self.reason}",
-                        ],
-                    )
-                return self.score
-            except Exception as e:
-                print(f"Error in score_example for AnswerCorrectnessScorer: {e}")
-                raise
-
-    async def a_score_example(
-        self,
-        example: Example,
-        _show_indicator: bool = True,
-    ) -> float:
-        check_example_params(example, required_params, self)
-
-        with scorer_progress_meter(self, async_mode=True, display_meter=_show_indicator):
-            try:
-                self.statements: List[str] = await self._a_get_statements(example.expected_output)
-                self.verdicts: List[ACVerdict] = await self._a_get_verdicts(example.actual_output)
-                self.score = self._compute_score()
-                self.reason = await self._a_get_reason()
-                self.success = self.score >= self.threshold
-                self.verbose_logs = create_verbose_logs(
-                    self,
-                    steps=[
-                        f"Statements:\n{self.statements}",
-                        f"Verdicts:\n{[v.model_dump() for v in self.verdicts]}",
-                        f"Score: {self.score}\nReason: {self.reason}",
-                    ],
-                )
-                return self.score
-            except Exception as e:
-                print(f"Error in a_score_example for AnswerCorrectnessScorer: {e}")
-                raise
-
-    def _success_check(self) -> bool:
-        if self.error is not None:
-            self.success = False
-        else:
-            try:
-                self.success = self.score >= self.threshold
-            except:
-                self.success = False
-        return self.success
-
-    @property
-    def __name__(self):
-        return "Answer Correctness"
```
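For all the prompt plumbing in this deleted local scorer, the score itself is simple: `_compute_score` returns the fraction of statements (derived from the expected output) that the judge marks "yes" against the actual output, defaulting to 1 when no statements were extracted, and snapping to 0 in strict mode when the score falls below the threshold. A worked example of that arithmetic with stand-in verdicts:

```python
# Stand-in verdicts illustrating _compute_score's arithmetic.
verdicts = ["yes", "yes", "no", "yes"]  # judge's per-statement calls
score = sum(v == "yes" for v in verdicts) / len(verdicts)
print(score)  # 0.75 -> passes the default 0.5 threshold
# With strict_mode=True the threshold is forced to 1, so 0.75 would be snapped to 0.
```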