judgeval 0.0.52__py3-none-any.whl → 0.0.54__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
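For readers who want to reproduce a diff like this locally, here is a minimal sketch that compares the text files inside the two wheels using only the Python standard library. The wheel filenames are assumptions about what you downloaded (e.g. via `pip download judgeval==0.0.52`), and the output is a plain unified diff rather than this page's rendered view.

import difflib
import zipfile

OLD = "judgeval-0.0.52-py3-none-any.whl"  # assumed local paths
NEW = "judgeval-0.0.54-py3-none-any.whl"

def read_all(path: str) -> dict:
    """Map each archive member to its decoded lines (wheels are zip files)."""
    out = {}
    with zipfile.ZipFile(path) as zf:
        for name in zf.namelist():
            try:
                out[name] = zf.read(name).decode("utf-8").splitlines(keepends=True)
            except UnicodeDecodeError:
                pass  # skip any non-text member
    return out

old, new = read_all(OLD), read_all(NEW)
for name in sorted(old.keys() | new.keys()):
    diff = difflib.unified_diff(
        old.get(name, []), new.get(name, []),
        fromfile=f"0.0.52/{name}", tofile=f"0.0.54/{name}",
    )
    print("".join(diff), end="")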
- judgeval/common/logger.py +46 -199
- judgeval/common/s3_storage.py +2 -6
- judgeval/common/tracer.py +182 -262
- judgeval/common/utils.py +16 -36
- judgeval/constants.py +14 -20
- judgeval/data/__init__.py +0 -2
- judgeval/data/datasets/dataset.py +6 -10
- judgeval/data/datasets/eval_dataset_client.py +25 -27
- judgeval/data/example.py +5 -138
- judgeval/data/judgment_types.py +214 -0
- judgeval/data/result.py +7 -25
- judgeval/data/scorer_data.py +28 -40
- judgeval/data/scripts/fix_default_factory.py +23 -0
- judgeval/data/scripts/openapi_transform.py +123 -0
- judgeval/data/tool.py +3 -54
- judgeval/data/trace.py +31 -50
- judgeval/data/trace_run.py +3 -3
- judgeval/evaluation_run.py +16 -23
- judgeval/integrations/langgraph.py +11 -12
- judgeval/judges/litellm_judge.py +3 -6
- judgeval/judges/mixture_of_judges.py +8 -25
- judgeval/judges/together_judge.py +3 -6
- judgeval/judgment_client.py +22 -24
- judgeval/rules.py +7 -19
- judgeval/run_evaluation.py +79 -242
- judgeval/scorers/__init__.py +4 -20
- judgeval/scorers/agent_scorer.py +21 -0
- judgeval/scorers/api_scorer.py +28 -38
- judgeval/scorers/base_scorer.py +98 -0
- judgeval/scorers/example_scorer.py +19 -0
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +10 -17
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +9 -24
- judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +16 -68
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +4 -12
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +10 -17
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +18 -14
- judgeval/scorers/score.py +45 -330
- judgeval/scorers/utils.py +6 -88
- judgeval/utils/file_utils.py +4 -6
- judgeval/version_check.py +3 -2
- {judgeval-0.0.52.dist-info → judgeval-0.0.54.dist-info}/METADATA +6 -5
- judgeval-0.0.54.dist-info/RECORD +65 -0
- judgeval/data/custom_example.py +0 -19
- judgeval/scorers/judgeval_scorer.py +0 -177
- judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +0 -45
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -29
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -29
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -32
- judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +0 -28
- judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -38
- judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -27
- judgeval/scorers/prompt_scorer.py +0 -296
- judgeval-0.0.52.dist-info/RECORD +0 -69
- {judgeval-0.0.52.dist-info → judgeval-0.0.54.dist-info}/WHEEL +0 -0
- {judgeval-0.0.52.dist-info → judgeval-0.0.54.dist-info}/licenses/LICENSE.md +0 -0
{judgeval-0.0.52.dist-info → judgeval-0.0.54.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.0.52
+Version: 0.0.54
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -12,6 +12,7 @@ Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.11
 Requires-Dist: anthropic
 Requires-Dist: boto3
+Requires-Dist: datamodel-code-generator>=0.31.1
 Requires-Dist: google-genai
 Requires-Dist: langchain-anthropic
 Requires-Dist: langchain-core
@@ -150,10 +151,10 @@ You'll see your trace exported to the Judgment Platform:
 
 | | |
 |:---|:---:|
-| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic)
-| <h3>🧪 Evals</h3>
-| <h3>📡 Monitoring</h3>
-| <h3>📊 Datasets</h3>Export
+| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
+| <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
+| <h3>📡 Monitoring</h3>Get Slack alerts when you agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/error_analysis_dashboard.png" alt="Monitoring Dashboard" width="1200"/></p> |
+| <h3>📊 Datasets</h3>Export traces and test cases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
 
 ## 🏢 Self-Hosting
 
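The new datamodel-code-generator requirement lines up with the added judgeval/data/scripts/openapi_transform.py and the generated judgeval/data/judgment_types.py below. As a hedged sketch of how such a types file could be produced (the spec filename and output path are assumptions; the datamodel-codegen CLI and these flags are its documented interface):

import subprocess

# Generate Pydantic models from an OpenAPI spec; "openapi.json" is an
# assumed spec location, not confirmed by this diff.
subprocess.run(
    [
        "datamodel-codegen",
        "--input", "openapi.json",
        "--input-file-type", "openapi",
        "--output", "judgeval/data/judgment_types.py",
    ],
    check=True,
)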
judgeval-0.0.54.dist-info/RECORD
ADDED
@@ -0,0 +1,65 @@
+judgeval/__init__.py,sha256=HM1M8hmqRum6G554QKkXhB4DF4f5eh_xtYo0Kf-t3kw,332
+judgeval/clients.py,sha256=JnB8n90GyXiYaGmSEYaA67mdJSnr3SIrzArao7NGebw,980
+judgeval/constants.py,sha256=lqPVUR7XAr1zbmByJil3i0eY24ymWGzcgg88Npk-U20,5772
+judgeval/evaluation_run.py,sha256=B5w6UiB2cu8km93p4XT3jtganOtIKAZJI3UKc5Qgrew,2936
+judgeval/judgment_client.py,sha256=QT6jV1moshs_-1xjX8jAhQpr9vjznqqcXuobQ7eDBks,21343
+judgeval/rules.py,sha256=CoQjqmP8daEXewMkplmA-7urubDtweOr5O6z8klVwLI,20031
+judgeval/run_evaluation.py,sha256=WXQi2AIKu_iPSLZWnhgLarVbHE6nzyjHJcbKSHu3zYc,42568
+judgeval/version_check.py,sha256=FoLEtpCjDw2HuDQdpw5yT29UtwumSc6ZZN6AV_c9Mnw,1057
+judgeval/common/__init__.py,sha256=KH-QJyWtQ60R6yFIBDYS3WGRiNpEu1guynpxivZvpBQ,309
+judgeval/common/exceptions.py,sha256=OkgDznu2wpBQZMXiZarLJYNk1HIcC8qYW7VypDC3Ook,556
+judgeval/common/logger.py,sha256=514eFLYWS_UL8VY-zAR2ePUlpQe4rbYlleLASFllLE4,1511
+judgeval/common/s3_storage.py,sha256=UvAKGSa0S1BnNprzDKHMAfyT-8zlMAOM5kCrXcVN0HE,3743
+judgeval/common/tracer.py,sha256=qrvriShLG6INpE58sAhlQ6YZfZa3TtfJfsP-cVDyBe4,126135
+judgeval/common/utils.py,sha256=wkdBg86OHROQBXpIPtMyNku5cGckwPpaiATeuilLNbE,34304
+judgeval/data/__init__.py,sha256=1QagDcSQtfnJ632t9Dnq8d7XjAqhmY4mInOWt8qH9tM,455
+judgeval/data/example.py,sha256=6xtPTwWUsZ0HdErU-g954nCv64fsbnS1I5xuEvs14EA,2027
+judgeval/data/judgment_types.py,sha256=VM941NM7_uqwx6bKABV1cH2cocuYgclfORxCK3sPQZo,9853
+judgeval/data/result.py,sha256=7FFD9kOla6ijvu2-Wx3tFE98Ry7ECeV-f8aiDeHNaHs,2449
+judgeval/data/scorer_data.py,sha256=ty4clGts-Zp6NiU1SZXKbrVsyKvHhD5Tm1kbXx6we1k,2977
+judgeval/data/tool.py,sha256=iWQSdy5uNbIeACu3gQy1DC2oGYxRVYNfkkczWdQMAiA,99
+judgeval/data/trace.py,sha256=szugEHAb2R0YljmBQllQEVE5pOlBUC6eOSzbm_WXf-Y,4830
+judgeval/data/trace_run.py,sha256=kovRZduC0l-9nM5YWM6lKaQNEVy_WtHwt4lvIwPbHvY,1825
+judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
+judgeval/data/datasets/dataset.py,sha256=dDmTYSBRj4YEUhgYOebAcDm4N14nj3tcCqHj9y2Z1z0,12725
+judgeval/data/datasets/eval_dataset_client.py,sha256=0XS8irOA-gI1aEX3hk0LikzLjb6DOLuj18j2w64BoQM,12614
+judgeval/data/scripts/fix_default_factory.py,sha256=lvp2JwYZqz-XpD9LZNa3mANZVP-jJSZoNzolI6JWERM,591
+judgeval/data/scripts/openapi_transform.py,sha256=mT8qrzhvtMrMMC6Q_amSOGt-X-hUbDlT3xvpgEfcuEs,3828
+judgeval/integrations/langgraph.py,sha256=WuaHqer8i2QV_yZWoB18RNDLAYeH_Z_quVERvTOySQU,36151
+judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
+judgeval/judges/base_judge.py,sha256=_dz0qWsKRxzXxpRY9l6mrxTRYPSF2FE4ZXkrzhZ4gbY,986
+judgeval/judges/litellm_judge.py,sha256=LX4_KXb1Jp8IXif3vvOiKfRYH7ZkbQLs9AtWPGmj544,2483
+judgeval/judges/mixture_of_judges.py,sha256=wcHwLi9zU0uwKMqRVhcPdjiYKgWflX4dpUbU2kS9yg0,14825
+judgeval/judges/together_judge.py,sha256=r5k8ZcC6lnsFttGHhrocFtmglx2Cb3G-4ORKAeK-Nmw,2253
+judgeval/judges/utils.py,sha256=0CF9qtIUQUL3-W-qTGpmTjZbkUUBAM6TslDsrCHnTBU,2725
+judgeval/scorers/__init__.py,sha256=7f_zsJV29gO_u4o0n2011SovJ1ZGAI5Zk11WPUBPWcs,858
+judgeval/scorers/agent_scorer.py,sha256=TjwD_YglSywr3EowEojiCyg5qDgCRa5LRGc5nFdmIBc,703
+judgeval/scorers/api_scorer.py,sha256=xlhqkeMUBFxl8daSXOTWOYwZjBAz7o6b4sVD5f8cIHw,2523
+judgeval/scorers/base_scorer.py,sha256=rZfRPolxbsghWS0-FMqXrbJKuLobysMGjAeZkqn0cr0,3581
+judgeval/scorers/example_scorer.py,sha256=2n45y3LMV1Q-ARyXLHqvVWETlnY1DqS7OLzPu9IBGz8,716
+judgeval/scorers/exceptions.py,sha256=ACDHK5-TWiF3NTk-wycaedpbrdobm-CvvC1JA_iP-Mk,179
+judgeval/scorers/score.py,sha256=oQC6LMsalL4XAtXlA3S84MB9YiHjqYIgMhRRi-zaXJ4,6577
+judgeval/scorers/utils.py,sha256=I13XwyBKMUpZK2oacgkwaieUOGlQbKxKKn6SdiA4lmE,4532
+judgeval/scorers/judgeval_scorers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=saQmMol_CMzp1yovjgiF3YYhLTu-4O9xtmhygj1LRh8,1496
+judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=zJsU0VrUmRhY9qav48c6jTyDqUwI3JzhV9ajtlJCe0M,544
+judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=UDfzTO9Fx0FA5o0wfD8kprrGA4eW-43Rn9Gc0BQtKgY,393
+judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py,sha256=rbG80J88cer7yfVRvLpu-x2cdwiTl-ztnF2wgOoIlcE,2624
+judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py,sha256=mbBvirNcivu9dP6deM7FogDXrdwI9o8yqsO8IeKPSb4,309
+judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py,sha256=NABO_iBdkOo3fdPVcoWfUkeN-FTX3t3-bErMjdqBXdk,1361
+judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py,sha256=ps51bTgQsD9xGYsk1v9bx0WxQMqywSllCE9_xlJkLd8,531
+judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py,sha256=SnFLvU4FGsMeUVUp0SGHSy_6wgfwr_vHPGnZx5YJl_Q,691
+judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py,sha256=aQzu-TiGqG74JDQ927evv5yGmnZw2AOolyHvlIhiUbI,683
+judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py,sha256=Mcp1CjMNyOax9UkvoRdSyUYdO2Os1-Nko43y89m2Luo,594
+judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py,sha256=Z2FLGBC7m_CLx-CMgXVuTvYvN0vY5yOcWA0ImBkeBfY,787
+judgeval/scorers/judgeval_scorers/classifiers/__init__.py,sha256=Qt81W5ZCwMvBAne0LfQDb8xvg5iOG1vEYP7WizgwAZo,67
+judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py,sha256=8iTzMvou1Dr8pybul6lZHKjc9Ye2-0_racRGYkhEdTY,74
+judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py,sha256=gloLzThkFsr8sHQargDAH8XaDrlF6OCuc_69hyNslFU,2589
+judgeval/tracer/__init__.py,sha256=wkuXtOGDCrwgPPXlh_sSJmvGuWaAMHyNzk1TzB5f9aI,148
+judgeval/utils/alerts.py,sha256=3w_AjQrgfmOZvfqCridW8WAnHVxHHXokX9jNzVFyGjA,3297
+judgeval/utils/file_utils.py,sha256=wIEn8kjM0WrP216RGU_yhZhFOMWIS5ckigyHbzFSOMk,1774
+judgeval/utils/requests.py,sha256=rbmZTaiyWI8t2YUkhk11SIe3dF7j2j25L1BuFp_1PII,770
+judgeval-0.0.54.dist-info/METADATA,sha256=A3bcjOu-nBCLsP7W1vTDYGThPbrBZr5GJegewy9bEGs,54271
+judgeval-0.0.54.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.0.54.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.0.54.dist-info/RECORD,,
judgeval/data/custom_example.py
DELETED
@@ -1,19 +0,0 @@
-from pydantic import BaseModel, Field
-from typing import Optional, List, Dict, Any
-from uuid import uuid4
-
-
-class CustomExample(BaseModel):
-    input: Optional[Dict[str, Any]] = None
-    actual_output: Optional[Dict[str, Any]] = None
-    expected_output: Optional[Dict[str, Any]] = None
-    context: Optional[List[str]] = None
-    retrieval_context: Optional[List[str]] = None
-    additional_metadata: Optional[Dict[str, Any]] = None
-    tools_called: Optional[List[str]] = None
-    expected_tools: Optional[List[str]] = None
-    name: Optional[str] = None
-    example_id: str = Field(default_factory=lambda: str(uuid4()))
-    example_index: Optional[int] = None
-    timestamp: Optional[str] = None
-    trace_id: Optional[str] = None
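For reference, the removed CustomExample was an ordinary Pydantic model; a minimal usage sketch against the definition above (field values are illustrative, and the class no longer ships in 0.0.54):

ex = CustomExample(
    input={"question": "What is 2 + 2?"},
    actual_output={"answer": "4"},
    name="demo",
)
print(ex.example_id)  # auto-generated UUID string via default_factory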
judgeval/scorers/judgeval_scorer.py
DELETED
@@ -1,177 +0,0 @@
-"""
-Judgeval Scorer class
-
-Enables client to create custom scorers that do not fall under any of the ready-made Judgment scorers.
-To create a custom scorer, extend this class and implement the `score_example`, `a_score_example`, and `success_check` methods.
-"""
-
-from typing import Optional, Dict, Union, List
-from abc import abstractmethod
-
-from judgeval.common.logger import debug, info, warning, error
-from judgeval.judges import JudgevalJudge
-from judgeval.judges.utils import create_judge
-from judgeval.constants import UNBOUNDED_SCORERS
-from judgeval.data.example import ExampleParams
-
-
-class JudgevalScorer:
-    """
-    Base class for scorers in `judgeval`.
-
-    In practice, you should not implement this class unless you are creating a custom scorer.
-    Judgeval offers 10+ default scorers that you can use out of the box.
-
-    If you want to create a scorer that does not fall under any of the ready-made Judgment scorers,
-    you can create a custom scorer by extending this class.
-    """
-
-    score_type: str  # name of your new scorer
-    threshold: float  # The threshold to pass a test while using this scorer as a scorer
-    score: Optional[float] = None  # The float score of the scorer run on the test case
-    score_breakdown: Optional[Dict] = None
-    reason: Optional[str] = (
-        None  # The reason for the score when evaluating the test case
-    )
-    success: Optional[bool] = None  # Whether the test case passed or failed
-    evaluation_model: Optional[str] = None  # The model used to evaluate the test case
-    strict_mode: bool = False  # Whether to run the scorer in strict mode
-    async_mode: bool = True  # Whether to run the scorer in async mode
-    verbose_mode: bool = True  # Whether to run the scorer in verbose mode
-    include_reason: bool = False  # Whether to include the reason in the output
-    custom_example: bool = False  # Whether the scorer corresponds to CustomExamples
-    error: Optional[str] = None  # The error message if the scorer failed
-    evaluation_cost: Optional[float] = None  # The cost of running the scorer
-    verbose_logs: Optional[str] = None  # The verbose logs of the scorer
-    additional_metadata: Optional[Dict] = None  # Additional metadata for the scorer
-    required_params: Optional[List[ExampleParams]] = (
-        None  # The required parameters for the scorer
-    )
-
-    def __init__(
-        self,
-        score_type: str,
-        threshold: float,
-        score: Optional[float] = None,
-        score_breakdown: Optional[Dict] = None,
-        reason: Optional[str] = None,
-        success: Optional[bool] = None,
-        evaluation_model: Optional[str] = None,
-        required_params: Optional[List[ExampleParams]] = None,
-        strict_mode: bool = False,
-        async_mode: bool = True,
-        verbose_mode: bool = True,
-        include_reason: bool = False,
-        custom_example: bool = False,
-        error: Optional[str] = None,
-        evaluation_cost: Optional[float] = None,
-        verbose_logs: Optional[str] = None,
-        additional_metadata: Optional[Dict] = None,
-    ):
-        debug(
-            f"Initializing JudgevalScorer with score_type={score_type}, threshold={threshold}"
-        )
-        if score_type in UNBOUNDED_SCORERS:
-            if threshold < 0:
-                raise ValueError(
-                    f"Threshold for {score_type} must be greater than 0, got: {threshold}"
-                )
-        else:
-            if not 0 <= threshold <= 1:
-                raise ValueError(
-                    f"Threshold for {score_type} must be between 0 and 1, got: {threshold}"
-                )
-        if strict_mode:
-            warning("Strict mode enabled - scoring will be more rigorous")
-        info(f"JudgevalScorer initialized with evaluation_model: {evaluation_model}")
-        self.score_type = score_type
-        self.threshold = threshold
-        self.score = score
-        self.score_breakdown = score_breakdown
-        self.reason = reason
-        self.success = success
-        self.evaluation_model = evaluation_model
-        self.strict_mode = strict_mode
-        self.async_mode = async_mode
-        self.verbose_mode = verbose_mode
-        self.include_reason = include_reason
-        self.custom_example = custom_example
-        self.error = error
-        self.evaluation_cost = evaluation_cost
-        self.verbose_logs = verbose_logs
-        self.additional_metadata = additional_metadata
-        self.required_params = required_params
-
-    def _add_model(self, model: Optional[Union[str, List[str], JudgevalJudge]] = None):
-        """
-        Adds the evaluation model to the JudgevalScorer instance
-
-        This method is used at eval time
-        """
-        self.model, self.using_native_model = create_judge(model)
-        self.evaluation_model = self.model.get_model_name()
-
-    @abstractmethod
-    def score_example(self, example, *args, **kwargs) -> float:
-        """
-        Measures the score on a single example
-        """
-        warning("Attempting to call unimplemented score_example method")
-        error("score_example method not implemented")
-        raise NotImplementedError(
-            "You must implement the `score` method in your custom scorer"
-        )
-
-    @abstractmethod
-    async def a_score_example(self, example, *args, **kwargs) -> float:
-        """
-        Asynchronously measures the score on a single example
-        """
-        warning("Attempting to call unimplemented a_score_example method")
-        error("a_score_example method not implemented")
-        raise NotImplementedError(
-            "You must implement the `a_score` method in your custom scorer"
-        )
-
-    @abstractmethod
-    def _success_check(self) -> bool:
-        """
-        For unit testing, determines whether the test case passes or fails
-        """
-        warning("Attempting to call unimplemented success_check method")
-        error("_success_check method not implemented")
-        raise NotImplementedError(
-            "You must implement the `_success_check` method in your custom scorer"
-        )
-
-    def __str__(self):
-        debug("Converting JudgevalScorer instance to string representation")
-        if self.error:
-            warning(f"JudgevalScorer contains error: {self.error}")
-        info(f"JudgevalScorer status - success: {self.success}, score: {self.score}")
-        attributes = {
-            "score_type": self.score_type,
-            "threshold": self.threshold,
-            "score": self.score,
-            "score_breakdown": self.score_breakdown,
-            "reason": self.reason,
-            "success": self.success,
-            "evaluation_model": self.evaluation_model,
-            "strict_mode": self.strict_mode,
-            "async_mode": self.async_mode,
-            "verbose_mode": self.verbose_mode,
-            "include_reason": self.include_reason,
-            "error": self.error,
-            "evaluation_cost": self.evaluation_cost,
-            "verbose_logs": self.verbose_logs,
-            "additional_metadata": self.additional_metadata,
-        }
-        return f"JudgevalScorer({attributes})"
-
-    def to_dict(self):
-        return {
-            "score_type": str(
-                self.score_type
-            ),  # Convert enum to string for serialization
-            "threshold": self.threshold,
-        }
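The removed class's own docstring says to extend it and implement `score_example`, `a_score_example`, and `_success_check`. A hypothetical subclass against the definition above (the exact-match rule and the "exact_match" score_type are made up for illustration):

class ExactMatchScorer(JudgevalScorer):
    def __init__(self, threshold: float = 1.0):
        # Bounded scorers must have a threshold in [0, 1], per __init__ above
        super().__init__(score_type="exact_match", threshold=threshold)

    def score_example(self, example, *args, **kwargs) -> float:
        # Illustrative rule: 1.0 when outputs match exactly, else 0.0
        self.score = float(example.actual_output == example.expected_output)
        self.success = self.score >= self.threshold
        return self.score

    async def a_score_example(self, example, *args, **kwargs) -> float:
        return self.score_example(example)  # no async work needed here

    def _success_check(self) -> bool:
        return bool(self.success)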
judgeval/scorers/judgeval_scorers/api_scorers/comparison.py
DELETED
@@ -1,45 +0,0 @@
-"""
-`judgeval` comparison scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
-from typing import Optional, Dict
-from judgeval.data import ExampleParams
-
-
-class ComparisonScorer(APIJudgmentScorer):
-    kwargs: Optional[Dict] = None
-
-    def __init__(self, threshold: float, criteria: str, description: str):
-        super().__init__(
-            threshold=threshold,
-            score_type=APIScorer.COMPARISON,
-            required_params=[
-                ExampleParams.INPUT,
-                ExampleParams.ACTUAL_OUTPUT,
-                ExampleParams.EXPECTED_OUTPUT,
-            ],
-        )
-        self.kwargs = {"criteria": criteria, "description": description}
-
-    @property
-    def __name__(self):
-        return f"Comparison-{self.kwargs['criteria']}"
-
-    def to_dict(self) -> dict:
-        """
-        Converts the scorer configuration to a dictionary format.
-
-        Returns:
-            dict: A dictionary containing the scorer's configuration
-        """
-        return {
-            "score_type": self.score_type,
-            "threshold": self.threshold,
-            "kwargs": self.kwargs,
-        }
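An illustrative construction of the removed ComparisonScorer, matching the __init__ above (the criteria and description strings are made up):

scorer = ComparisonScorer(
    threshold=0.5,
    criteria="conciseness",
    description="Prefer the response that answers in fewer words.",
)
print(scorer.__name__)  # -> "Comparison-conciseness"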
judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py
DELETED
@@ -1,29 +0,0 @@
-"""
-`judgeval` contextual precision scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
-from judgeval.data import ExampleParams
-
-
-class ContextualPrecisionScorer(APIJudgmentScorer):
-    def __init__(self, threshold: float):
-        super().__init__(
-            threshold=threshold,
-            score_type=APIScorer.CONTEXTUAL_PRECISION,
-            required_params=[
-                ExampleParams.INPUT,
-                ExampleParams.ACTUAL_OUTPUT,
-                ExampleParams.RETRIEVAL_CONTEXT,
-                ExampleParams.EXPECTED_OUTPUT,
-            ],
-        )
-
-    @property
-    def __name__(self):
-        return "Contextual Precision"
judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py
DELETED
@@ -1,29 +0,0 @@
-"""
-`judgeval` contextual recall scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
-from judgeval.data import ExampleParams
-
-
-class ContextualRecallScorer(APIJudgmentScorer):
-    def __init__(self, threshold: float):
-        super().__init__(
-            threshold=threshold,
-            score_type=APIScorer.CONTEXTUAL_RECALL,
-            required_params=[
-                ExampleParams.INPUT,
-                ExampleParams.ACTUAL_OUTPUT,
-                ExampleParams.EXPECTED_OUTPUT,
-                ExampleParams.RETRIEVAL_CONTEXT,
-            ],
-        )
-
-    @property
-    def __name__(self):
-        return "Contextual Recall"
judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py
DELETED
@@ -1,32 +0,0 @@
-"""
-`judgeval` contextual relevancy scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
-from judgeval.data import ExampleParams
-
-
-class ContextualRelevancyScorer(APIJudgmentScorer):
-    """
-    Scorer that checks if the output of a model is relevant to the retrieval context
-    """
-
-    def __init__(self, threshold: float):
-        super().__init__(
-            threshold=threshold,
-            score_type=APIScorer.CONTEXTUAL_RELEVANCY,
-            required_params=[
-                ExampleParams.INPUT,
-                ExampleParams.ACTUAL_OUTPUT,
-                ExampleParams.RETRIEVAL_CONTEXT,
-            ],
-        )
-
-    @property
-    def __name__(self):
-        return "Contextual Relevancy"
judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py
DELETED
@@ -1,28 +0,0 @@
-"""
-`judgeval` Groundedness scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
-from judgeval.data import ExampleParams
-
-
-class GroundednessScorer(APIJudgmentScorer):
-    def __init__(self, threshold: float):
-        super().__init__(
-            threshold=threshold,
-            score_type=APIScorer.GROUNDEDNESS,
-            required_params=[
-                ExampleParams.INPUT,
-                ExampleParams.ACTUAL_OUTPUT,
-                ExampleParams.RETRIEVAL_CONTEXT,
-            ],
-        )
-
-    @property
-    def __name__(self):
-        return "Groundedness"
judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py
DELETED
@@ -1,38 +0,0 @@
-"""
-`judgeval` JSON correctness scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# External imports
-from pydantic import BaseModel, Field
-
-# Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
-from judgeval.data import ExampleParams
-
-
-class JSONCorrectnessScorer(APIJudgmentScorer):
-    json_schema: BaseModel = Field(None, exclude=True)
-
-    def __init__(self, threshold: float, json_schema: BaseModel):
-        super().__init__(
-            threshold=threshold,
-            score_type=APIScorer.JSON_CORRECTNESS,
-            required_params=[
-                ExampleParams.INPUT,
-                ExampleParams.ACTUAL_OUTPUT,
-            ],
-        )
-        object.__setattr__(self, "json_schema", json_schema)
-
-    def to_dict(self):
-        base_dict = super().to_dict()  # Get the parent class's dictionary
-        base_dict["kwargs"] = {"json_schema": self.json_schema.model_json_schema()}
-        return base_dict
-
-    @property
-    def __name__(self):
-        return "JSON Correctness"
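An illustrative usage of the removed JSONCorrectnessScorer, matching the __init__ above: pass a Pydantic model whose JSON schema the output must satisfy. The Answer model is made up, and the model class is passed directly since model_json_schema() (called in to_dict above) works on the class itself:

from pydantic import BaseModel

class Answer(BaseModel):
    title: str
    confidence: float

scorer = JSONCorrectnessScorer(threshold=1.0, json_schema=Answer)
print(scorer.to_dict()["kwargs"]["json_schema"])  # schema forwarded to the API scorer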
judgeval/scorers/judgeval_scorers/api_scorers/summarization.py
DELETED
@@ -1,27 +0,0 @@
-"""
-`judgeval` summarization scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
-from judgeval.data import ExampleParams
-
-
-class SummarizationScorer(APIJudgmentScorer):
-    def __init__(self, threshold: float):
-        super().__init__(
-            threshold=threshold,
-            score_type=APIScorer.SUMMARIZATION,
-            required_params=[
-                ExampleParams.INPUT,
-                ExampleParams.ACTUAL_OUTPUT,
-            ],
-        )
-
-    @property
-    def __name__(self):
-        return "Summarization"