judgeval 0.0.52__py3-none-any.whl → 0.0.54__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
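For readers who want to reproduce a diff like this locally, here is a minimal sketch that compares the text files inside the two wheels using only the Python standard library. The wheel filenames are assumptions about what you downloaded (e.g. via `pip download judgeval==0.0.52`), and the output is a plain unified diff rather than this page's rendered view.

import difflib
import zipfile

OLD = "judgeval-0.0.52-py3-none-any.whl"  # assumed local paths
NEW = "judgeval-0.0.54-py3-none-any.whl"

def read_all(path: str) -> dict:
    """Map each archive member to its decoded lines (wheels are zip files)."""
    out = {}
    with zipfile.ZipFile(path) as zf:
        for name in zf.namelist():
            try:
                out[name] = zf.read(name).decode("utf-8").splitlines(keepends=True)
            except UnicodeDecodeError:
                pass  # skip any non-text member
    return out

old, new = read_all(OLD), read_all(NEW)
for name in sorted(old.keys() | new.keys()):
    diff = difflib.unified_diff(
        old.get(name, []), new.get(name, []),
        fromfile=f"0.0.52/{name}", tofile=f"0.0.54/{name}",
    )
    print("".join(diff), end="")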
- judgeval/common/logger.py +46 -199
- judgeval/common/s3_storage.py +2 -6
- judgeval/common/tracer.py +182 -262
- judgeval/common/utils.py +16 -36
- judgeval/constants.py +14 -20
- judgeval/data/__init__.py +0 -2
- judgeval/data/datasets/dataset.py +6 -10
- judgeval/data/datasets/eval_dataset_client.py +25 -27
- judgeval/data/example.py +5 -138
- judgeval/data/judgment_types.py +214 -0
- judgeval/data/result.py +7 -25
- judgeval/data/scorer_data.py +28 -40
- judgeval/data/scripts/fix_default_factory.py +23 -0
- judgeval/data/scripts/openapi_transform.py +123 -0
- judgeval/data/tool.py +3 -54
- judgeval/data/trace.py +31 -50
- judgeval/data/trace_run.py +3 -3
- judgeval/evaluation_run.py +16 -23
- judgeval/integrations/langgraph.py +11 -12
- judgeval/judges/litellm_judge.py +3 -6
- judgeval/judges/mixture_of_judges.py +8 -25
- judgeval/judges/together_judge.py +3 -6
- judgeval/judgment_client.py +22 -24
- judgeval/rules.py +7 -19
- judgeval/run_evaluation.py +79 -242
- judgeval/scorers/__init__.py +4 -20
- judgeval/scorers/agent_scorer.py +21 -0
- judgeval/scorers/api_scorer.py +28 -38
- judgeval/scorers/base_scorer.py +98 -0
- judgeval/scorers/example_scorer.py +19 -0
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +10 -17
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +9 -24
- judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +16 -68
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +4 -12
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +10 -17
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +18 -14
- judgeval/scorers/score.py +45 -330
- judgeval/scorers/utils.py +6 -88
- judgeval/utils/file_utils.py +4 -6
- judgeval/version_check.py +3 -2
- {judgeval-0.0.52.dist-info → judgeval-0.0.54.dist-info}/METADATA +6 -5
- judgeval-0.0.54.dist-info/RECORD +65 -0
- judgeval/data/custom_example.py +0 -19
- judgeval/scorers/judgeval_scorer.py +0 -177
- judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +0 -45
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -29
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -29
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -32
- judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +0 -28
- judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -38
- judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -27
- judgeval/scorers/prompt_scorer.py +0 -296
- judgeval-0.0.52.dist-info/RECORD +0 -69
- {judgeval-0.0.52.dist-info → judgeval-0.0.54.dist-info}/WHEEL +0 -0
- {judgeval-0.0.52.dist-info → judgeval-0.0.54.dist-info}/licenses/LICENSE.md +0 -0
{judgeval-0.0.52.dist-info → judgeval-0.0.54.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.0.52
+Version: 0.0.54
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -12,6 +12,7 @@ Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.11
 Requires-Dist: anthropic
 Requires-Dist: boto3
+Requires-Dist: datamodel-code-generator>=0.31.1
 Requires-Dist: google-genai
 Requires-Dist: langchain-anthropic
 Requires-Dist: langchain-core
@@ -150,10 +151,10 @@ You'll see your trace exported to the Judgment Platform:
 
 | | |
 |:---|:---:|
-| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic)
-| <h3>🧪 Evals</h3>
-| <h3>📡 Monitoring</h3>
-| <h3>📊 Datasets</h3>Export
+| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
+| <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
+| <h3>📡 Monitoring</h3>Get Slack alerts when you agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/error_analysis_dashboard.png" alt="Monitoring Dashboard" width="1200"/></p> |
+| <h3>📊 Datasets</h3>Export traces and test cases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
 
 ## 🏢 Self-Hosting
 
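The new datamodel-code-generator requirement lines up with the added judgeval/data/scripts/openapi_transform.py and the generated judgeval/data/judgment_types.py below. As a hedged sketch of how such a types file could be produced (the spec filename and output path are assumptions; the datamodel-codegen CLI and these flags are its documented interface):

import subprocess

# Generate Pydantic models from an OpenAPI spec; "openapi.json" is an
# assumed spec location, not confirmed by this diff.
subprocess.run(
    [
        "datamodel-codegen",
        "--input", "openapi.json",
        "--input-file-type", "openapi",
        "--output", "judgeval/data/judgment_types.py",
    ],
    check=True,
)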
judgeval-0.0.54.dist-info/RECORD
ADDED
@@ -0,0 +1,65 @@
+judgeval/__init__.py,sha256=HM1M8hmqRum6G554QKkXhB4DF4f5eh_xtYo0Kf-t3kw,332
+judgeval/clients.py,sha256=JnB8n90GyXiYaGmSEYaA67mdJSnr3SIrzArao7NGebw,980
+judgeval/constants.py,sha256=lqPVUR7XAr1zbmByJil3i0eY24ymWGzcgg88Npk-U20,5772
+judgeval/evaluation_run.py,sha256=B5w6UiB2cu8km93p4XT3jtganOtIKAZJI3UKc5Qgrew,2936
+judgeval/judgment_client.py,sha256=QT6jV1moshs_-1xjX8jAhQpr9vjznqqcXuobQ7eDBks,21343
+judgeval/rules.py,sha256=CoQjqmP8daEXewMkplmA-7urubDtweOr5O6z8klVwLI,20031
+judgeval/run_evaluation.py,sha256=WXQi2AIKu_iPSLZWnhgLarVbHE6nzyjHJcbKSHu3zYc,42568
+judgeval/version_check.py,sha256=FoLEtpCjDw2HuDQdpw5yT29UtwumSc6ZZN6AV_c9Mnw,1057
+judgeval/common/__init__.py,sha256=KH-QJyWtQ60R6yFIBDYS3WGRiNpEu1guynpxivZvpBQ,309
+judgeval/common/exceptions.py,sha256=OkgDznu2wpBQZMXiZarLJYNk1HIcC8qYW7VypDC3Ook,556
+judgeval/common/logger.py,sha256=514eFLYWS_UL8VY-zAR2ePUlpQe4rbYlleLASFllLE4,1511
+judgeval/common/s3_storage.py,sha256=UvAKGSa0S1BnNprzDKHMAfyT-8zlMAOM5kCrXcVN0HE,3743
+judgeval/common/tracer.py,sha256=qrvriShLG6INpE58sAhlQ6YZfZa3TtfJfsP-cVDyBe4,126135
+judgeval/common/utils.py,sha256=wkdBg86OHROQBXpIPtMyNku5cGckwPpaiATeuilLNbE,34304
+judgeval/data/__init__.py,sha256=1QagDcSQtfnJ632t9Dnq8d7XjAqhmY4mInOWt8qH9tM,455
+judgeval/data/example.py,sha256=6xtPTwWUsZ0HdErU-g954nCv64fsbnS1I5xuEvs14EA,2027
+judgeval/data/judgment_types.py,sha256=VM941NM7_uqwx6bKABV1cH2cocuYgclfORxCK3sPQZo,9853
+judgeval/data/result.py,sha256=7FFD9kOla6ijvu2-Wx3tFE98Ry7ECeV-f8aiDeHNaHs,2449
+judgeval/data/scorer_data.py,sha256=ty4clGts-Zp6NiU1SZXKbrVsyKvHhD5Tm1kbXx6we1k,2977
+judgeval/data/tool.py,sha256=iWQSdy5uNbIeACu3gQy1DC2oGYxRVYNfkkczWdQMAiA,99
+judgeval/data/trace.py,sha256=szugEHAb2R0YljmBQllQEVE5pOlBUC6eOSzbm_WXf-Y,4830
+judgeval/data/trace_run.py,sha256=kovRZduC0l-9nM5YWM6lKaQNEVy_WtHwt4lvIwPbHvY,1825
+judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
+judgeval/data/datasets/dataset.py,sha256=dDmTYSBRj4YEUhgYOebAcDm4N14nj3tcCqHj9y2Z1z0,12725
+judgeval/data/datasets/eval_dataset_client.py,sha256=0XS8irOA-gI1aEX3hk0LikzLjb6DOLuj18j2w64BoQM,12614
+judgeval/data/scripts/fix_default_factory.py,sha256=lvp2JwYZqz-XpD9LZNa3mANZVP-jJSZoNzolI6JWERM,591
+judgeval/data/scripts/openapi_transform.py,sha256=mT8qrzhvtMrMMC6Q_amSOGt-X-hUbDlT3xvpgEfcuEs,3828
+judgeval/integrations/langgraph.py,sha256=WuaHqer8i2QV_yZWoB18RNDLAYeH_Z_quVERvTOySQU,36151
+judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
+judgeval/judges/base_judge.py,sha256=_dz0qWsKRxzXxpRY9l6mrxTRYPSF2FE4ZXkrzhZ4gbY,986
+judgeval/judges/litellm_judge.py,sha256=LX4_KXb1Jp8IXif3vvOiKfRYH7ZkbQLs9AtWPGmj544,2483
+judgeval/judges/mixture_of_judges.py,sha256=wcHwLi9zU0uwKMqRVhcPdjiYKgWflX4dpUbU2kS9yg0,14825
+judgeval/judges/together_judge.py,sha256=r5k8ZcC6lnsFttGHhrocFtmglx2Cb3G-4ORKAeK-Nmw,2253
+judgeval/judges/utils.py,sha256=0CF9qtIUQUL3-W-qTGpmTjZbkUUBAM6TslDsrCHnTBU,2725
+judgeval/scorers/__init__.py,sha256=7f_zsJV29gO_u4o0n2011SovJ1ZGAI5Zk11WPUBPWcs,858
+judgeval/scorers/agent_scorer.py,sha256=TjwD_YglSywr3EowEojiCyg5qDgCRa5LRGc5nFdmIBc,703
+judgeval/scorers/api_scorer.py,sha256=xlhqkeMUBFxl8daSXOTWOYwZjBAz7o6b4sVD5f8cIHw,2523
+judgeval/scorers/base_scorer.py,sha256=rZfRPolxbsghWS0-FMqXrbJKuLobysMGjAeZkqn0cr0,3581
+judgeval/scorers/example_scorer.py,sha256=2n45y3LMV1Q-ARyXLHqvVWETlnY1DqS7OLzPu9IBGz8,716
+judgeval/scorers/exceptions.py,sha256=ACDHK5-TWiF3NTk-wycaedpbrdobm-CvvC1JA_iP-Mk,179
+judgeval/scorers/score.py,sha256=oQC6LMsalL4XAtXlA3S84MB9YiHjqYIgMhRRi-zaXJ4,6577
+judgeval/scorers/utils.py,sha256=I13XwyBKMUpZK2oacgkwaieUOGlQbKxKKn6SdiA4lmE,4532
+judgeval/scorers/judgeval_scorers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=saQmMol_CMzp1yovjgiF3YYhLTu-4O9xtmhygj1LRh8,1496
+judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=zJsU0VrUmRhY9qav48c6jTyDqUwI3JzhV9ajtlJCe0M,544
+judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=UDfzTO9Fx0FA5o0wfD8kprrGA4eW-43Rn9Gc0BQtKgY,393
+judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py,sha256=rbG80J88cer7yfVRvLpu-x2cdwiTl-ztnF2wgOoIlcE,2624
+judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py,sha256=mbBvirNcivu9dP6deM7FogDXrdwI9o8yqsO8IeKPSb4,309
+judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py,sha256=NABO_iBdkOo3fdPVcoWfUkeN-FTX3t3-bErMjdqBXdk,1361
+judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py,sha256=ps51bTgQsD9xGYsk1v9bx0WxQMqywSllCE9_xlJkLd8,531
+judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py,sha256=SnFLvU4FGsMeUVUp0SGHSy_6wgfwr_vHPGnZx5YJl_Q,691
+judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py,sha256=aQzu-TiGqG74JDQ927evv5yGmnZw2AOolyHvlIhiUbI,683
+judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py,sha256=Mcp1CjMNyOax9UkvoRdSyUYdO2Os1-Nko43y89m2Luo,594
+judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py,sha256=Z2FLGBC7m_CLx-CMgXVuTvYvN0vY5yOcWA0ImBkeBfY,787
+judgeval/scorers/judgeval_scorers/classifiers/__init__.py,sha256=Qt81W5ZCwMvBAne0LfQDb8xvg5iOG1vEYP7WizgwAZo,67
+judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py,sha256=8iTzMvou1Dr8pybul6lZHKjc9Ye2-0_racRGYkhEdTY,74
+judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py,sha256=gloLzThkFsr8sHQargDAH8XaDrlF6OCuc_69hyNslFU,2589
+judgeval/tracer/__init__.py,sha256=wkuXtOGDCrwgPPXlh_sSJmvGuWaAMHyNzk1TzB5f9aI,148
+judgeval/utils/alerts.py,sha256=3w_AjQrgfmOZvfqCridW8WAnHVxHHXokX9jNzVFyGjA,3297
+judgeval/utils/file_utils.py,sha256=wIEn8kjM0WrP216RGU_yhZhFOMWIS5ckigyHbzFSOMk,1774
+judgeval/utils/requests.py,sha256=rbmZTaiyWI8t2YUkhk11SIe3dF7j2j25L1BuFp_1PII,770
+judgeval-0.0.54.dist-info/METADATA,sha256=A3bcjOu-nBCLsP7W1vTDYGThPbrBZr5GJegewy9bEGs,54271
+judgeval-0.0.54.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.0.54.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.0.54.dist-info/RECORD,,
judgeval/data/custom_example.py
DELETED
@@ -1,19 +0,0 @@
-from pydantic import BaseModel, Field
-from typing import Optional, List, Dict, Any
-from uuid import uuid4
-
-
-class CustomExample(BaseModel):
-    input: Optional[Dict[str, Any]] = None
-    actual_output: Optional[Dict[str, Any]] = None
-    expected_output: Optional[Dict[str, Any]] = None
-    context: Optional[List[str]] = None
-    retrieval_context: Optional[List[str]] = None
-    additional_metadata: Optional[Dict[str, Any]] = None
-    tools_called: Optional[List[str]] = None
-    expected_tools: Optional[List[str]] = None
-    name: Optional[str] = None
-    example_id: str = Field(default_factory=lambda: str(uuid4()))
-    example_index: Optional[int] = None
-    timestamp: Optional[str] = None
-    trace_id: Optional[str] = None
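For reference, the removed CustomExample was an ordinary Pydantic model; a minimal usage sketch against the definition above (field values are illustrative, and the class no longer ships in 0.0.54):

ex = CustomExample(
    input={"question": "What is 2 + 2?"},
    actual_output={"answer": "4"},
    name="demo",
)
print(ex.example_id)  # auto-generated UUID string via default_factory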
judgeval/scorers/judgeval_scorer.py
DELETED
@@ -1,177 +0,0 @@
-"""
-Judgeval Scorer class
-
-Enables client to create custom scorers that do not fall under any of the ready-made Judgment scorers.
-To create a custom scorer, extend this class and implement the `score_example`, `a_score_example`, and `success_check` methods.
-"""
-
-from typing import Optional, Dict, Union, List
-from abc import abstractmethod
-
-from judgeval.common.logger import debug, info, warning, error
-from judgeval.judges import JudgevalJudge
-from judgeval.judges.utils import create_judge
-from judgeval.constants import UNBOUNDED_SCORERS
-from judgeval.data.example import ExampleParams
-
-
-class JudgevalScorer:
-    """
-    Base class for scorers in `judgeval`.
-
-    In practice, you should not implement this class unless you are creating a custom scorer.
-    Judgeval offers 10+ default scorers that you can use out of the box.
-
-    If you want to create a scorer that does not fall under any of the ready-made Judgment scorers,
-    you can create a custom scorer by extending this class.
-    """
-
-    score_type: str  # name of your new scorer
-    threshold: float  # The threshold to pass a test while using this scorer as a scorer
-    score: Optional[float] = None  # The float score of the scorer run on the test case
-    score_breakdown: Optional[Dict] = None
-    reason: Optional[str] = (
-        None  # The reason for the score when evaluating the test case
-    )
-    success: Optional[bool] = None  # Whether the test case passed or failed
-    evaluation_model: Optional[str] = None  # The model used to evaluate the test case
-    strict_mode: bool = False  # Whether to run the scorer in strict mode
-    async_mode: bool = True  # Whether to run the scorer in async mode
-    verbose_mode: bool = True  # Whether to run the scorer in verbose mode
-    include_reason: bool = False  # Whether to include the reason in the output
-    custom_example: bool = False  # Whether the scorer corresponds to CustomExamples
-    error: Optional[str] = None  # The error message if the scorer failed
-    evaluation_cost: Optional[float] = None  # The cost of running the scorer
-    verbose_logs: Optional[str] = None  # The verbose logs of the scorer
-    additional_metadata: Optional[Dict] = None  # Additional metadata for the scorer
-    required_params: Optional[List[ExampleParams]] = (
-        None  # The required parameters for the scorer
-    )
-
-    def __init__(
-        self,
-        score_type: str,
-        threshold: float,
-        score: Optional[float] = None,
-        score_breakdown: Optional[Dict] = None,
-        reason: Optional[str] = None,
-        success: Optional[bool] = None,
-        evaluation_model: Optional[str] = None,
-        required_params: Optional[List[ExampleParams]] = None,
-        strict_mode: bool = False,
-        async_mode: bool = True,
-        verbose_mode: bool = True,
-        include_reason: bool = False,
-        custom_example: bool = False,
-        error: Optional[str] = None,
-        evaluation_cost: Optional[float] = None,
-        verbose_logs: Optional[str] = None,
-        additional_metadata: Optional[Dict] = None,
-    ):
-        debug(
-            f"Initializing JudgevalScorer with score_type={score_type}, threshold={threshold}"
-        )
-        if score_type in UNBOUNDED_SCORERS:
-            if threshold < 0:
-                raise ValueError(
-                    f"Threshold for {score_type} must be greater than 0, got: {threshold}"
-                )
-        else:
-            if not 0 <= threshold <= 1:
-                raise ValueError(
-                    f"Threshold for {score_type} must be between 0 and 1, got: {threshold}"
-                )
-        if strict_mode:
-            warning("Strict mode enabled - scoring will be more rigorous")
-        info(f"JudgevalScorer initialized with evaluation_model: {evaluation_model}")
-        self.score_type = score_type
-        self.threshold = threshold
-        self.score = score
-        self.score_breakdown = score_breakdown
-        self.reason = reason
-        self.success = success
-        self.evaluation_model = evaluation_model
-        self.strict_mode = strict_mode
-        self.async_mode = async_mode
-        self.verbose_mode = verbose_mode
-        self.include_reason = include_reason
-        self.custom_example = custom_example
-        self.error = error
-        self.evaluation_cost = evaluation_cost
-        self.verbose_logs = verbose_logs
-        self.additional_metadata = additional_metadata
-        self.required_params = required_params
-
-    def _add_model(self, model: Optional[Union[str, List[str], JudgevalJudge]] = None):
-        """
-        Adds the evaluation model to the JudgevalScorer instance
-
-        This method is used at eval time
-        """
-        self.model, self.using_native_model = create_judge(model)
-        self.evaluation_model = self.model.get_model_name()
-
-    @abstractmethod
-    def score_example(self, example, *args, **kwargs) -> float:
-        """
-        Measures the score on a single example
-        """
-        warning("Attempting to call unimplemented score_example method")
-        error("score_example method not implemented")
-        raise NotImplementedError(
-            "You must implement the `score` method in your custom scorer"
-        )
-
-    @abstractmethod
-    async def a_score_example(self, example, *args, **kwargs) -> float:
-        """
-        Asynchronously measures the score on a single example
-        """
-        warning("Attempting to call unimplemented a_score_example method")
-        error("a_score_example method not implemented")
-        raise NotImplementedError(
-            "You must implement the `a_score` method in your custom scorer"
-        )
-
-    @abstractmethod
-    def _success_check(self) -> bool:
-        """
-        For unit testing, determines whether the test case passes or fails
-        """
-        warning("Attempting to call unimplemented success_check method")
-        error("_success_check method not implemented")
-        raise NotImplementedError(
-            "You must implement the `_success_check` method in your custom scorer"
-        )
-
-    def __str__(self):
-        debug("Converting JudgevalScorer instance to string representation")
-        if self.error:
-            warning(f"JudgevalScorer contains error: {self.error}")
-        info(f"JudgevalScorer status - success: {self.success}, score: {self.score}")
-        attributes = {
-            "score_type": self.score_type,
-            "threshold": self.threshold,
-            "score": self.score,
-            "score_breakdown": self.score_breakdown,
-            "reason": self.reason,
-            "success": self.success,
-            "evaluation_model": self.evaluation_model,
-            "strict_mode": self.strict_mode,
-            "async_mode": self.async_mode,
-            "verbose_mode": self.verbose_mode,
-            "include_reason": self.include_reason,
-            "error": self.error,
-            "evaluation_cost": self.evaluation_cost,
-            "verbose_logs": self.verbose_logs,
-            "additional_metadata": self.additional_metadata,
-        }
-        return f"JudgevalScorer({attributes})"
-
-    def to_dict(self):
-        return {
-            "score_type": str(
-                self.score_type
-            ),  # Convert enum to string for serialization
-            "threshold": self.threshold,
-        }
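The removed class's own docstring says to extend it and implement `score_example`, `a_score_example`, and `_success_check`. A hypothetical subclass against the definition above (the exact-match rule and the "exact_match" score_type are made up for illustration):

class ExactMatchScorer(JudgevalScorer):
    def __init__(self, threshold: float = 1.0):
        # Bounded scorers must have a threshold in [0, 1], per __init__ above
        super().__init__(score_type="exact_match", threshold=threshold)

    def score_example(self, example, *args, **kwargs) -> float:
        # Illustrative rule: 1.0 when outputs match exactly, else 0.0
        self.score = float(example.actual_output == example.expected_output)
        self.success = self.score >= self.threshold
        return self.score

    async def a_score_example(self, example, *args, **kwargs) -> float:
        return self.score_example(example)  # no async work needed here

    def _success_check(self) -> bool:
        return bool(self.success)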
judgeval/scorers/judgeval_scorers/api_scorers/comparison.py
DELETED
@@ -1,45 +0,0 @@
-"""
-`judgeval` comparison scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
-from typing import Optional, Dict
-from judgeval.data import ExampleParams
-
-
-class ComparisonScorer(APIJudgmentScorer):
-    kwargs: Optional[Dict] = None
-
-    def __init__(self, threshold: float, criteria: str, description: str):
-        super().__init__(
-            threshold=threshold,
-            score_type=APIScorer.COMPARISON,
-            required_params=[
-                ExampleParams.INPUT,
-                ExampleParams.ACTUAL_OUTPUT,
-                ExampleParams.EXPECTED_OUTPUT,
-            ],
-        )
-        self.kwargs = {"criteria": criteria, "description": description}
-
-    @property
-    def __name__(self):
-        return f"Comparison-{self.kwargs['criteria']}"
-
-    def to_dict(self) -> dict:
-        """
-        Converts the scorer configuration to a dictionary format.
-
-        Returns:
-            dict: A dictionary containing the scorer's configuration
-        """
-        return {
-            "score_type": self.score_type,
-            "threshold": self.threshold,
-            "kwargs": self.kwargs,
-        }
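An illustrative construction of the removed ComparisonScorer, matching the __init__ above (the criteria and description strings are made up):

scorer = ComparisonScorer(
    threshold=0.5,
    criteria="conciseness",
    description="Prefer the response that answers in fewer words.",
)
print(scorer.__name__)  # -> "Comparison-conciseness"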
judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py
DELETED
@@ -1,29 +0,0 @@
-"""
-`judgeval` contextual precision scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
-from judgeval.data import ExampleParams
-
-
-class ContextualPrecisionScorer(APIJudgmentScorer):
-    def __init__(self, threshold: float):
-        super().__init__(
-            threshold=threshold,
-            score_type=APIScorer.CONTEXTUAL_PRECISION,
-            required_params=[
-                ExampleParams.INPUT,
-                ExampleParams.ACTUAL_OUTPUT,
-                ExampleParams.RETRIEVAL_CONTEXT,
-                ExampleParams.EXPECTED_OUTPUT,
-            ],
-        )
-
-    @property
-    def __name__(self):
-        return "Contextual Precision"
judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py
DELETED
@@ -1,29 +0,0 @@
-"""
-`judgeval` contextual recall scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
-from judgeval.data import ExampleParams
-
-
-class ContextualRecallScorer(APIJudgmentScorer):
-    def __init__(self, threshold: float):
-        super().__init__(
-            threshold=threshold,
-            score_type=APIScorer.CONTEXTUAL_RECALL,
-            required_params=[
-                ExampleParams.INPUT,
-                ExampleParams.ACTUAL_OUTPUT,
-                ExampleParams.EXPECTED_OUTPUT,
-                ExampleParams.RETRIEVAL_CONTEXT,
-            ],
-        )
-
-    @property
-    def __name__(self):
-        return "Contextual Recall"
judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py
DELETED
@@ -1,32 +0,0 @@
-"""
-`judgeval` contextual relevancy scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
-from judgeval.data import ExampleParams
-
-
-class ContextualRelevancyScorer(APIJudgmentScorer):
-    """
-    Scorer that checks if the output of a model is relevant to the retrieval context
-    """
-
-    def __init__(self, threshold: float):
-        super().__init__(
-            threshold=threshold,
-            score_type=APIScorer.CONTEXTUAL_RELEVANCY,
-            required_params=[
-                ExampleParams.INPUT,
-                ExampleParams.ACTUAL_OUTPUT,
-                ExampleParams.RETRIEVAL_CONTEXT,
-            ],
-        )
-
-    @property
-    def __name__(self):
-        return "Contextual Relevancy"
judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py
DELETED
@@ -1,28 +0,0 @@
-"""
-`judgeval` Groundedness scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
-from judgeval.data import ExampleParams
-
-
-class GroundednessScorer(APIJudgmentScorer):
-    def __init__(self, threshold: float):
-        super().__init__(
-            threshold=threshold,
-            score_type=APIScorer.GROUNDEDNESS,
-            required_params=[
-                ExampleParams.INPUT,
-                ExampleParams.ACTUAL_OUTPUT,
-                ExampleParams.RETRIEVAL_CONTEXT,
-            ],
-        )
-
-    @property
-    def __name__(self):
-        return "Groundedness"
judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py
DELETED
@@ -1,38 +0,0 @@
-"""
-`judgeval` JSON correctness scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# External imports
-from pydantic import BaseModel, Field
-
-# Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
-from judgeval.data import ExampleParams
-
-
-class JSONCorrectnessScorer(APIJudgmentScorer):
-    json_schema: BaseModel = Field(None, exclude=True)
-
-    def __init__(self, threshold: float, json_schema: BaseModel):
-        super().__init__(
-            threshold=threshold,
-            score_type=APIScorer.JSON_CORRECTNESS,
-            required_params=[
-                ExampleParams.INPUT,
-                ExampleParams.ACTUAL_OUTPUT,
-            ],
-        )
-        object.__setattr__(self, "json_schema", json_schema)
-
-    def to_dict(self):
-        base_dict = super().to_dict()  # Get the parent class's dictionary
-        base_dict["kwargs"] = {"json_schema": self.json_schema.model_json_schema()}
-        return base_dict
-
-    @property
-    def __name__(self):
-        return "JSON Correctness"
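An illustrative usage of the removed JSONCorrectnessScorer, matching the __init__ above: pass a Pydantic model whose JSON schema the output must satisfy. The Answer model is made up, and the model class is passed directly since model_json_schema() (called in to_dict above) works on the class itself:

from pydantic import BaseModel

class Answer(BaseModel):
    title: str
    confidence: float

scorer = JSONCorrectnessScorer(threshold=1.0, json_schema=Answer)
print(scorer.to_dict()["kwargs"]["json_schema"])  # schema forwarded to the API scorer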
judgeval/scorers/judgeval_scorers/api_scorers/summarization.py
DELETED
@@ -1,27 +0,0 @@
-"""
-`judgeval` summarization scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
-from judgeval.data import ExampleParams
-
-
-class SummarizationScorer(APIJudgmentScorer):
-    def __init__(self, threshold: float):
-        super().__init__(
-            threshold=threshold,
-            score_type=APIScorer.SUMMARIZATION,
-            required_params=[
-                ExampleParams.INPUT,
-                ExampleParams.ACTUAL_OUTPUT,
-            ],
-        )
-
-    @property
-    def __name__(self):
-        return "Summarization"