judgeval 0.0.31__py3-none-any.whl → 0.0.33__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. judgeval/__init__.py +3 -1
  2. judgeval/common/s3_storage.py +93 -0
  3. judgeval/common/tracer.py +869 -183
  4. judgeval/constants.py +1 -1
  5. judgeval/data/datasets/dataset.py +5 -1
  6. judgeval/data/datasets/eval_dataset_client.py +2 -2
  7. judgeval/data/sequence.py +16 -26
  8. judgeval/data/sequence_run.py +2 -0
  9. judgeval/judgment_client.py +44 -166
  10. judgeval/rules.py +4 -7
  11. judgeval/run_evaluation.py +2 -2
  12. judgeval/scorers/__init__.py +4 -4
  13. judgeval/scorers/judgeval_scorers/__init__.py +0 -176
  14. judgeval/version_check.py +22 -0
  15. {judgeval-0.0.31.dist-info → judgeval-0.0.33.dist-info}/METADATA +15 -2
  16. judgeval-0.0.33.dist-info/RECORD +63 -0
  17. judgeval/scorers/base_scorer.py +0 -58
  18. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -27
  19. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
  20. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -276
  21. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
  22. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
  23. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
  24. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
  25. judgeval/scorers/judgeval_scorers/local_implementations/comparison/__init__.py +0 -0
  26. judgeval/scorers/judgeval_scorers/local_implementations/comparison/comparison_scorer.py +0 -161
  27. judgeval/scorers/judgeval_scorers/local_implementations/comparison/prompts.py +0 -222
  28. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
  29. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
  30. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
  31. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
  32. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
  33. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
  34. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
  35. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
  36. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
  37. judgeval/scorers/judgeval_scorers/local_implementations/execution_order/__init__.py +0 -3
  38. judgeval/scorers/judgeval_scorers/local_implementations/execution_order/execution_order.py +0 -156
  39. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
  40. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -318
  41. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
  42. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
  43. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -264
  44. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
  45. judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/instruction_adherence.py +0 -232
  46. judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/prompt.py +0 -102
  47. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
  48. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
  49. judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
  50. judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
  51. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -551
  52. judgeval-0.0.31.dist-info/RECORD +0 -96
  53. {judgeval-0.0.31.dist-info → judgeval-0.0.33.dist-info}/WHEEL +0 -0
  54. {judgeval-0.0.31.dist-info → judgeval-0.0.33.dist-info}/licenses/LICENSE.md +0 -0
judgeval/scorers/judgeval_scorers/__init__.py
@@ -1,176 +0,0 @@
- from typing import Type, Optional, Any
-
- # Import implementations
- from judgeval.scorers.judgeval_scorers.api_scorers import (
-     ExecutionOrderScorer as APIExecutionOrderScorer,
-     JSONCorrectnessScorer as APIJSONCorrectnessScorer,
-     SummarizationScorer as APISummarizationScorer,
-     HallucinationScorer as APIHallucinationScorer,
-     FaithfulnessScorer as APIFaithfulnessScorer,
-     ContextualRelevancyScorer as APIContextualRelevancyScorer,
-     ContextualPrecisionScorer as APIContextualPrecisionScorer,
-     ContextualRecallScorer as APIContextualRecallScorer,
-     AnswerRelevancyScorer as APIAnswerRelevancyScorer,
-     AnswerCorrectnessScorer as APIAnswerCorrectnessScorer,
-     ComparisonScorer as APIComparisonScorer,
-     InstructionAdherenceScorer as APIInstructionAdherenceScorer,
-     GroundednessScorer as APIGroundednessScorer,
-     DerailmentScorer as APIDerailmentScorer,
- )
-
- from judgeval.scorers.judgeval_scorers.local_implementations import (
-     AnswerRelevancyScorer as LocalAnswerRelevancyScorer,
-     ContextualPrecisionScorer as LocalContextualPrecisionScorer,
-     ContextualRecallScorer as LocalContextualRecallScorer,
-     ContextualRelevancyScorer as LocalContextualRelevancyScorer,
-     FaithfulnessScorer as LocalFaithfulnessScorer,
-     JsonCorrectnessScorer as LocalJsonCorrectnessScorer,
-     ExecutionOrderScorer as LocalExecutionOrderScorer,
-     HallucinationScorer as LocalHallucinationScorer,
-     SummarizationScorer as LocalSummarizationScorer,
-     AnswerCorrectnessScorer as LocalAnswerCorrectnessScorer,
-     ComparisonScorer as LocalComparisonScorer,
-     InstructionAdherenceScorer as LocalInstructionAdherenceScorer,
- )
-
- from judgeval.scorers.judgeval_scorers.classifiers import Text2SQLScorer
-
-
- class ScorerWrapper:
-     """
-     Wrapper class that can dynamically load either API or local implementation of a scorer.
-     """
-     def __init__(self, api_implementation: Type, local_implementation: Optional[Type] = None):
-         self.api_implementation = api_implementation
-         self.local_implementation = local_implementation
-         self._instance = None
-         self._init_args = None
-         self._init_kwargs = None
-
-     def __call__(self, *args, **kwargs):
-         """Store initialization arguments for later use when implementation is loaded"""
-         self._init_args = args
-         self._init_kwargs = kwargs
-         return self
-
-     def load_implementation(self, use_judgment: bool = True) -> Any:
-         """
-         Load the appropriate implementation based on the use_judgment flag.
-
-         Args:
-             use_judgment (bool): If True, use API implementation. If False, use local implementation.
-
-         Returns:
-             Instance of the appropriate implementation
-
-         Raises:
-             ValueError: If local implementation is requested but not available
-         """
-         if self._instance is not None:
-             return self._instance
-
-         if use_judgment:
-             implementation = self.api_implementation
-         else:
-             if self.local_implementation is None:
-                 raise ValueError("No local implementation available for this scorer")
-             implementation = self.local_implementation
-
-         args = self._init_args or ()
-         kwargs = self._init_kwargs or {}
-         self._instance = implementation(*args, **kwargs)
-         return self._instance
-
-     def __getattr__(self, name):
-         """Defer all attribute access to the loaded implementation"""
-         if self._instance is None:
-             raise RuntimeError("Implementation not loaded. Call load_implementation() first")
-         return getattr(self._instance, name)
-
- # Create wrapped versions of all scorers
-
- AnswerCorrectnessScorer = ScorerWrapper(
-     api_implementation=APIAnswerCorrectnessScorer,
-     local_implementation=LocalAnswerCorrectnessScorer
- )
-
- AnswerRelevancyScorer = ScorerWrapper(
-     api_implementation=APIAnswerRelevancyScorer,
-     local_implementation=LocalAnswerRelevancyScorer
- )
-
- ExecutionOrderScorer = ScorerWrapper(
-     api_implementation=APIExecutionOrderScorer,
-     local_implementation=LocalExecutionOrderScorer
- )
-
- JSONCorrectnessScorer = ScorerWrapper(
-     api_implementation=APIJSONCorrectnessScorer,
-     local_implementation=LocalJsonCorrectnessScorer
- )
-
- SummarizationScorer = ScorerWrapper(
-     api_implementation=APISummarizationScorer,
-     local_implementation=LocalSummarizationScorer
- )
-
- HallucinationScorer = ScorerWrapper(
-     api_implementation=APIHallucinationScorer,
-     local_implementation=LocalHallucinationScorer
- )
-
- FaithfulnessScorer = ScorerWrapper(
-     api_implementation=APIFaithfulnessScorer,
-     local_implementation=LocalFaithfulnessScorer
- )
-
- ContextualRelevancyScorer = ScorerWrapper(
-     api_implementation=APIContextualRelevancyScorer,
-     local_implementation=LocalContextualRelevancyScorer
- )
-
- ContextualPrecisionScorer = ScorerWrapper(
-     api_implementation=APIContextualPrecisionScorer,
-     local_implementation=LocalContextualPrecisionScorer
- )
-
- ContextualRecallScorer = ScorerWrapper(
-     api_implementation=APIContextualRecallScorer,
-     local_implementation=LocalContextualRecallScorer
- )
-
- InstructionAdherenceScorer = ScorerWrapper(
-     api_implementation=APIInstructionAdherenceScorer,
-     local_implementation=LocalInstructionAdherenceScorer
- )
-
- def ComparisonScorer(threshold: float, criteria: str, description: str):
-     return ScorerWrapper(
-         api_implementation=APIComparisonScorer,
-         local_implementation=LocalComparisonScorer
-     )(threshold=threshold, criteria=criteria, description=description)
-
- GroundednessScorer = ScorerWrapper(
-     api_implementation=APIGroundednessScorer,
- )
-
- DerailmentScorer = ScorerWrapper(
-     api_implementation=APIDerailmentScorer,
-     local_implementation=LocalInstructionAdherenceScorer  # TODO: add local implementation
- )
-
- __all__ = [
-     "ExecutionOrderScorer",
-     "JSONCorrectnessScorer",
-     "SummarizationScorer",
-     "HallucinationScorer",
-     "FaithfulnessScorer",
-     "ContextualRelevancyScorer",
-     "ContextualPrecisionScorer",
-     "ContextualRecallScorer",
-     "AnswerRelevancyScorer",
-     "Text2SQLScorer",
-     "ComparisonScorer",
-     "GroundednessScorer",
-     "DerailmentScorer",
- ]
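
For context, a minimal sketch of how the removed ScorerWrapper pattern was meant to be used (names come from the deleted module above; the call sequence is inferred and not shown in this diff):

    # Pre-0.0.33 pattern, removed in this release; illustrative only.
    from judgeval.scorers.judgeval_scorers import FaithfulnessScorer  # a ScorerWrapper instance in 0.0.31

    scorer = FaithfulnessScorer(threshold=0.8)             # __call__ only stores the init args
    impl = scorer.load_implementation(use_judgment=True)   # lazily instantiates the API scorer with those args
    # With use_judgment=False the wrapper built the (now removed) local implementation instead,
    # raising ValueError for scorers such as GroundednessScorer that had no local variant.
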
judgeval/version_check.py
@@ -0,0 +1,22 @@
+ import importlib.metadata
+ import requests
+ import threading
+
+ def check_latest_version(package_name: str = "judgeval"):
+     def _check():
+         try:
+             current_version = importlib.metadata.version(package_name)
+             response = requests.get(f"https://pypi.org/pypi/{package_name}/json", timeout=2)
+             latest_version = response.json()["info"]["version"]
+
+             if current_version != latest_version:
+                 print(
+                     f"\033[93mUPDATE AVAILABLE:\033[0m You are using '{package_name}=={current_version}', "
+                     f"but the latest version is '{latest_version}'. While this version is still supported, "
+                     f"we recommend upgrading to avoid potential issues or missing features: "
+                     f"`pip install --upgrade {package_name}`"
+                 )
+         except Exception:
+             pass
+
+     threading.Thread(target=_check, daemon=True).start()
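
The new version check runs on a background daemon thread and swallows all exceptions, so it can never block or crash client code. A hypothetical call site (the matching judgeval/__init__.py change is not shown in this diff, so treat this as an illustration rather than the actual wiring):

    from judgeval.version_check import check_latest_version

    check_latest_version("judgeval")  # returns immediately; the PyPI lookup happens on a daemon thread
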
{judgeval-0.0.31.dist-info → judgeval-0.0.33.dist-info}/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: judgeval
- Version: 0.0.31
+ Version: 0.0.33
  Summary: Judgeval Package
  Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
  Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -12,12 +12,13 @@ Classifier: Programming Language :: Python :: 3
  Requires-Python: >=3.11
  Requires-Dist: anthropic
  Requires-Dist: fastapi
+ Requires-Dist: google-genai
  Requires-Dist: langchain
  Requires-Dist: langchain-anthropic
  Requires-Dist: langchain-core
  Requires-Dist: langchain-huggingface
  Requires-Dist: langchain-openai
- Requires-Dist: litellm
+ Requires-Dist: litellm==1.38.12
  Requires-Dist: nest-asyncio
  Requires-Dist: openai
  Requires-Dist: openpyxl
@@ -94,9 +95,21 @@ Create a file named `traces.py` with the following code:
  from judgeval.common.tracer import Tracer, wrap
  from openai import OpenAI

+ # Basic initialization
  client = wrap(OpenAI())
  judgment = Tracer(project_name="my_project")

+ # Or with S3 storage enabled
+ # NOTE: Make sure AWS creds correspond to an account with write access to the specified S3 bucket
+ judgment = Tracer(
+     project_name="my_project",
+     use_s3=True,
+     s3_bucket_name="my-traces-bucket",  # Bucket created automatically if it doesn't exist
+     s3_aws_access_key_id="your-access-key",  # Optional: defaults to AWS_ACCESS_KEY_ID env var
+     s3_aws_secret_access_key="your-secret-key",  # Optional: defaults to AWS_SECRET_ACCESS_KEY env var
+     s3_region_name="us-west-1"  # Optional: defaults to AWS_REGION env var or "us-west-1"
+ )
+
  @judgment.observe(span_type="tool")
  def my_tool():
      return "Hello world!"
judgeval-0.0.33.dist-info/RECORD
@@ -0,0 +1,63 @@
+ judgeval/__init__.py,sha256=x9HWt4waJwJMAqTuJSg2MezF9Zg-macEjeU-ajbly-8,330
+ judgeval/clients.py,sha256=6VQmEqmfCngUdS2MuPBIpHvtDFqOENm8-_BmMvjLyRQ,944
+ judgeval/constants.py,sha256=_XmVAkebMyGrDvvanAVlMgVd4p6MLHdEVsTQFI0kz1k,5411
+ judgeval/evaluation_run.py,sha256=WGzx-Ug2qhSmunFo8NrmSstBRsOUc5KpKq0Lc51rqsM,6739
+ judgeval/judgment_client.py,sha256=brRYmphZR-2IUre9kdOhfse1mYDilcIqUzzH21ROAdk,22208
+ judgeval/rules.py,sha256=jkh1cXXcUf8oRY7xJUZfcQBYWn_rjUW4GvrhRt15PeU,20265
+ judgeval/run_evaluation.py,sha256=elMpFHahyeukKKa09fmJM3c_afwJ00mbZRqm18l5f00,28481
+ judgeval/version_check.py,sha256=bvJEidB7rAeXozoUbN9Yb97QOR_s2hgvpvj74jJ5HlY,943
+ judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
+ judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
+ judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
+ judgeval/common/s3_storage.py,sha256=W8wq9S7qJZdqdBR4sk3aEZ4K3-pz40DOoolOJrWs9Vo,3768
+ judgeval/common/tracer.py,sha256=YsObK8VQXp1DDbU9xncU8NjuY-JUI54BqmG4olezrZc,92507
+ judgeval/common/utils.py,sha256=LUQV5JfDr6wj7xHAJoNq-gofNZ6mjXbeKrGKzBME1KM,33533
+ judgeval/data/__init__.py,sha256=xuKx_KCVHGp6CXvQuVmKl3v7pJp-qDaz0NccKxwjtO0,481
+ judgeval/data/custom_example.py,sha256=QRBqiRiZS8UgVeTRHY0r1Jzm6yAYsyg6zmHxQGxdiQs,739
+ judgeval/data/example.py,sha256=cJrmPGLel_P2sy1UaRvuVSAi35EnA9XMR11Lhp4aDLo,5930
+ judgeval/data/result.py,sha256=Gb9tiSDsk1amXgh0cFG6JmlW_BMKxS2kuTwNA0rrHjA,3184
+ judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,3273
+ judgeval/data/sequence.py,sha256=FmKVdzQP5VTujRCHDWk097MKRR-rJgbsdrxyCKee6tA,1994
+ judgeval/data/sequence_run.py,sha256=RmYjfWKMWg-pcF5PLeiWfrhuDkjDZi5VEmAIEXN3Ib0,2104
+ judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
+ judgeval/data/datasets/dataset.py,sha256=dhLo30hvpmmOK2R6O5wDs_neawUJ4lS8bb4S42SufNQ,13034
+ judgeval/data/datasets/eval_dataset_client.py,sha256=xjj66BO9Es9IxXqzQe1RT_e0kpeKlt7OrhRoSuj4KHM,15085
+ judgeval/integrations/langgraph.py,sha256=J-cQfFP52TjJewdSTe-fcsUC4HDvjNbXoxmbmF0SgiE,11743
+ judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
+ judgeval/judges/base_judge.py,sha256=ch_S7uBB7lyv44Lf1d7mIGFpveOO58zOkkpImKgd9_4,994
+ judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6KTg,2424
+ judgeval/judges/mixture_of_judges.py,sha256=IJoi4Twk8ze1CJWVEp69k6TSqTCTGrmVYQ0qdffer60,15549
+ judgeval/judges/together_judge.py,sha256=l00hhPerAZXg3oYBd8cyMtWsOTNt_0FIqoxhKJKQe3k,2302
+ judgeval/judges/utils.py,sha256=9lvUxziGV86ISvVFxYBWc09TWFyAQgUTyPf_a9mD5Rs,2686
+ judgeval/scorers/__init__.py,sha256=Mk-mWUt_gNpJqY_WIEuQynD6fxc34fWSRSuobMSrj94,1238
+ judgeval/scorers/api_scorer.py,sha256=NQ_CrrUPhSUk1k2Q8rKpCG_TU2FT32sFEqvb-Yi54B0,2688
+ judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
+ judgeval/scorers/judgeval_scorer.py,sha256=79-JJurqHP-qTaWNWInx4SjvQYwXc9lvfPPNgwsh2yA,6773
+ judgeval/scorers/prompt_scorer.py,sha256=PaAs2qRolw1P3_I061Xvk9qzvF4O-JR8g_39RqXnHcM,17728
+ judgeval/scorers/score.py,sha256=r9QiT4-LIvivcJ6XxByrbswKSO8eQTtAD1UlXT_lcmo,18741
+ judgeval/scorers/utils.py,sha256=iHQVTlIANbmCTXz9kTeSdOytgUZ_T74Re61ajqsk_WQ,6827
+ judgeval/scorers/judgeval_scorers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=_sDUBxSG536KGqXNi6dFpaYKghjEAadxBxaaxV9HuuE,1764
+ judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=Fnd9CVIOZ73sWEWymsU5eBrrZqPFjMZ0BKpeW-PDyTg,711
+ judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=oETeN9K0HSIRdL2SDqn82Vskpwh5SlKnZvs5VDm2OBU,658
+ judgeval/scorers/judgeval_scorers/api_scorers/comparison.py,sha256=kuzf9OWvpY38yYSwlBgneLkUZwJNM4FQqvbS66keA90,1249
+ judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py,sha256=tpSuzFAaW8X9xqA0aLLKwh7qmBK0Pc_bJZMIe_q412U,770
+ judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py,sha256=pFVhk4pLtQ-FnNlbI-dFF-SIh69Jza7erHqiPkFWoBo,758
+ judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py,sha256=RQ6DZwEhChfecd89Ey-T7ke--7qTaXZlRsNxwH8gaME,823
+ judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py,sha256=V9WPuwNMm097V7IknKs8UkmAk0yjnBXTcJha_BHXxTA,475
+ judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py,sha256=Pb3CiNF2Ca826B92wJCVAi_68lJjLhqqCKwQKaflSUg,1294
+ judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py,sha256=-BwOapqjryYNKNydtdkUiKIij76dY0O1jBmdc6dKazQ,692
+ judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py,sha256=ntEEeTANEOsGlcbiTAF_3r6BeSJEaVDns8po8T0L6Vg,692
+ judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py,sha256=k5gDOki-8KXrZXydvdSqDt3NZqQ28hXoOCHQf6jNxr4,686
+ judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py,sha256=XnSGEkQfwVqaqnHEGMCsxNiHVzrsrej48uDbLoWc8CQ,678
+ judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py,sha256=mMKEuR87_yanEuZJ5YSGFMHDD_oLVZ6-rQuciFaDOMA,1095
+ judgeval/scorers/judgeval_scorers/api_scorers/summarization.py,sha256=QmWB8bVbDYHY5FcF0rYZE_3c2XXgMLRmR6aXJWfdMC4,655
+ judgeval/scorers/judgeval_scorers/classifiers/__init__.py,sha256=Qt81W5ZCwMvBAne0LfQDb8xvg5iOG1vEYP7WizgwAZo,67
+ judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py,sha256=8iTzMvou1Dr8pybul6lZHKjc9Ye2-0_racRGYkhEdTY,74
+ judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py,sha256=ly72Z7s_c8NID6-nQnuW8qEGEW2MqdvpJ-5WfXzbAQg,2579
+ judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
+ judgeval/utils/alerts.py,sha256=O19Xj7DA0YVjl8PWiuH4zfdZeu3yiLVvHfY8ah2wG0g,2759
+ judgeval-0.0.33.dist-info/METADATA,sha256=KzTkGTHYE8Uplehvtk_7x30XrV0xe1bpd-tU5lt0mHg,6097
+ judgeval-0.0.33.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ judgeval-0.0.33.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+ judgeval-0.0.33.dist-info/RECORD,,
judgeval/scorers/base_scorer.py
@@ -1,58 +0,0 @@
- """
- Judgment Scorer class.
-
- Scores `Example`s using ready-made Judgment evaluators.
- """
-
- from pydantic import BaseModel, field_validator
- from judgeval.common.logger import debug, info, warning, error
-
- from judgeval.constants import APIScorer, UNBOUNDED_SCORERS
-
-
- class APIJudgmentScorer(BaseModel):
-     """
-     Class for ready-made, "out-of-the-box" scorer that uses Judgment evaluators to score `Example`s.
-
-     Args:
-         score_type (APIScorer): The Judgment metric to use for scoring `Example`s
-         threshold (float): A value between 0 and 1 that determines the scoring threshold
-     """
-     score_type: APIScorer
-     threshold: float
-
-     @field_validator('threshold')
-     def validate_threshold(cls, v, info):
-         """
-         Validates that the threshold is between 0 and 1 inclusive.
-         """
-         score_type = info.data.get('score_type')
-         if score_type in UNBOUNDED_SCORERS:
-             if v < 0:
-                 error(f"Threshold for {score_type} must be greater than 0, got: {v}")
-                 raise ValueError(f"Threshold for {score_type} must be greater than 0, got: {v}")
-         else:
-             if not 0 <= v <= 1:
-                 error(f"Threshold for {score_type} must be between 0 and 1, got: {v}")
-                 raise ValueError(f"Threshold for {score_type} must be between 0 and 1, got: {v}")
-         return v
-
-     @field_validator('score_type')
-     def convert_to_enum_value(cls, v):
-         """
-         Validates that the `score_type` is a valid `JudgmentMetric` enum value.
-         Converts string values to `JudgmentMetric` enum values.
-         """
-         debug(f"Attempting to convert score_type value: {v}")
-         if isinstance(v, APIScorer):
-             info(f"Using existing JudgmentMetric: {v.value}")
-             return v.value
-         elif isinstance(v, str):
-             debug(f"Converting string value to JudgmentMetric enum: {v}")
-             return APIScorer[v.upper()].value
-         error(f"Invalid score_type value: {v}")
-         raise ValueError(f"Invalid value for score_type: {v}")
-
-     def __str__(self):
-         return f"JudgmentScorer(score_type={self.score_type.value}, threshold={self.threshold})"
-
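
For reference, the removed validators coerced string score types to APIScorer enum values and bounds-checked thresholds. A small illustration of that behavior (APIScorer.ANSWER_CORRECTNESS appears elsewhere in this diff; the failing threshold below is a made-up value chosen to trip the bounded-score check):

    from judgeval.scorers.base_scorer import APIJudgmentScorer  # pre-0.0.33 module, removed in this release

    ok = APIJudgmentScorer(score_type="answer_correctness", threshold=0.7)  # string coerced via APIScorer[v.upper()]
    # APIJudgmentScorer(score_type="answer_correctness", threshold=1.5)     # would raise ValueError: must be in [0, 1]
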
judgeval/scorers/judgeval_scorers/local_implementations/__init__.py
@@ -1,27 +0,0 @@
- from judgeval.scorers.judgeval_scorers.local_implementations.answer_relevancy.answer_relevancy_scorer import AnswerRelevancyScorer
- from judgeval.scorers.judgeval_scorers.local_implementations.contextual_precision.contextual_precision_scorer import ContextualPrecisionScorer
- from judgeval.scorers.judgeval_scorers.local_implementations.contextual_recall.contextual_recall_scorer import ContextualRecallScorer
- from judgeval.scorers.judgeval_scorers.local_implementations.contextual_relevancy.contextual_relevancy_scorer import ContextualRelevancyScorer
- from judgeval.scorers.judgeval_scorers.local_implementations.faithfulness.faithfulness_scorer import FaithfulnessScorer
- from judgeval.scorers.judgeval_scorers.local_implementations.json_correctness.json_correctness_scorer import JsonCorrectnessScorer
- from judgeval.scorers.judgeval_scorers.local_implementations.execution_order.execution_order import ExecutionOrderScorer
- from judgeval.scorers.judgeval_scorers.local_implementations.hallucination.hallucination_scorer import HallucinationScorer
- from judgeval.scorers.judgeval_scorers.local_implementations.summarization.summarization_scorer import SummarizationScorer
- from judgeval.scorers.judgeval_scorers.local_implementations.answer_correctness.answer_correctness_scorer import AnswerCorrectnessScorer
- from judgeval.scorers.judgeval_scorers.local_implementations.comparison.comparison_scorer import ComparisonScorer
- from judgeval.scorers.judgeval_scorers.local_implementations.instruction_adherence.instruction_adherence import InstructionAdherenceScorer
-
- __all__ = [
-     "AnswerCorrectnessScorer",
-     "AnswerRelevancyScorer",
-     "ComparisonScorer",
-     "ContextualPrecisionScorer",
-     "ContextualRecallScorer",
-     "ContextualRelevancyScorer",
-     "FaithfulnessScorer",
-     "JsonCorrectnessScorer",
-     "ExecutionOrderScorer",
-     "HallucinationScorer",
-     "SummarizationScorer",
-     "InstructionAdherenceScorer",
- ]
judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py
@@ -1,4 +0,0 @@
- from judgeval.scorers.judgeval_scorers.local_implementations.answer_correctness.answer_correctness_scorer import AnswerCorrectnessScorer
-
-
- __all__ = ["AnswerCorrectnessScorer"]
judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py
@@ -1,276 +0,0 @@
- from typing import Optional, List, Union, Tuple
-
- from judgeval.constants import APIScorer
- from judgeval.judges import JudgevalJudge
- from judgeval.judges.utils import create_judge
- from judgeval.data import Example, ExampleParams
- from judgeval.scorers import JudgevalScorer
- from judgeval.scorers.utils import (
-     get_or_create_event_loop,
-     parse_response_json,
-     scorer_progress_meter,
-     create_verbose_logs,
-     check_example_params,
- )
- from .prompts import (
-     ACVerdict,
-     AnswerCorrectnessTemplate,
-     Statements,
-     Verdicts,
-     Reason,
- )
-
-
- required_params = [
-     ExampleParams.INPUT,
-     ExampleParams.ACTUAL_OUTPUT,
-     ExampleParams.EXPECTED_OUTPUT,
- ]
-
-
- class AnswerCorrectnessScorer(JudgevalScorer):
-     def __init__(
-         self,
-         threshold: float = 0.5,
-         model: Optional[Union[str, JudgevalJudge]] = None,
-         include_reason: bool = True,
-         async_mode: bool = True,
-         strict_mode: bool = False,
-         verbose_mode: bool = False
-     ):
-         super().__init__(
-             score_type=APIScorer.ANSWER_CORRECTNESS,
-             threshold=1 if strict_mode else threshold,
-             evaluation_model=None,
-             include_reason=include_reason,
-             async_mode=async_mode,
-             strict_mode=strict_mode,
-             verbose_mode=verbose_mode
-         )
-         self.model, self.using_native_model = create_judge(model)
-         self.evaluation_model = self.model.get_model_name()
-
-     async def _a_get_statements(self, expected_output: str) -> List[str]:
-         prompt = AnswerCorrectnessTemplate.deduce_statements(
-             expected_output=expected_output,
-         )
-         if self.using_native_model:
-             res = await self.model.a_generate(prompt)
-             data = parse_response_json(res, self)
-             return data["statements"]
-         else:
-             try:
-                 res: Statements = await self.model.a_generate(
-                     prompt, schema=Statements
-                 )
-                 return res.statements
-             except TypeError:
-                 res = await self.model.a_generate(prompt)
-                 data = parse_response_json(res, self)
-                 return data["statements"]
-
-     def _get_statements(self, expected_output: str) -> List[str]:
-         prompt = AnswerCorrectnessTemplate.deduce_statements(
-             expected_output=expected_output,
-         )
-         if self.using_native_model:
-             res = self.model.generate(prompt)
-             data = parse_response_json(res, self)
-             return data["statements"]
-         else:
-             try:
-                 res: Statements = self.model.generate(
-                     prompt, schema=Statements
-                 )
-                 return res.statements
-             except TypeError:
-                 res = self.model.generate(prompt)
-                 data = parse_response_json(res, self)
-                 return data["statements"]
-
-     async def _a_get_verdicts(self, actual_output: str) -> List[ACVerdict]:
-         if len(self.statements) == 0:
-             return []
-
-         prompt = AnswerCorrectnessTemplate.generate_verdicts(
-             actual_output=actual_output,
-             statements=self.statements,
-         )
-
-         if self.using_native_model:
-             res = await self.model.a_generate(prompt)
-             data = parse_response_json(res, self)
-             return [ACVerdict(**item) for item in data["verdicts"]]
-         else:
-             try:
-                 res: Verdicts = await self.model.a_generate(prompt, schema=Verdicts)
-                 return [item for item in res.verdicts]
-             except TypeError:
-                 res = await self.model.a_generate(prompt)
-                 data = parse_response_json(res, self)
-                 return [ACVerdict(**item) for item in data["verdicts"]]
-
-     def _get_verdicts(self, actual_output: str) -> List[ACVerdict]:
-         if len(self.statements) == 0:
-             return []
-
-         prompt = AnswerCorrectnessTemplate.generate_verdicts(
-             actual_output=actual_output,
-             statements=self.statements,
-         )
-
-         if self.using_native_model:
-             res = self.model.generate(prompt)
-             data = parse_response_json(res, self)
-             return [ACVerdict(**item) for item in data["verdicts"]]
-         else:
-             try:
-                 res: Verdicts = self.model.generate(prompt, schema=Verdicts)
-                 return [item for item in res.verdicts]
-             except TypeError:
-                 res = self.model.generate(prompt)
-                 data = parse_response_json(res, self)
-                 return [ACVerdict(**item) for item in data["verdicts"]]
-
-     async def _a_get_reason(self) -> str:
-         if self.include_reason is False:
-             return None
-
-         incorrect_statements: List[Tuple[str, str]] = []
-         for idx, verdict in enumerate(self.verdicts):
-             if verdict.verdict.strip().lower() == "no":
-                 incorrect_statements.append((self.statements[idx], verdict.reason))
-
-         prompt = AnswerCorrectnessTemplate.generate_reason(
-             incorrect_statements=incorrect_statements,
-             score=format(self.score, ".2f"),
-         )
-         if self.using_native_model:
-             res = await self.model.a_generate(prompt)
-             data = parse_response_json(res, self)
-             return data["reason"]
-         else:
-             try:
-                 res: Reason = await self.model.a_generate(
-                     prompt=prompt, schema=Reason
-                 )
-                 return res.reason
-             except TypeError:
-                 res = await self.model.a_generate(prompt)
-                 data = parse_response_json(res, self)
-                 return data["reason"]
-
-     def _get_reason(self) -> str:
-         if self.include_reason is False:
-             return None
-
-         incorrect_statements: List[Tuple[str, str]] = []
-         for idx, verdict in enumerate(self.verdicts):
-             if verdict.verdict.strip().lower() == "no":
-                 incorrect_statements.append((self.statements[idx], verdict.reason))
-
-         prompt = AnswerCorrectnessTemplate.generate_reason(
-             incorrect_statements=incorrect_statements,
-             score=format(self.score, ".2f"),
-         )
-         if self.using_native_model:
-             res = self.model.generate(prompt)
-             data = parse_response_json(res, self)
-             return data["reason"]
-         else:
-             try:
-                 res: Reason = self.model.generate(
-                     prompt=prompt, schema=Reason
-                 )
-                 return res.reason
-             except TypeError:
-                 res = self.model.generate(prompt)
-                 data = parse_response_json(res, self)
-                 return data["reason"]
-
-     def _compute_score(self) -> float:
-         number_of_verdicts = len(self.verdicts)
-         if number_of_verdicts == 0:
-             return 1
-
-         correct_count = 0
-         for verdict in self.verdicts:
-             if verdict.verdict.strip().lower() == "yes":
-                 correct_count += 1
-
-         score = correct_count / number_of_verdicts
-         return 0 if self.strict_mode and score < self.threshold else score
-
-     def score_example(
-         self,
-         example: Example,
-         _show_indicator: bool = True,
-     ) -> float:
-         check_example_params(example, required_params, self)
-
-         with scorer_progress_meter(self, display_meter=_show_indicator):
-             try:
-                 if self.async_mode:
-                     loop = get_or_create_event_loop()
-                     loop.run_until_complete(
-                         self.a_score_example(example, _show_indicator=False)
-                     )
-                 else:
-                     self.statements = self._get_statements(example.expected_output)
-                     self.verdicts = self._get_verdicts(example.actual_output)
-                     self.score = self._compute_score()
-                     self.reason = self._get_reason()
-                     self.success = self.score >= self.threshold
-                     self.verbose_logs = create_verbose_logs(
-                         self,
-                         steps=[
-                             f"Statements:\n{self.statements}",
-                             f"Verdicts:\n{[v.model_dump() for v in self.verdicts]}",
-                             f"Score: {self.score}\nReason: {self.reason}",
-                         ],
-                     )
-                 return self.score
-             except Exception as e:
-                 print(f"Error in score_example for AnswerCorrectnessScorer: {e}")
-                 raise
-
-     async def a_score_example(
-         self,
-         example: Example,
-         _show_indicator: bool = True,
-     ) -> float:
-         check_example_params(example, required_params, self)
-
-         with scorer_progress_meter(self, async_mode=True, display_meter=_show_indicator):
-             try:
-                 self.statements: List[str] = await self._a_get_statements(example.expected_output)
-                 self.verdicts: List[ACVerdict] = await self._a_get_verdicts(example.actual_output)
-                 self.score = self._compute_score()
-                 self.reason = await self._a_get_reason()
-                 self.success = self.score >= self.threshold
-                 self.verbose_logs = create_verbose_logs(
-                     self,
-                     steps=[
-                         f"Statements:\n{self.statements}",
-                         f"Verdicts:\n{[v.model_dump() for v in self.verdicts]}",
-                         f"Score: {self.score}\nReason: {self.reason}",
-                     ],
-                 )
-                 return self.score
-             except Exception as e:
-                 print(f"Error in a_score_example for AnswerCorrectnessScorer: {e}")
-                 raise
-
-     def _success_check(self) -> bool:
-         if self.error is not None:
-             self.success = False
-         else:
-             try:
-                 self.success = self.score >= self.threshold
-             except:
-                 self.success = False
-         return self.success
-
-     @property
-     def __name__(self):
-         return "Answer Correctness"
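
The removed scorer's arithmetic is simple: the score is the fraction of deduced statements that receive a "yes" verdict, forced to 0 in strict mode when it falls below the threshold. A self-contained restatement of that logic with a worked example (it mirrors the deleted _compute_score above and is not an import from the package):

    def compute_score(verdicts: list[str], strict_mode: bool, threshold: float) -> float:
        if not verdicts:
            return 1  # no statements to check counts as fully correct
        score = sum(v.strip().lower() == "yes" for v in verdicts) / len(verdicts)
        return 0 if strict_mode and score < threshold else score

    assert compute_score(["yes", "yes", "no", "yes"], strict_mode=False, threshold=0.5) == 0.75
    assert compute_score(["yes", "yes", "no", "yes"], strict_mode=True, threshold=1.0) == 0
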