judgeval 0.0.7__py3-none-any.whl → 0.0.9__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in that registry.
Files changed (22)
  1. judgeval/judgment_client.py +6 -5
  2. judgeval/scorers/__init__.py +2 -0
  3. judgeval/scorers/judgeval_scorer.py +2 -0
  4. judgeval/scorers/judgeval_scorers/__init__.py +4 -0
  5. judgeval/scorers/judgeval_scorers/classifiers/__init__.py +3 -0
  6. judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +3 -0
  7. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +54 -0
  8. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +10 -5
  9. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +11 -5
  10. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +10 -5
  11. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +10 -5
  12. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +10 -5
  13. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +11 -6
  14. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +10 -5
  15. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +5 -0
  16. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +13 -6
  17. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +10 -1
  18. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +10 -4
  19. {judgeval-0.0.7.dist-info → judgeval-0.0.9.dist-info}/METADATA +1 -1
  20. {judgeval-0.0.7.dist-info → judgeval-0.0.9.dist-info}/RECORD +22 -18
  21. {judgeval-0.0.7.dist-info → judgeval-0.0.9.dist-info}/WHEEL +0 -0
  22. {judgeval-0.0.7.dist-info → judgeval-0.0.9.dist-info}/licenses/LICENSE.md +0 -0
judgeval/judgment_client.py

@@ -267,7 +267,6 @@ class JudgmentClient:
 
         return response.json()["slug"]
 
-
     def assert_test(
         self,
         examples: List[Example],
@@ -275,12 +274,14 @@ class JudgmentClient:
         model: Union[str, List[str], JudgevalJudge],
         aggregator: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
-        log_results: bool = False,
-        project_name: str = "",
-        eval_run_name: str = "",
+        log_results: bool = True,
+        project_name: str = "default_project",
+        eval_run_name: str = "default_eval_run",
         override: bool = False,
     ) -> None:
-
+        """
+        Asserts a test by running the evaluation and checking the results for success
+        """
        results = self.run_evaluation(
            examples=examples,
            scorers=scorers,
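Note: assert_test now logs results by default and falls back to "default_project" / "default_eval_run" instead of empty strings. A minimal usage sketch under those new defaults; the Example fields and scorer choice are illustrative assumptions, not taken from this hunk:

# Hypothetical sketch: exercising the new assert_test defaults
# (log_results=True, project_name="default_project",
#  eval_run_name="default_eval_run").
from judgeval.judgment_client import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import AnswerRelevancyScorer  # exported per this diff

client = JudgmentClient()  # assumes the API key is configured in the env
example = Example(
    input="What is the capital of France?",
    actual_output="Paris is the capital of France.",
)

# No project_name/eval_run_name required anymore; results are logged
# under "default_project" / "default_eval_run" by default.
client.assert_test(
    examples=[example],
    scorers=[AnswerRelevancyScorer(threshold=0.8)],
    model="gpt-4o",
)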
judgeval/scorers/__init__.py

@@ -13,6 +13,7 @@ from judgeval.scorers.judgeval_scorers import (
     AnswerRelevancyScorer,
     ScorerWrapper,
     AnswerCorrectnessScorer,
+    Text2SQLScorer,
 )
 
 __all__ = [
@@ -31,4 +32,5 @@ __all__ = [
     "AnswerRelevancyScorer",
     "ScorerWrapper",
     "AnswerCorrectnessScorer",
+    "Text2SQLScorer",
 ]
judgeval/scorers/judgeval_scorer.py

@@ -58,6 +58,8 @@ class JudgevalScorer:
         additional_metadata: Optional[Dict] = None
     ):
         debug(f"Initializing CustomScorer with score_type={score_type}, threshold={threshold}")
+        if not 0 <= threshold <= 1:
+            raise ValueError("Threshold must be between 0 and 1")
         if strict_mode:
             warning("Strict mode enabled - scoring will be more rigorous")
         info(f"CustomScorer initialized with evaluation_model: {evaluation_model}")
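JudgevalScorer now rejects out-of-range thresholds at construction time rather than letting them silently skew results. The guard uses Python's chained comparison, so both inclusive bounds are checked in one expression; a standalone illustration of its semantics:

# Illustration only: same logic as the guard added above.
def check(threshold: float) -> None:
    if not 0 <= threshold <= 1:  # chained comparison: 0 <= t and t <= 1
        raise ValueError("Threshold must be between 0 and 1")

check(0.7)  # ok
check(1.0)  # ok: both bounds are inclusive
try:
    check(1.5)
except ValueError as err:
    print(err)  # Threshold must be between 0 and 1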
judgeval/scorers/judgeval_scorers/__init__.py

@@ -28,6 +28,9 @@ from judgeval.scorers.judgeval_scorers.local_implementations import (
     AnswerCorrectnessScorer as LocalAnswerCorrectnessScorer
 )
 
+from judgeval.scorers.judgeval_scorers.classifiers import Text2SQLScorer
+
+
 class ScorerWrapper:
     """
     Wrapper class that can dynamically load either API or local implementation of a scorer.
@@ -141,4 +144,5 @@ __all__ = [
     "ContextualPrecisionScorer",
     "ContextualRecallScorer",
     "AnswerRelevancyScorer",
+    "Text2SQLScorer",
 ]
judgeval/scorers/judgeval_scorers/classifiers/__init__.py

@@ -0,0 +1,3 @@
+from .text2sql import Text2SQLScorer
+
+__all__ = ["Text2SQLScorer"]
judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py

@@ -0,0 +1,3 @@
+from .text2sql_scorer import Text2SQLScorer
+
+__all__ = ["Text2SQLScorer"]
judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py

@@ -0,0 +1,54 @@
+"""
+ClassifierScorer implementation for basic Text-to-SQL evaluation.
+
+Takes a natural language query, a corresponding LLM-generated SQL query, and a table schema + (optional) metadata.
+Determines if the LLM-generated SQL query is valid and works for the natural language query.
+"""
+from judgeval.scorers import ClassifierScorer
+
+Text2SQLScorer = ClassifierScorer(
+    "Text to SQL",
+    slug="text2sql-1010101010",
+    threshold=1.0,
+    conversation=[{
+        "role": "system",
+        "content": """You will be given a natural language query, a corresponding LLM-generated SQL query, and a table schema + (optional) metadata.
+
+** TASK INSTRUCTIONS **
+Your task is to decide whether the LLM generated SQL query properly filters for what the natural language query is asking, based on the table schema + (optional) metadata.
+Additionally, you should check if the SQL query is valid based on the table schema (checking for syntax errors, false column names, etc.)
+
+** TIPS **
+- Look for correct references to the table schema for column names, table names, etc.
+- Check that the SQL query can be executed; make sure JOINs, GROUP BYs, ORDER BYs, etc. are valid with respect to the table schema.
+- Check that aggregation functions (COUNT, SUM, AVG, etc.) are used appropriately with GROUP BY clauses
+- Verify that WHERE conditions use the correct operators and data types for comparisons
+- Ensure LIMIT and OFFSET clauses make sense for the query's purpose
+- Check that JOINs use the correct keys and maintain referential integrity
+- Verify that ORDER BY clauses use valid column names and sort directions
+- Check for proper handling of NULL values where relevant
+- Ensure subqueries are properly constructed and correlated when needed
+- EVEN IF THE QUERY IS VALID, IF IT DOESN'T WORK FOR THE NATURAL LANGUAGE QUERY, YOU SHOULD CHOOSE "N" AS THE ANSWER.
+
+** FORMATTING YOUR ANSWER **
+If the SQL query is valid and works for the natural language query, choose option "Y" and otherwise "N". Provide a justification for your decision; if you choose "N", explain what about the LLM-generated SQL query is incorrect, or explain why it doesn't address the natural language query.
+IF YOUR JUSTIFICATION SHOWS THAT THE SQL QUERY IS VALID AND WORKS FOR THE NATURAL LANGUAGE QUERY, YOU SHOULD CHOOSE "Y" AS THE ANSWER.
+IF THE SQL QUERY IS INVALID, YOU SHOULD CHOOSE "N" AS THE ANSWER.
+
+** YOUR TURN **
+Natural language query:
+{{input}}
+
+LLM generated SQL query:
+{{actual_output}}
+
+Table schema:
+{{context}}
+"""
+    }],
+    options={
+        "Y": 1.0,
+        "N": 0.0
+    }
+)
+
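Text2SQLScorer is a ready-made ClassifierScorer: the judge fills {{input}}, {{actual_output}}, and {{context}} from an Example, answers "Y" (1.0) or "N" (0.0), and with threshold=1.0 anything but "Y" fails. A hedged usage sketch; the Example field names mirror the template slots, and the run call follows the client API shown earlier:

from judgeval.judgment_client import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import Text2SQLScorer

example = Example(
    # Filled into {{input}}
    input="Find the five most recent orders for customer 42",
    # Filled into {{actual_output}}
    actual_output=(
        "SELECT * FROM orders WHERE customer_id = 42 "
        "ORDER BY created_at DESC LIMIT 5"
    ),
    # Filled into {{context}}: the table schema (+ optional metadata)
    context=[
        "CREATE TABLE orders (id INT PRIMARY KEY, "
        "customer_id INT, created_at TIMESTAMP)"
    ],
)

client = JudgmentClient()
# Assumed call shape, mirroring the assert_test hunk above.
client.run_evaluation(examples=[example], scorers=[Text2SQLScorer], model="gpt-4o")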
judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py

@@ -1,6 +1,7 @@
 from typing import Optional, List, Union, Tuple
 from pydantic import BaseModel
 
+from judgeval.constants import APIScorer
 from judgeval.judges import JudgevalJudge
 from judgeval.judges.utils import create_judge
 from judgeval.data import Example, ExampleParams
@@ -38,13 +39,17 @@ class AnswerCorrectnessScorer(JudgevalScorer):
         strict_mode: bool = False,
         verbose_mode: bool = False
     ):
-        self.threshold = 1 if strict_mode else threshold
-        self.include_reason = include_reason
+        super().__init__(
+            score_type=APIScorer.ANSWER_CORRECTNESS,
+            threshold=1 if strict_mode else threshold,
+            evaluation_model=None,
+            include_reason=include_reason,
+            async_mode=async_mode,
+            strict_mode=strict_mode,
+            verbose_mode=verbose_mode
+        )
         self.model, self.using_native_model = create_judge(model)
         self.evaluation_model = self.model.get_model_name()
-        self.async_mode = async_mode
-        self.strict_mode = strict_mode
-        self.verbose_mode = verbose_mode
 
     async def _a_get_statements(self, expected_output: str) -> List[str]:
         prompt = AnswerCorrectnessTemplate.deduce_statements(
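This hunk establishes the refactor that repeats in the relevancy, precision, recall, faithfulness, hallucination, JSON-correctness, summarization, and tool-correctness scorers below: shared configuration now flows through JudgevalScorer.__init__ (which also runs the new threshold validation) instead of being assigned attribute by attribute. A condensed sketch of the pattern, with an illustrative class name:

# Condensed sketch of the recurring refactor; SomeLocalScorer is
# illustrative. Only judge wiring remains in the subclass body.
class SomeLocalScorer(JudgevalScorer):
    def __init__(self, threshold=0.5, model=None, include_reason=True,
                 async_mode=True, strict_mode=False, verbose_mode=False):
        super().__init__(
            score_type=APIScorer.ANSWER_CORRECTNESS,    # per-scorer enum value
            threshold=1 if strict_mode else threshold,  # strict mode pins to 1
            evaluation_model=None,                      # resolved just below
            include_reason=include_reason,
            async_mode=async_mode,
            strict_mode=strict_mode,
            verbose_mode=verbose_mode,
        )
        # Judge resolution stays scorer-local, as in the hunks in this diff.
        self.model, self.using_native_model = create_judge(model)
        self.evaluation_model = self.model.get_model_name()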
judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py

@@ -1,5 +1,6 @@
 from typing import Optional, List, Union, Tuple
 
+from judgeval.constants import APIScorer
 from judgeval.scorers.utils import (get_or_create_event_loop,
                                     scorer_progress_meter,
                                     create_verbose_logs,
@@ -34,13 +35,18 @@ class AnswerRelevancyScorer(JudgevalScorer):
         strict_mode: bool = False,
         verbose_mode: bool = False,
     ):
-        self.threshold = 1 if strict_mode else threshold
+        super().__init__(
+            score_type=APIScorer.ANSWER_RELEVANCY,
+            threshold=1 if strict_mode else threshold,
+            evaluation_model=None,
+            include_reason=include_reason,
+            async_mode=async_mode,
+            strict_mode=strict_mode,
+            verbose_mode=verbose_mode
+        )
         self.model, self.using_native_model = create_judge(model)
         self.evaluation_model = self.model.get_model_name()
-        self.include_reason = include_reason
-        self.async_mode = async_mode
-        self.strict_mode = strict_mode
-        self.verbose_mode = verbose_mode
+        print(self.model)
 
     def score_example(
         self,
judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py

@@ -4,6 +4,7 @@ from judgeval.judges import JudgevalJudge
 from judgeval.judges.utils import create_judge
 from judgeval.data import Example, ExampleParams
 from judgeval.scorers import JudgevalScorer
+from judgeval.constants import APIScorer
 from judgeval.scorers.utils import (
     get_or_create_event_loop,
     parse_response_json,
@@ -30,13 +31,17 @@ class ContextualPrecisionScorer(JudgevalScorer):
         strict_mode: bool = False,
         verbose_mode: bool = False,
     ):
-        self.threshold = 1 if strict_mode else threshold
-        self.include_reason = include_reason
+        super().__init__(
+            score_type=APIScorer.CONTEXTUAL_PRECISION,
+            threshold=1 if strict_mode else threshold,
+            evaluation_model=None,
+            include_reason=include_reason,
+            async_mode=async_mode,
+            strict_mode=strict_mode,
+            verbose_mode=verbose_mode
+        )
         self.model, self.using_native_model = create_judge(model)
         self.evaluation_model = self.model.get_model_name()
-        self.async_mode = async_mode
-        self.strict_mode = strict_mode
-        self.verbose_mode = verbose_mode
 
     def score_example(
         self,
judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py

@@ -1,5 +1,6 @@
 from typing import Optional, List, Union
 
+from judgeval.constants import APIScorer
 from judgeval.scorers.utils import (
     get_or_create_event_loop,
     parse_response_json,
@@ -32,14 +33,18 @@ class ContextualRecallScorer(JudgevalScorer):
         verbose_mode: bool = False,
         user: Optional[str] = None
     ):
+        super().__init__(
+            score_type=APIScorer.CONTEXTUAL_RECALL,
+            threshold=1 if strict_mode else threshold,
+            evaluation_model=None,
+            include_reason=include_reason,
+            async_mode=async_mode,
+            strict_mode=strict_mode,
+            verbose_mode=verbose_mode
+        )
         self.user = user
-        self.threshold = 1 if strict_mode else threshold
         self.model, self.using_native_model = create_judge(model)
         self.evaluation_model = self.model.get_model_name()
-        self.include_reason = include_reason
-        self.async_mode = async_mode
-        self.strict_mode = strict_mode
-        self.verbose_mode = verbose_mode
 
     def score_example(
         self,
judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py

@@ -1,6 +1,7 @@
 from typing import Optional, List, Union
 import asyncio
 
+from judgeval.constants import APIScorer
 from judgeval.scorers.utils import (get_or_create_event_loop,
                                     scorer_progress_meter,
                                     create_verbose_logs,
@@ -32,14 +33,18 @@ class ContextualRelevancyScorer(JudgevalScorer):
         verbose_mode: bool = False,
         user: Optional[str] = None
     ):
+        super().__init__(
+            score_type=APIScorer.CONTEXTUAL_RELEVANCY,
+            threshold=1 if strict_mode else threshold,
+            evaluation_model=None,
+            include_reason=include_reason,
+            async_mode=async_mode,
+            strict_mode=strict_mode,
+            verbose_mode=verbose_mode
+        )
         self.user = user
-        self.threshold = 1 if strict_mode else threshold
         self.model, self.using_native_model = create_judge(model)
         self.evaluation_model = self.model.get_model_name()
-        self.include_reason = include_reason
-        self.async_mode = async_mode
-        self.strict_mode = strict_mode
-        self.verbose_mode = verbose_mode
 
     def score_example(
         self,
judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py

@@ -3,7 +3,7 @@ Code for the local implementation of the Faithfulness metric.
 """
 from typing import List, Optional, Union
 
-
+from judgeval.constants import APIScorer
 from judgeval.data import (
     Example,
     ExampleParams
@@ -47,14 +47,19 @@ class FaithfulnessScorer(JudgevalScorer):
         verbose_mode: bool = False,
         user: Optional[str] = None
     ):
-        self.threshold = 1 if strict_mode else threshold
+        super().__init__(
+            score_type=APIScorer.FAITHFULNESS,
+            threshold=1 if strict_mode else threshold,
+            evaluation_model=None,
+            include_reason=include_reason,
+            async_mode=async_mode,
+            strict_mode=strict_mode,
+            verbose_mode=verbose_mode
+        )
+        self.user = user
         self.model, self.using_native_model = create_judge(model)
         self.using_native_model = True  # NOTE: SETTING THIS FOR LITELLM and TOGETHER usage
         self.evaluation_model = self.model.get_model_name()
-        self.include_reason = include_reason
-        self.async_mode = async_mode
-        self.strict_mode = strict_mode
-        self.verbose_mode = verbose_mode
 
     def score_example(
         self,
judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py

@@ -20,6 +20,7 @@ Hallucination is measuring the fraction of contexts that agree with output (do n
 
 from typing import Optional, Union, List
 
+from judgeval.constants import APIScorer
 from judgeval.scorers.utils import (get_or_create_event_loop,
                                     scorer_progress_meter,
                                     create_verbose_logs,
@@ -50,13 +51,17 @@ class HallucinationScorer(JudgevalScorer):
         strict_mode: bool = False,
         verbose_mode: bool = False,
     ):
-        self.threshold = 1 if strict_mode else threshold
+        super().__init__(
+            score_type=APIScorer.HALLUCINATION,
+            threshold=1 if strict_mode else threshold,
+            evaluation_model=None,
+            include_reason=include_reason,
+            async_mode=async_mode,
+            strict_mode=strict_mode,
+            verbose_mode=verbose_mode
+        )
         self.model, self.using_native_model = create_judge(model)
         self.evaluation_model = self.model.get_model_name()
-        self.include_reason = include_reason
-        self.async_mode = async_mode
-        self.strict_mode = strict_mode
-        self.verbose_mode = verbose_mode
 
     def score_example(
         self,
judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py

@@ -0,0 +1,5 @@
+from judgeval.scorers.judgeval_scorers.local_implementations.json_correctness.json_correctness_scorer import JsonCorrectnessScorer
+
+__all__ = [
+    "JsonCorrectnessScorer",
+]
judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py

@@ -1,7 +1,9 @@
 from typing import List, Optional, Union, Any
 from pydantic import BaseModel, ValidationError, create_model
 
+from judgeval.constants import APIScorer
 from judgeval.judges import JudgevalJudge
+from judgeval.judges.utils import create_judge
 from judgeval.scorers.utils import (get_or_create_event_loop,
                                     scorer_progress_meter,
                                     create_verbose_logs,
@@ -30,13 +32,18 @@ class JsonCorrectnessScorer(JudgevalScorer):
         verbose_mode: bool = False,
         user: Optional[str] = None
     ):
-        self.score_type = "json_correctness"
-        self.model = model
-        self.threshold = threshold
-        self.async_mode = async_mode
-        self.strict_mode = strict_mode
-        self.verbose_mode = verbose_mode
+        super().__init__(
+            score_type=APIScorer.JSON_CORRECTNESS,
+            threshold=1 if strict_mode else threshold,
+            evaluation_model=None,
+            include_reason=False,
+            async_mode=async_mode,
+            strict_mode=strict_mode,
+            verbose_mode=verbose_mode
+        )
         self.user = user
+        self.model, self.using_native_model = create_judge(model)
+        self.evaluation_model = self.model.get_model_name()
 
         if isinstance(json_schema, dict):
             # Convert to BaseModel
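JsonCorrectnessScorer previously bypassed the base initializer and stored the raw model; it now follows the same super().__init__ + create_judge pattern, and still accepts json_schema either as a BaseModel subclass or as a plain dict. The dict branch is truncated in this hunk, so the following conversion sketch is an assumption based on the create_model import above, not the package's exact code:

# Hedged sketch of a dict-to-BaseModel conversion via pydantic's
# create_model; the real judgeval conversion may differ.
from pydantic import BaseModel, create_model

json_schema = {"name": str, "age": int}  # illustrative field -> type map
DynamicModel = create_model(
    "DynamicModel",
    **{field: (ftype, ...) for field, ftype in json_schema.items()},
)

DynamicModel(name="Ada", age=36)  # validates against the generated model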
judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py

@@ -1,6 +1,7 @@
 from typing import List, Optional, Union
 import asyncio
 
+from judgeval.constants import APIScorer
 from judgeval.scorers.utils import (get_or_create_event_loop,
                                     scorer_progress_meter,
                                     create_verbose_logs,
@@ -36,7 +37,15 @@ class SummarizationScorer(JudgevalScorer):
         strict_mode: bool = False,
         verbose_mode: bool = False,
     ):
-        self.threshold = 1 if strict_mode else threshold
+        super().__init__(
+            score_type=APIScorer.SUMMARIZATION,
+            threshold=1 if strict_mode else threshold,
+            evaluation_model=None,
+            include_reason=include_reason,
+            async_mode=async_mode,
+            strict_mode=strict_mode,
+            verbose_mode=verbose_mode
+        )
         self.model, self.using_native_model = create_judge(model)
         self.evaluation_model = self.model.get_model_name()
 
judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py

@@ -1,5 +1,6 @@
 from typing import List, Union
 
+from judgeval.constants import APIScorer
 from judgeval.scorers.utils import (
     scorer_progress_meter,
     create_verbose_logs,
@@ -55,10 +56,15 @@ class ToolCorrectnessScorer(JudgevalScorer):
         should_exact_match: bool = False,
         should_consider_ordering: bool = False,
     ):
-        self.threshold = 1 if strict_mode else threshold
-        self.include_reason = include_reason
-        self.strict_mode = strict_mode
-        self.verbose_mode = verbose_mode
+        super().__init__(
+            score_type=APIScorer.TOOL_CORRECTNESS,
+            threshold=1 if strict_mode else threshold,
+            evaluation_model=None,
+            include_reason=include_reason,
+            async_mode=False,
+            strict_mode=strict_mode,
+            verbose_mode=verbose_mode
+        )
         self.should_exact_match = should_exact_match
         self.should_consider_ordering = should_consider_ordering
 
{judgeval-0.0.7.dist-info → judgeval-0.0.9.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.0.7
+Version: 0.0.9
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
{judgeval-0.0.7.dist-info → judgeval-0.0.9.dist-info}/RECORD

@@ -2,7 +2,7 @@ judgeval/__init__.py,sha256=xiiG4CkeaOtey4fusCd9CBz0BVqzTIbV-K2EFIU0rUM,283
 judgeval/clients.py,sha256=Ns5ljrgPPXUMo7fSPJxO12H64lcPyKeQPIVG_RMi2cM,1162
 judgeval/constants.py,sha256=5O1jWvxMCRyMSWhmkrvPqfBctx42c7kMtgTS7ORVcFw,1965
 judgeval/evaluation_run.py,sha256=KcIS7mDR_9XEdqYrJXFcrLz5IDMof34HcD5VtjZgV8w,5884
-judgeval/judgment_client.py,sha256=8Z4Woiv56qphYqlMI3bNy4rvQItZl_z9vNNd3UdrCes,11241
+judgeval/judgment_client.py,sha256=lVVVDxRQ750nd0wT827dca94YzThNjuFWWJ-BTFW7lg,11367
 judgeval/run_evaluation.py,sha256=A9jjtWPH2_5W43a1f98R8u-8PuVczoJZNCZIyCoRqi8,18918
 judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
 judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
@@ -24,15 +24,15 @@ judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6
 judgeval/judges/mixture_of_judges.py,sha256=OuGWCuXyqe7s_Y74ij90TJFRfHU-VAFyJVVrwBM0RO0,15532
 judgeval/judges/together_judge.py,sha256=x3jf-tq77QPXHeeoF739f69hE_0VceXD9FHLrVFdGVA,2275
 judgeval/judges/utils.py,sha256=YUvivcGV1OKLPMJ9N6aTvhA0r_zzJ2NXriPguiiaVaY,2110
-judgeval/scorers/__init__.py,sha256=3rq2VtszrJk9gZ3oAMVd7EGlSugr8aRlHWprMDgQPaQ,956
+judgeval/scorers/__init__.py,sha256=XcDdLn_s16rSQob0896oj4JXTA8-Xfl271TUEBj6Oew,998
 judgeval/scorers/api_scorer.py,sha256=88kCWr6IetLFn3ziTPG-lwDWvMhFUC6xfINU1MJBoho,2125
 judgeval/scorers/base_scorer.py,sha256=mbOReG88fWaqCnC8F0u5QepRlzgVkuOz89KEKYxrmMc,1794
 judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
-judgeval/scorers/judgeval_scorer.py,sha256=qpjyj7JZEgxoF8LR3pwkKeebmVC8qlITnKFN4YOUKXc,6165
+judgeval/scorers/judgeval_scorer.py,sha256=14SZ3sBZtGNM3BCegKebkNad9LTs5Tyhs0kD6l3wLAA,6275
 judgeval/scorers/prompt_scorer.py,sha256=bUv8eZNy1XGVM1gNMt33dgIVX6zj63bGAV6O0o0c7yg,17821
 judgeval/scorers/score.py,sha256=zJKG21h9Njyj2vS36CAFK2wlbOcHSKgrLgHV5_25KKw,18630
 judgeval/scorers/utils.py,sha256=dtueaJm8e3Ph3wj1vC-srzadgK_CoIlOefdvMQ-cwK8,6826
-judgeval/scorers/judgeval_scorers/__init__.py,sha256=077QnuBfw9Sy9RP2TF2oKCtt5PbaqBZLyiP-gczKShk,5092
+judgeval/scorers/judgeval_scorers/__init__.py,sha256=D12jJAKTcfmz8fDBkYeOmdzZMZsURuODIJ5p7Nk1lWE,5189
 judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=zFwH2TC5AFlpDRfVKc6GN4YTtnmeyALl-JRLoZD_Jco,1284
 judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=690G5askjE8dcbKPGvCF6JxAEM9QJUqb-3K-D6lI6oM,463
 judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=CqvvjV7AZqPlXh-PZaPKYPILHr15u4bIYiKBFjlk5i0,457
@@ -44,35 +44,39 @@ judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py,sha256=ffYwH3CexP
 judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py,sha256=CAZBQKwNSqpqAoOgStYfr-yP1Brug_6VRimRIQY-zdg,894
 judgeval/scorers/judgeval_scorers/api_scorers/summarization.py,sha256=-E3oxYbI0D_0q-_fGWh2jQHW9O4Pu7I7xvLWsHU6cn8,450
 judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py,sha256=17ppPXm962ew67GU5m0npzbPu3CuhgdKY_KmfPvKfu4,457
+judgeval/scorers/judgeval_scorers/classifiers/__init__.py,sha256=Qt81W5ZCwMvBAne0LfQDb8xvg5iOG1vEYP7WizgwAZo,67
+judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py,sha256=8iTzMvou1Dr8pybul6lZHKjc9Ye2-0_racRGYkhEdTY,74
+judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py,sha256=ly72Z7s_c8NID6-nQnuW8qEGEW2MqdvpJ-5WfXzbAQg,2579
 judgeval/scorers/judgeval_scorers/local_implementations/__init__.py,sha256=ZDbmYHwIbPD75Gj9JKtEWnpBdSVGGRmbn1_IOR6GR-c,1627
 judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py,sha256=cxxUEspgoIdSzJbwIIioamC0-xDqhYVfYAWxaYF-D_Y,177
-judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py,sha256=ZjLw3Usx3SsSRXGXLItNL_IEWo_UV8dxhc2mS9j-nGM,10073
+judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py,sha256=PDThn6SzqxgMXT7BpQs2TEBOsgfD5fi6fnKk31qaCTo,10227
 judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py,sha256=5B_G7PPEsfLq6cwWkKWcLuy2k_5RgoOzsW3wOZLIeMk,6703
 judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py,sha256=r6yae5iaWtlBL_cP8I-1SuhS9dulsy1e7W9Rcz82v6E,169
-judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py,sha256=3T3pDxJde6M8RxsPLXvQA16Dpo-sLECEnMCe7rpvNcY,10536
+judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py,sha256=QG-oxa6-c74VzTuni17RQ9aeT0t1lCuxQXDMznqX8rc,10714
 judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py,sha256=GfbKv595s1a0dB1No_kDsap6gfcr6dYRGiXx0PDb89k,6557
 judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py,sha256=J6tc-T60AVOEaNVuoVU0XIG6dvQri99Q0tnX_Tm-0vc,108
-judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py,sha256=5EYjUusMyDfiqatg8-_OJg8IDax-8Ib6aI1sQgi-6JA,9493
+judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py,sha256=tRgRyjGpc4Pe3nQ1c-5NeNYFvbulL7YEnoRa9zLp1gc,9649
 judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py,sha256=gddK6BQAFcW04vAad81kxIXCHJQp8CbCqMwudWKy7aM,4892
 judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py,sha256=4kjfqD_95muHZFo75S8_fbTcC1DI1onNIfMmr8gMZaI,99
-judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py,sha256=u8sknD9IsPfU7iF4725w5OmFKe1JEZbOYwsLcAy4m3E,9107
+judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py,sha256=hwAv_x3XwGDnSW3a75CTCgIW6eVg8ymdjDdJQvw5p0Y,9260
 judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py,sha256=boVq6IM7Iambc1ky_JJQ4ejnYoQQtYreG0SjO4iMyFU,6558
 judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py,sha256=JPCvrekKLbl_xdD49evhtiFIVocuegCpCBkn1auzTSE,184
-judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py,sha256=NCGFLrdkpEK_LPVQC21qY-0pEOrsdVC0RRcNn9IdREE,8759
+judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py,sha256=BtVgE7z-9PHfFRcvn96aEG5mXVcWBweVyty934hZdiU,8915
 judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py,sha256=6EHBfxWvhur9z14l8zCw5Z4Hb2uRo9Yv7qIhTRT7-aM,4591
 judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py,sha256=NbkSqPwxgF4T8KsvuIWhVyRwdOlo7mNHMFuRStTFnvk,154
-judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py,sha256=HdpxzQVYs79AdoNWmInS6yNYwPdwgqN23OHSzo1e9_4,11169
+judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py,sha256=4XqdcdgHg3evrg-IQwXmUHEyee1lZUjXRNEiQSvdpmQ,11341
 judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py,sha256=oxmCsouh5ExUMmlSuCDolpYR2y9c-yKth6PHrdsCH_g,11387
 judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py,sha256=fZk3UQxI9Nljf5qjCRLRkF0D-AERFHElI9cC83_cgV8,158
-judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py,sha256=ud-P20erpiLR-i-ycma7Bg8M_mxJ2yQliXPXr-Iwq3M,9521
+judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py,sha256=orCrEe1IH4NE7m-AkKMX0EHbysTuAwIqfohcQaU7XxQ,9670
 judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py,sha256=BkEu7Q_jIVdcdZSq37tMjitZFzACd8-iBTDDXfGbZig,4346
-judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py,sha256=eYqHTWiw0NOPHueswknmpdxrmrmSm6Jadq56Ncmv9B0,4218
+judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py,sha256=xQDw7o9JQ6qajusPnBH0MWBRJ5ct_Ao3pJELXxxVMRo,175
+judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py,sha256=WxIIK_sgHMQ3aLGvkzvYpcmUm6r62gvrAELimMLw3iM,4529
 judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py,sha256=mv6-XeLSV5yj1H98YYV2iTYVd88zKftZJP42Lgl6R80,89
 judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py,sha256=6GnRz2h-6Fwt4sl__0RgQOyo3n3iDO4MNuHWxdu-rrM,10242
-judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py,sha256=3FMn7EhM5IxNwJLGTcpeHODaOPJefMHW6rRizmlA93U,20775
+judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py,sha256=CBuE6oCxMzTdJoXFt_YPWBte88kedEQ9t3g52ZRztGY,21086
 judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py,sha256=JUB3TMqS1OHr6PqpIGqkyiBNbyfUaw7lZuUATjU3_ek,168
-judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py,sha256=oxhVDR3Pb55Kxp9KsvmuvHWKtMiV1BQRG6yaXEr5Bp8,5309
-judgeval-0.0.7.dist-info/METADATA,sha256=mo6GssA1Es1FTG8saMwzsoL5AoIHbKDXnTiTA238ZzQ,1204
-judgeval-0.0.7.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-judgeval-0.0.7.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
-judgeval-0.0.7.dist-info/RECORD,,
+judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py,sha256=CYGRJY5EuyICYzHrmFdLykwXakX8AC7G3Bhj7p6szfY,5493
+judgeval-0.0.9.dist-info/METADATA,sha256=D9-pDQVSwfHCVcZ85-AS6MaMhd1AGz1CAJ5fRLwrRsA,1204
+judgeval-0.0.9.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.0.9.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.0.9.dist-info/RECORD,,
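Each RECORD entry has the form path,sha256=<digest>,size, where the digest is unpadded URL-safe base64 SHA-256 per the wheel spec (PEP 376/427); the hash changes above are what pinpoint exactly which files differ between 0.0.7 and 0.0.9. A small checker for recomputing such a hash locally:

# Recompute a RECORD-style hash for an installed file and compare it
# against the recorded value. RECORD digests are unpadded URL-safe
# base64 encodings of the raw SHA-256 digest.
import base64
import hashlib

def record_hash(path: str) -> str:
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode()

# e.g. record_hash(".../judgeval/judgment_client.py") should return
# "sha256=lVVVDxRQ750nd0wT827dca94YzThNjuFWWJ-BTFW7lg" for 0.0.9.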