judgeval 0.0.29__py3-none-any.whl → 0.0.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -34,6 +34,7 @@ class JudgevalScorer:
34
34
  async_mode: bool = True # Whether to run the scorer in async mode
35
35
  verbose_mode: bool = True # Whether to run the scorer in verbose mode
36
36
  include_reason: bool = False # Whether to include the reason in the output
37
+ custom_example: bool = False # Whether the scorer corresponds to CustomExamples
37
38
  error: Optional[str] = None # The error message if the scorer failed
38
39
  evaluation_cost: Optional[float] = None # The cost of running the scorer
39
40
  verbose_logs: Optional[str] = None # The verbose logs of the scorer
@@ -52,6 +53,7 @@ class JudgevalScorer:
52
53
  async_mode: bool = True,
53
54
  verbose_mode: bool = True,
54
55
  include_reason: bool = False,
56
+ custom_example: bool = False,
55
57
  error: Optional[str] = None,
56
58
  evaluation_cost: Optional[float] = None,
57
59
  verbose_logs: Optional[str] = None,
@@ -78,6 +80,7 @@ class JudgevalScorer:
78
80
  self.async_mode = async_mode
79
81
  self.verbose_mode = verbose_mode
80
82
  self.include_reason = include_reason
83
+ self.custom_example = custom_example
81
84
  self.error = error
82
85
  self.evaluation_cost = evaluation_cost
83
86
  self.verbose_logs = verbose_logs
@@ -15,6 +15,7 @@ from judgeval.scorers.judgeval_scorers.api_scorers import (
15
15
  ComparisonScorer as APIComparisonScorer,
16
16
  InstructionAdherenceScorer as APIInstructionAdherenceScorer,
17
17
  GroundednessScorer as APIGroundednessScorer,
18
+ DerailmentScorer as APIDerailmentScorer,
18
19
  )
19
20
 
20
21
  from judgeval.scorers.judgeval_scorers.local_implementations import (
@@ -153,6 +154,11 @@ GroundednessScorer = ScorerWrapper(
153
154
  api_implementation=APIGroundednessScorer,
154
155
  )
155
156
 
157
+ DerailmentScorer = ScorerWrapper(
158
+ api_implementation=APIDerailmentScorer,
159
+ local_implementation=LocalInstructionAdherenceScorer # TODO: add local implementation
160
+ )
161
+
156
162
  __all__ = [
157
163
  "ExecutionOrderScorer",
158
164
  "JSONCorrectnessScorer",
@@ -166,4 +172,5 @@ __all__ = [
166
172
  "Text2SQLScorer",
167
173
  "ComparisonScorer",
168
174
  "GroundednessScorer",
175
+ "DerailmentScorer",
169
176
  ]
@@ -11,7 +11,7 @@ from judgeval.scorers.judgeval_scorers.api_scorers.answer_correctness import Ans
11
11
  from judgeval.scorers.judgeval_scorers.api_scorers.comparison import ComparisonScorer
12
12
  from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import InstructionAdherenceScorer
13
13
  from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import GroundednessScorer
14
-
14
+ from judgeval.scorers.judgeval_scorers.api_scorers.derailment_scorer import DerailmentScorer
15
15
  __all__ = [
16
16
  "ExecutionOrderScorer",
17
17
  "JSONCorrectnessScorer",
@@ -26,4 +26,5 @@ __all__ = [
26
26
  "ComparisonScorer",
27
27
  "InstructionAdherenceScorer",
28
28
  "GroundednessScorer",
29
+ "DerailmentScorer",
29
30
  ]
@@ -0,0 +1,21 @@
1
+ """
2
+ `judgeval` derailment scorer
3
+
4
+ TODO add link to docs page for this scorer
5
+
6
+ """
7
+
8
+ # Internal imports
9
+ from judgeval.scorers.api_scorer import APIJudgmentScorer
10
+ from judgeval.constants import APIScorer
11
+
12
class DerailmentScorer(APIJudgmentScorer):
    """Scorer that flags conversation derailment via the Judgment API.

    This is a thin wrapper around `APIJudgmentScorer`: it carries no local
    scoring logic and only pins the score type to `APIScorer.DERAILMENT`.
    """

    def __init__(self, threshold: float = 0.5):
        """
        Args:
            threshold (float): Minimum score for the example to count as
                passing. Defaults to 0.5 so the scorer can be constructed
                without arguments; callers may still pass an explicit value.
        """
        super().__init__(
            threshold=threshold,
            score_type=APIScorer.DERAILMENT,
        )

    @property
    def __name__(self):
        # Human-readable display name used when reporting results.
        return "Derailment"
judgeval/scorers/score.py CHANGED
@@ -11,6 +11,7 @@ from rich.progress import Progress, SpinnerColumn, TextColumn
11
11
 
12
12
  from judgeval.data import (
13
13
  Example,
14
+ CustomExample,
14
15
  ScoringResult,
15
16
  generate_scoring_result,
16
17
  create_scorer_data,
@@ -240,7 +241,7 @@ async def score_with_indicator(
240
241
 
241
242
 
242
243
  async def a_execute_scoring(
243
- examples: List[Example],
244
+ examples: Union[List[Example], List[CustomExample]],
244
245
  scorers: List[JudgevalScorer],
245
246
  model: Optional[Union[str, List[str], JudgevalJudge]] = None,
246
247
  ignore_errors: bool = True,
@@ -256,7 +257,7 @@ async def a_execute_scoring(
256
257
  Each `Example` will be evaluated by all of the `JudgevalScorer`s in the `scorers` list.
257
258
 
258
259
  Args:
259
- examples (List[Example]): A list of `Example` objects to be evaluated.
260
+ examples (Union[List[Example], List[CustomExample]]): A list of `Example` objects to be evaluated.
260
261
  scorers (List[JudgevalScorer]): A list of `JudgevalScorer` objects to evaluate the examples.
261
262
  model (Union[str, List[str], JudgevalJudge]): The model to use for evaluation.
262
263
  ignore_errors (bool): Whether to ignore errors during evaluation.
@@ -313,7 +314,7 @@ async def a_execute_scoring(
313
314
  debug(f"Scorer threshold: {scorer.threshold}")
314
315
  if hasattr(scorer, 'model'):
315
316
  debug(f"Scorer model: {type(scorer.model).__name__}")
316
- if isinstance(ex, Example):
317
+ if isinstance(ex, Example) or isinstance(ex, CustomExample):
317
318
  if len(scorers) == 0:
318
319
  pbar.update(1)
319
320
  continue
@@ -339,7 +340,7 @@ async def a_execute_scoring(
339
340
  await asyncio.gather(*tasks)
340
341
  else:
341
342
  for i, ex in enumerate(examples):
342
- if isinstance(ex, Example):
343
+ if isinstance(ex, Example) or isinstance(ex, CustomExample):
343
344
  if len(scorers) == 0:
344
345
  continue
345
346
 
@@ -366,7 +367,7 @@ async def a_execute_scoring(
366
367
 
367
368
  async def a_eval_examples_helper(
368
369
  scorers: List[JudgevalScorer],
369
- example: Example,
370
+ example: Union[Example, CustomExample],
370
371
  scoring_results: List[ScoringResult],
371
372
  score_index: int,
372
373
  ignore_errors: bool,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: judgeval
3
- Version: 0.0.29
3
+ Version: 0.0.31
4
4
  Summary: Judgeval Package
5
5
  Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
6
6
  Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -1,46 +1,49 @@
1
1
  judgeval/__init__.py,sha256=dtXxsCmI4eEsZdGSUMy8P_pA0bc2-OSGAgb2C__yJoA,252
2
2
  judgeval/clients.py,sha256=6VQmEqmfCngUdS2MuPBIpHvtDFqOENm8-_BmMvjLyRQ,944
3
- judgeval/constants.py,sha256=ksAXhAXovzJKH0uHOdQtREs168uCJRG79PooHNmEbYQ,5313
4
- judgeval/evaluation_run.py,sha256=6Kft3wZDWkdBDZoMwOhWf7zSAOF4naI7Pcg_YlZaZY4,6394
5
- judgeval/judgment_client.py,sha256=uf0V1-eu3qnFTwrQ_Ckcv8IiWRVv7dbvou4P4KjU6hM,26794
3
+ judgeval/constants.py,sha256=XTqijsuuLEhUBXTjzNJVsee5U_Gl14ULLO5uQVW_nEE,5398
4
+ judgeval/evaluation_run.py,sha256=WGzx-Ug2qhSmunFo8NrmSstBRsOUc5KpKq0Lc51rqsM,6739
5
+ judgeval/judgment_client.py,sha256=FncHkjyFx2vfXv4cu4DzbOO0ideHNOWtHVbc8pSXNxk,29754
6
6
  judgeval/rules.py,sha256=B0ZL0pn72D4Jnlr0zMQ6CPHi7D8AQQRariXCVsiCMiI,20542
7
- judgeval/run_evaluation.py,sha256=N2ppmEE5WoSReChKjr_n0NcdAUlUR6Nua7M1C_3zHQ8,24949
7
+ judgeval/run_evaluation.py,sha256=2Mv1iLthJeFQZSVhjLOcJKRZ52Sy6OxLb2KyQ_yVwnA,28484
8
8
  judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
9
9
  judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
10
10
  judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
11
- judgeval/common/tracer.py,sha256=b-eQyC_MPwMAVQJS6wtVW0_7hzk8tC9EV6NZZoNjWos,58188
11
+ judgeval/common/tracer.py,sha256=9Qga-7rLFlQK-oM5eK1O_8Mn1SewIrPtFwWbSZFtSII,59651
12
12
  judgeval/common/utils.py,sha256=LUQV5JfDr6wj7xHAJoNq-gofNZ6mjXbeKrGKzBME1KM,33533
13
- judgeval/data/__init__.py,sha256=dG5ytBOeOWCTd5o0KP7IblqtW4G1EBaGreLWepM3jas,345
14
- judgeval/data/custom_api_example.py,sha256=uW_ZBzkDLWumtudmfRHAJQkVYpm2qWgcDf7vBNLpS-o,3444
15
- judgeval/data/example.py,sha256=BhGBhamFWgH6wtvrRYM8dGtDfXh-cDxDhtNL5Gbdz_M,5892
16
- judgeval/data/result.py,sha256=BT4f2FF5EFuiRjOmS4vuIXsrEwSlG16Vw3QaWi6PZzc,3122
13
+ judgeval/data/__init__.py,sha256=xuKx_KCVHGp6CXvQuVmKl3v7pJp-qDaz0NccKxwjtO0,481
14
+ judgeval/data/custom_example.py,sha256=QRBqiRiZS8UgVeTRHY0r1Jzm6yAYsyg6zmHxQGxdiQs,739
15
+ judgeval/data/example.py,sha256=cJrmPGLel_P2sy1UaRvuVSAi35EnA9XMR11Lhp4aDLo,5930
16
+ judgeval/data/result.py,sha256=Gb9tiSDsk1amXgh0cFG6JmlW_BMKxS2kuTwNA0rrHjA,3184
17
17
  judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,3273
18
+ judgeval/data/sequence.py,sha256=DlQUjyWQJB6iNmiftDZ9N6C-nPtrOC1e0JZ57U00zZk,2387
19
+ judgeval/data/sequence_run.py,sha256=GrnYSZBcZmt4tKQYA_1v09MFB8n3ccrkOJd4qyweHMg,1987
18
20
  judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
19
21
  judgeval/data/datasets/dataset.py,sha256=AFYjksV_wXx5CqFYJsl3aN8yZ6hC50O1myRuOJ8s8_E,12867
20
- judgeval/data/datasets/eval_dataset_client.py,sha256=P9fEmcNrjPPaiYbbLiEiBziZrIexA39HN9qzClt6uPE,12691
21
- judgeval/integrations/langgraph.py,sha256=fGDZOTlVbxTO4ErC-m9OSg3h-RkOIIWXCfhjgkKRh4E,11187
22
+ judgeval/data/datasets/eval_dataset_client.py,sha256=xzXlBJRBEEmwsB79_eepm0Da-Bz8yRodX7ttk-u-BxU,14986
23
+ judgeval/integrations/langgraph.py,sha256=J-cQfFP52TjJewdSTe-fcsUC4HDvjNbXoxmbmF0SgiE,11743
22
24
  judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
23
25
  judgeval/judges/base_judge.py,sha256=ch_S7uBB7lyv44Lf1d7mIGFpveOO58zOkkpImKgd9_4,994
24
26
  judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6KTg,2424
25
27
  judgeval/judges/mixture_of_judges.py,sha256=IJoi4Twk8ze1CJWVEp69k6TSqTCTGrmVYQ0qdffer60,15549
26
28
  judgeval/judges/together_judge.py,sha256=l00hhPerAZXg3oYBd8cyMtWsOTNt_0FIqoxhKJKQe3k,2302
27
29
  judgeval/judges/utils.py,sha256=9lvUxziGV86ISvVFxYBWc09TWFyAQgUTyPf_a9mD5Rs,2686
28
- judgeval/scorers/__init__.py,sha256=gkeKJvjXhswCnkEyjijrVvGVM3Om86egrZ-PUOGvNvI,1158
30
+ judgeval/scorers/__init__.py,sha256=Z_88Sr45gLFAIbMHzG1BF24TUQGCDiuP9QpmVFvSYJM,1204
29
31
  judgeval/scorers/api_scorer.py,sha256=NQ_CrrUPhSUk1k2Q8rKpCG_TU2FT32sFEqvb-Yi54B0,2688
30
32
  judgeval/scorers/base_scorer.py,sha256=xdUlY3CnLdCQ1Z5iUeY22Bim5v-OQruZmaVF_4Y1mC0,2183
31
33
  judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
32
- judgeval/scorers/judgeval_scorer.py,sha256=jq_rzfTG0XBTuLCaa6TlaK4YcT-LlgsO1LEm6hpOYdg,6601
34
+ judgeval/scorers/judgeval_scorer.py,sha256=79-JJurqHP-qTaWNWInx4SjvQYwXc9lvfPPNgwsh2yA,6773
33
35
  judgeval/scorers/prompt_scorer.py,sha256=PaAs2qRolw1P3_I061Xvk9qzvF4O-JR8g_39RqXnHcM,17728
34
- judgeval/scorers/score.py,sha256=ObFAlMbNRcGrfBpH4WW_6OA3CjrneC539xSWhGH60GQ,18578
36
+ judgeval/scorers/score.py,sha256=r9QiT4-LIvivcJ6XxByrbswKSO8eQTtAD1UlXT_lcmo,18741
35
37
  judgeval/scorers/utils.py,sha256=iHQVTlIANbmCTXz9kTeSdOytgUZ_T74Re61ajqsk_WQ,6827
36
- judgeval/scorers/judgeval_scorers/__init__.py,sha256=xFRb62sp4JmBUSeuAB_pC_7kEGp-lGdqCRIu9--Bbdg,5992
37
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=mZ6b_5Dl04k3PaG24ICBajB_j43ody1II1OJhO1DkXo,1648
38
+ judgeval/scorers/judgeval_scorers/__init__.py,sha256=kSmQWKeBvLeZMfLYNQSc2qbJYo1MFIQnf3P-D4ltuSM,6232
39
+ judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=_sDUBxSG536KGqXNi6dFpaYKghjEAadxBxaaxV9HuuE,1764
38
40
  judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=Fnd9CVIOZ73sWEWymsU5eBrrZqPFjMZ0BKpeW-PDyTg,711
39
41
  judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=oETeN9K0HSIRdL2SDqn82Vskpwh5SlKnZvs5VDm2OBU,658
40
42
  judgeval/scorers/judgeval_scorers/api_scorers/comparison.py,sha256=kuzf9OWvpY38yYSwlBgneLkUZwJNM4FQqvbS66keA90,1249
41
43
  judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py,sha256=tpSuzFAaW8X9xqA0aLLKwh7qmBK0Pc_bJZMIe_q412U,770
42
44
  judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py,sha256=pFVhk4pLtQ-FnNlbI-dFF-SIh69Jza7erHqiPkFWoBo,758
43
45
  judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py,sha256=RQ6DZwEhChfecd89Ey-T7ke--7qTaXZlRsNxwH8gaME,823
46
+ judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py,sha256=V9WPuwNMm097V7IknKs8UkmAk0yjnBXTcJha_BHXxTA,475
44
47
  judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py,sha256=Pb3CiNF2Ca826B92wJCVAi_68lJjLhqqCKwQKaflSUg,1294
45
48
  judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py,sha256=-BwOapqjryYNKNydtdkUiKIij76dY0O1jBmdc6dKazQ,692
46
49
  judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py,sha256=ntEEeTANEOsGlcbiTAF_3r6BeSJEaVDns8po8T0L6Vg,692
@@ -87,7 +90,7 @@ judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py
87
90
  judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py,sha256=Qk7lwHgRPYeGoxTOyclAh1VfGItfvHJ6l1t7Nk3SWFM,20927
88
91
  judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
89
92
  judgeval/utils/alerts.py,sha256=O19Xj7DA0YVjl8PWiuH4zfdZeu3yiLVvHfY8ah2wG0g,2759
90
- judgeval-0.0.29.dist-info/METADATA,sha256=UAJZhCDCKCMOSnvYVZ9M2v3O0QdMErKdYWeN2NBp2_M,5418
91
- judgeval-0.0.29.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
92
- judgeval-0.0.29.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
93
- judgeval-0.0.29.dist-info/RECORD,,
93
+ judgeval-0.0.31.dist-info/METADATA,sha256=g9288fIE7NDwXuqUylqCV0mby5hAY7yEztR8TOn5sNk,5418
94
+ judgeval-0.0.31.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
95
+ judgeval-0.0.31.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
96
+ judgeval-0.0.31.dist-info/RECORD,,
@@ -1,91 +0,0 @@
1
- from typing import List, Optional, Dict, Any, Union
2
- from pydantic import BaseModel, ConfigDict, model_validator
3
-
4
- from judgeval.data.example import Example
5
- from judgeval.data.custom_example import CustomExample
6
- from judgeval.data.scorer_data import ScorerData
7
- from judgeval.common.logger import debug, error
8
-
9
class ProcessExample(BaseModel):
    """
    Internal bookkeeping model for an `Example` that is being evaluated.

    Tracks the running success flag, the accumulated `ScorerData` produced by
    each scorer, and timing/cost metadata. It is never exposed to callers;
    it only exists while an evaluation is in flight.
    """
    name: str

    # Optional because not every test case in a conversation gets evaluated.
    success: Optional[bool] = None
    scorers_data: Optional[List[ScorerData]] = None
    run_duration: Optional[float] = None
    evaluation_cost: Optional[float] = None

    order: Optional[int] = None
    # These should map 1 to 1 from golden
    additional_metadata: Optional[Dict] = None
    comments: Optional[str] = None
    trace_id: Optional[str] = None
    model_config = ConfigDict(arbitrary_types_allowed=True)

    def update_scorer_data(self, scorer_data: ScorerData) -> None:
        """
        Record one scorer's result on this example.

        Appends `scorer_data` to `scorers_data` and folds its success flag
        into the example-level `success`: once any scorer reports failure,
        the example stays failed.
        """
        debug(f"Updating scorer data for example '{self.name}' with scorer: {scorer_data}")
        if self.scorers_data is None:
            self.scorers_data = []
        self.scorers_data.append(scorer_data)

        if self.success is None:
            # First result seen (e.g. when this entry was a bare message):
            # adopt the scorer's verdict as-is.
            self.success = scorer_data.success
        elif scorer_data.success is False:
            debug(f"Example '{self.name}' marked as failed due to scorer: {scorer_data}")
            self.success = False

    def update_run_duration(self, run_duration: float) -> None:
        """Store the wall-clock duration of this example's evaluation."""
        self.run_duration = run_duration
61
-
62
-
63
def create_process_custom_example(
    example: CustomExample,
) -> ProcessExample:
    """
    Build the `ProcessExample` used to track evaluation progress of a
    `CustomExample` (success flag, per-scorer metadata such as verdicts
    and claims, timing, cost).

    Args:
        example (CustomExample): The example about to be evaluated.

    Returns:
        ProcessExample: A fresh tracker, optimistically marked successful;
        `update_scorer_data` flips it to failed if any scorer fails.
    """
    if example.name is not None:
        name = example.name
    else:
        name = "Test Case Placeholder"
        debug(f"No name provided for example, using default name: {name}")

    debug(f"Creating ProcessExample for: {name}")
    return ProcessExample(
        name=name,
        success=True,
        scorers_data=[],
        run_duration=None,
        evaluation_cost=None,
        order=None,
        additional_metadata=example.additional_metadata,
        trace_id=example.trace_id,
    )