judgeval-0.0.26-py3-none-any.whl → judgeval-0.0.27-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +476 -161
- judgeval/constants.py +4 -2
- judgeval/data/__init__.py +0 -3
- judgeval/data/datasets/eval_dataset_client.py +59 -20
- judgeval/data/result.py +34 -56
- judgeval/judgment_client.py +47 -15
- judgeval/run_evaluation.py +20 -36
- judgeval/scorers/score.py +9 -11
- {judgeval-0.0.26.dist-info → judgeval-0.0.27.dist-info}/METADATA +1 -1
- {judgeval-0.0.26.dist-info → judgeval-0.0.27.dist-info}/RECORD +12 -13
- judgeval/data/api_example.py +0 -98
- {judgeval-0.0.26.dist-info → judgeval-0.0.27.dist-info}/WHEEL +0 -0
- {judgeval-0.0.26.dist-info → judgeval-0.0.27.dist-info}/licenses/LICENSE.md +0 -0
{judgeval-0.0.26.dist-info → judgeval-0.0.27.dist-info}/RECORD
@@ -1,23 +1,22 @@
 judgeval/__init__.py,sha256=dtXxsCmI4eEsZdGSUMy8P_pA0bc2-OSGAgb2C__yJoA,252
 judgeval/clients.py,sha256=6VQmEqmfCngUdS2MuPBIpHvtDFqOENm8-_BmMvjLyRQ,944
-judgeval/constants.py,sha256=
+judgeval/constants.py,sha256=ksAXhAXovzJKH0uHOdQtREs168uCJRG79PooHNmEbYQ,5313
 judgeval/evaluation_run.py,sha256=RgJD60lJsunNQzObjo7iXnAzXWgubCLOAAuuamAAuoI,6354
-judgeval/judgment_client.py,sha256=
+judgeval/judgment_client.py,sha256=uf0V1-eu3qnFTwrQ_Ckcv8IiWRVv7dbvou4P4KjU6hM,26794
 judgeval/rules.py,sha256=B0ZL0pn72D4Jnlr0zMQ6CPHi7D8AQQRariXCVsiCMiI,20542
-judgeval/run_evaluation.py,sha256=
+judgeval/run_evaluation.py,sha256=N2ppmEE5WoSReChKjr_n0NcdAUlUR6Nua7M1C_3zHQ8,24949
 judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
 judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
 judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
-judgeval/common/tracer.py,sha256=
+judgeval/common/tracer.py,sha256=L6JkCHj6kxhtDzf9OPg5ZC-NUUH4VDvDcV4utPi_I38,57544
 judgeval/common/utils.py,sha256=LUQV5JfDr6wj7xHAJoNq-gofNZ6mjXbeKrGKzBME1KM,33533
-judgeval/data/__init__.py,sha256=
-judgeval/data/api_example.py,sha256=dzkrQ0xno08y6qNfqL2djXbapUyc2B2aQ5iANn0o4CY,3667
+judgeval/data/__init__.py,sha256=dG5ytBOeOWCTd5o0KP7IblqtW4G1EBaGreLWepM3jas,345
 judgeval/data/example.py,sha256=BhGBhamFWgH6wtvrRYM8dGtDfXh-cDxDhtNL5Gbdz_M,5892
-judgeval/data/result.py,sha256=
+judgeval/data/result.py,sha256=YHD-dVYJN4JFpM-YCGgBtSdFcGAOyWYL41sf0TE9Hzg,3122
 judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,3273
 judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
 judgeval/data/datasets/dataset.py,sha256=AFYjksV_wXx5CqFYJsl3aN8yZ6hC50O1myRuOJ8s8_E,12867
-judgeval/data/datasets/eval_dataset_client.py,sha256=
+judgeval/data/datasets/eval_dataset_client.py,sha256=P9fEmcNrjPPaiYbbLiEiBziZrIexA39HN9qzClt6uPE,12691
 judgeval/integrations/langgraph.py,sha256=fGDZOTlVbxTO4ErC-m9OSg3h-RkOIIWXCfhjgkKRh4E,11187
 judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
 judgeval/judges/base_judge.py,sha256=ch_S7uBB7lyv44Lf1d7mIGFpveOO58zOkkpImKgd9_4,994
@@ -31,7 +30,7 @@ judgeval/scorers/base_scorer.py,sha256=xdUlY3CnLdCQ1Z5iUeY22Bim5v-OQruZmaVF_4Y1m
 judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
 judgeval/scorers/judgeval_scorer.py,sha256=jq_rzfTG0XBTuLCaa6TlaK4YcT-LlgsO1LEm6hpOYdg,6601
 judgeval/scorers/prompt_scorer.py,sha256=PaAs2qRolw1P3_I061Xvk9qzvF4O-JR8g_39RqXnHcM,17728
-judgeval/scorers/score.py,sha256=
+judgeval/scorers/score.py,sha256=ObFAlMbNRcGrfBpH4WW_6OA3CjrneC539xSWhGH60GQ,18578
 judgeval/scorers/utils.py,sha256=iHQVTlIANbmCTXz9kTeSdOytgUZ_T74Re61ajqsk_WQ,6827
 judgeval/scorers/judgeval_scorers/__init__.py,sha256=xFRb62sp4JmBUSeuAB_pC_7kEGp-lGdqCRIu9--Bbdg,5992
 judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=mZ6b_5Dl04k3PaG24ICBajB_j43ody1II1OJhO1DkXo,1648
@@ -87,7 +86,7 @@ judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py
 judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py,sha256=Qk7lwHgRPYeGoxTOyclAh1VfGItfvHJ6l1t7Nk3SWFM,20927
 judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
 judgeval/utils/alerts.py,sha256=O19Xj7DA0YVjl8PWiuH4zfdZeu3yiLVvHfY8ah2wG0g,2759
-judgeval-0.0.
-judgeval-0.0.
-judgeval-0.0.
-judgeval-0.0.
+judgeval-0.0.27.dist-info/METADATA,sha256=yoUWIaLIDPksMYQSxDIbVFjtFVCxim6-5LSQ2P13a-U,5418
+judgeval-0.0.27.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.0.27.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.0.27.dist-info/RECORD,,
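Each RECORD entry has the form `path,sha256=<hash>,<size>`, where the hash is the unpadded urlsafe-base64 SHA-256 digest of the file and the size is in bytes, so the changed hashes above simply identify which modules were rewritten between 0.0.26 and 0.0.27. A minimal sketch for reproducing a RECORD line from a local file (the path passed in is only an example and assumes you run it from the installed package root):

import base64
import hashlib
from pathlib import Path

def record_entry(path: str) -> str:
    """Build a RECORD-style line (path,sha256=<digest>,<size>) for a local file."""
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=").decode()
    return f"{path},sha256={digest},{len(data)}"

# Compare the output against the corresponding RECORD line above.
print(record_entry("judgeval/constants.py"))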
judgeval/data/api_example.py
DELETED
@@ -1,98 +0,0 @@
-from typing import List, Optional, Dict, Any, Union
-from pydantic import BaseModel, ConfigDict, model_validator
-
-from judgeval.data.example import Example
-from judgeval.data.scorer_data import ScorerData
-from judgeval.common.logger import debug, error
-
-class ProcessExample(BaseModel):
-    """
-    ProcessExample is an `Example` object that contains intermediate information
-    about an undergoing evaluation on the original `Example`. It is used purely for
-    internal operations and keeping track of the evaluation process.
-    """
-    name: str
-    input: Optional[str] = None
-    actual_output: Optional[Union[str, List[str]]] = None
-    expected_output: Optional[Union[str, List[str]]] = None
-    context: Optional[list] = None
-    retrieval_context: Optional[list] = None
-    tools_called: Optional[list] = None
-    expected_tools: Optional[list] = None
-
-    # make these optional, not all test cases in a conversation will be evaluated
-    success: Optional[bool] = None
-    scorers_data: Optional[List[ScorerData]] = None
-    run_duration: Optional[float] = None
-    evaluation_cost: Optional[float] = None
-
-    order: Optional[int] = None
-    # These should map 1 to 1 from golden
-    additional_metadata: Optional[Dict] = None
-    comments: Optional[str] = None
-    trace_id: Optional[str] = None
-    model_config = ConfigDict(arbitrary_types_allowed=True)
-
-    def update_scorer_data(self, scorer_data: ScorerData):
-        """
-        Updates scorer data field of test case after the scorers have been
-        evaluated on this test case.
-        """
-        debug(f"Updating scorer data for example '{self.name}' with scorer: {scorer_data}")
-        # self.scorers_data is a list of ScorerData objects that contain the
-        # evaluation results of each scorer on this test case
-        if self.scorers_data is None:
-            self.scorers_data = [scorer_data]
-        else:
-            self.scorers_data.append(scorer_data)
-
-        if self.success is None:
-            # self.success will be None when it is a message
-            # in that case we will be setting success for the first time
-            self.success = scorer_data.success
-        else:
-            if scorer_data.success is False:
-                debug(f"Example '{self.name}' marked as failed due to scorer: {scorer_data}")
-                self.success = False
-
-    def update_run_duration(self, run_duration: float):
-        self.run_duration = run_duration
-
-
-def create_process_example(
-    example: Example,
-) -> ProcessExample:
-    """
-    When an LLM Test Case is executed, we track its progress using an ProcessExample.
-
-    This will track things like the success of the test case, as well as the metadata (such as verdicts and claims in Faithfulness).
-    """
-    success = True
-    if example.name is not None:
-        name = example.name
-    else:
-        name = "Test Case Placeholder"
-        debug(f"No name provided for example, using default name: {name}")
-    order = None
-    scorers_data = []
-
-    debug(f"Creating ProcessExample for: {name}")
-    process_ex = ProcessExample(
-        name=name,
-        input=example.input,
-        actual_output=example.actual_output,
-        expected_output=example.expected_output,
-        context=example.context,
-        retrieval_context=example.retrieval_context,
-        tools_called=example.tools_called,
-        expected_tools=example.expected_tools,
-        success=success,
-        scorers_data=scorers_data,
-        run_duration=None,
-        evaluation_cost=None,
-        order=order,
-        additional_metadata=example.additional_metadata,
-        trace_id=example.trace_id
-    )
-    return process_ex
-
File without changes
|
File without changes
|