judgeval 0.0.29__py3-none-any.whl → 0.0.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +93 -55
- judgeval/constants.py +4 -2
- judgeval/data/__init__.py +4 -0
- judgeval/data/custom_example.py +18 -0
- judgeval/data/datasets/eval_dataset_client.py +62 -3
- judgeval/data/example.py +1 -0
- judgeval/data/result.py +7 -6
- judgeval/data/sequence.py +59 -0
- judgeval/data/sequence_run.py +42 -0
- judgeval/evaluation_run.py +12 -7
- judgeval/integrations/langgraph.py +89 -72
- judgeval/judgment_client.py +77 -14
- judgeval/run_evaluation.py +87 -13
- judgeval/scorers/__init__.py +2 -0
- judgeval/scorers/judgeval_scorer.py +3 -0
- judgeval/scorers/judgeval_scorers/__init__.py +7 -0
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -1
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +21 -0
- judgeval/scorers/score.py +6 -5
- {judgeval-0.0.29.dist-info → judgeval-0.0.31.dist-info}/METADATA +1 -1
- {judgeval-0.0.29.dist-info → judgeval-0.0.31.dist-info}/RECORD +23 -20
- judgeval/data/custom_api_example.py +0 -91
- {judgeval-0.0.29.dist-info → judgeval-0.0.31.dist-info}/WHEEL +0 -0
- {judgeval-0.0.29.dist-info → judgeval-0.0.31.dist-info}/licenses/LICENSE.md +0 -0
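Taken together, the headline changes are a new DerailmentScorer exposed through the scorer API and CustomExample support threaded through the data model and scoring pipeline (with new sequence.py and sequence_run.py modules alongside). A minimal usage sketch of the new scorer, assuming the JudgmentClient.run_evaluation call shape and top-level import paths carry over unchanged from 0.0.29 (this section does not show judgment_client.py's hunks):

# Hedged sketch: exercising the DerailmentScorer added in 0.0.31.
# run_evaluation()'s signature is assumed from the pre-existing client API;
# the DerailmentScorer import path assumes the re-export added in
# judgeval/scorers/__init__.py (+2 lines in this release); "gpt-4o" is a
# stand-in judge model.
from judgeval import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import DerailmentScorer

client = JudgmentClient()  # expects a Judgment API key in the environment
results = client.run_evaluation(
    examples=[Example(input="Plan a weekend in Paris", actual_output="...")],
    scorers=[DerailmentScorer(threshold=0.5)],
    model="gpt-4o",
)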
judgeval/scorers/judgeval_scorer.py
CHANGED
@@ -34,6 +34,7 @@ class JudgevalScorer:
     async_mode: bool = True  # Whether to run the scorer in async mode
     verbose_mode: bool = True  # Whether to run the scorer in verbose mode
     include_reason: bool = False  # Whether to include the reason in the output
+    custom_example: bool = False  # Whether the scorer corresponds to CustomExamples
     error: Optional[str] = None  # The error message if the scorer failed
     evaluation_cost: Optional[float] = None  # The cost of running the scorer
     verbose_logs: Optional[str] = None  # The verbose logs of the scorer
@@ -52,6 +53,7 @@ class JudgevalScorer:
         async_mode: bool = True,
         verbose_mode: bool = True,
         include_reason: bool = False,
+        custom_example: bool = False,
         error: Optional[str] = None,
         evaluation_cost: Optional[float] = None,
         verbose_logs: Optional[str] = None,
@@ -78,6 +80,7 @@ class JudgevalScorer:
         self.async_mode = async_mode
         self.verbose_mode = verbose_mode
         self.include_reason = include_reason
+        self.custom_example = custom_example
         self.error = error
         self.evaluation_cost = evaluation_cost
         self.verbose_logs = verbose_logs
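The new custom_example flag travels from the constructor into an instance attribute, so a scorer that targets CustomExample inputs opts in at construction time. A minimal sketch of a subclass, assuming JudgevalScorer's pre-existing score_type and threshold parameters keep their 0.0.29 names (the subclass and its score_type string are hypothetical):

# Hedged sketch: a custom scorer opting into CustomExample inputs.
from judgeval.scorers import JudgevalScorer

class MyCustomScorer(JudgevalScorer):  # hypothetical subclass for illustration
    def __init__(self, threshold: float = 0.5):
        super().__init__(
            score_type="my_custom_metric",  # assumed parameter name from 0.0.29
            threshold=threshold,
            custom_example=True,  # new in 0.0.31: marks the scorer for CustomExamples
        )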
judgeval/scorers/judgeval_scorers/__init__.py
CHANGED
@@ -15,6 +15,7 @@ from judgeval.scorers.judgeval_scorers.api_scorers import (
     ComparisonScorer as APIComparisonScorer,
     InstructionAdherenceScorer as APIInstructionAdherenceScorer,
     GroundednessScorer as APIGroundednessScorer,
+    DerailmentScorer as APIDerailmentScorer,
 )
 
 from judgeval.scorers.judgeval_scorers.local_implementations import (
@@ -153,6 +154,11 @@ GroundednessScorer = ScorerWrapper(
     api_implementation=APIGroundednessScorer,
 )
 
+DerailmentScorer = ScorerWrapper(
+    api_implementation=APIDerailmentScorer,
+    local_implementation=LocalInstructionAdherenceScorer  # TODO: add local implementation
+)
+
 __all__ = [
     "ExecutionOrderScorer",
     "JSONCorrectnessScorer",
@@ -166,4 +172,5 @@ __all__ = [
     "Text2SQLScorer",
     "ComparisonScorer",
     "GroundednessScorer",
+    "DerailmentScorer",
 ]
judgeval/scorers/judgeval_scorers/api_scorers/__init__.py
CHANGED
@@ -11,7 +11,7 @@ from judgeval.scorers.judgeval_scorers.api_scorers.answer_correctness import Ans
 from judgeval.scorers.judgeval_scorers.api_scorers.comparison import ComparisonScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import InstructionAdherenceScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import GroundednessScorer
-
+from judgeval.scorers.judgeval_scorers.api_scorers.derailment_scorer import DerailmentScorer
 __all__ = [
     "ExecutionOrderScorer",
     "JSONCorrectnessScorer",
@@ -26,4 +26,5 @@ __all__ = [
     "ComparisonScorer",
     "InstructionAdherenceScorer",
     "GroundednessScorer",
+    "DerailmentScorer",
 ]
judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py
ADDED
@@ -0,0 +1,21 @@
+"""
+`judgeval` answer relevancy scorer
+
+TODO add link to docs page for this scorer
+
+"""
+
+# Internal imports
+from judgeval.scorers.api_scorer import APIJudgmentScorer
+from judgeval.constants import APIScorer
+
+class DerailmentScorer(APIJudgmentScorer):
+    def __init__(self, threshold: float):
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.DERAILMENT,
+        )
+
+    @property
+    def __name__(self):
+        return "Derailment"
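The scorer is a thin client-side handle: construction binds a threshold to the server-side APIScorer.DERAILMENT score type, and the __name__ property supplies a display name. Instantiation, using the import path this diff adds to api_scorers/__init__.py:

from judgeval.scorers.judgeval_scorers.api_scorers import DerailmentScorer

scorer = DerailmentScorer(threshold=0.5)  # threshold is the only required argument
print(scorer.__name__)  # "Derailment", via the property defined above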
judgeval/scorers/score.py
CHANGED
@@ -11,6 +11,7 @@ from rich.progress import Progress, SpinnerColumn, TextColumn
 
 from judgeval.data import (
     Example,
+    CustomExample,
     ScoringResult,
     generate_scoring_result,
     create_scorer_data,
@@ -240,7 +241,7 @@ async def score_with_indicator(
 
 
 async def a_execute_scoring(
-    examples: List[Example],
+    examples: Union[List[Example], List[CustomExample]],
     scorers: List[JudgevalScorer],
     model: Optional[Union[str, List[str], JudgevalJudge]] = None,
     ignore_errors: bool = True,
@@ -256,7 +257,7 @@ async def a_execute_scoring(
     Each `Example` will be evaluated by all of the `JudgevalScorer`s in the `scorers` list.
 
     Args:
-        examples (List[Example]): A list of `Example` objects to be evaluated.
+        examples (Union[List[Example], List[CustomExample]]): A list of `Example` objects to be evaluated.
         scorers (List[JudgevalScorer]): A list of `JudgevalScorer` objects to evaluate the examples.
         model (Union[str, List[str], JudgevalJudge]): The model to use for evaluation.
         ignore_errors (bool): Whether to ignore errors during evaluation.
@@ -313,7 +314,7 @@ async def a_execute_scoring(
                 debug(f"Scorer threshold: {scorer.threshold}")
                 if hasattr(scorer, 'model'):
                     debug(f"Scorer model: {type(scorer.model).__name__}")
-            if isinstance(ex, Example):
+            if isinstance(ex, Example) or isinstance(ex, CustomExample):
                 if len(scorers) == 0:
                     pbar.update(1)
                     continue
@@ -339,7 +340,7 @@ async def a_execute_scoring(
         await asyncio.gather(*tasks)
     else:
         for i, ex in enumerate(examples):
-            if isinstance(ex, Example):
+            if isinstance(ex, Example) or isinstance(ex, CustomExample):
                 if len(scorers) == 0:
                     continue
 
@@ -366,7 +367,7 @@ async def a_execute_scoring(
 
 async def a_eval_examples_helper(
     scorers: List[JudgevalScorer],
-    example: Example,
+    example: Union[Example, CustomExample],
     scoring_results: List[ScoringResult],
     score_index: int,
     ignore_errors: bool,
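With the widened signature, callers can pass CustomExample lists straight into the scoring loop. A minimal sketch of driving it directly; with an empty scorer list each item just passes the new isinstance gate and is skipped, so this exercises only the type handling (a real call would supply JudgevalScorer instances):

# Hedged sketch: a_execute_scoring now accepts Example or CustomExample lists.
import asyncio
from judgeval.data import Example, CustomExample  # both exported per this diff
from judgeval.scorers.score import a_execute_scoring

examples = [Example(input="What is 2+2?", actual_output="4")]
results = asyncio.run(a_execute_scoring(examples, scorers=[]))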
{judgeval-0.0.29.dist-info → judgeval-0.0.31.dist-info}/RECORD
CHANGED
@@ -1,46 +1,49 @@
 judgeval/__init__.py,sha256=dtXxsCmI4eEsZdGSUMy8P_pA0bc2-OSGAgb2C__yJoA,252
 judgeval/clients.py,sha256=6VQmEqmfCngUdS2MuPBIpHvtDFqOENm8-_BmMvjLyRQ,944
-judgeval/constants.py,sha256=
-judgeval/evaluation_run.py,sha256=
-judgeval/judgment_client.py,sha256=
+judgeval/constants.py,sha256=XTqijsuuLEhUBXTjzNJVsee5U_Gl14ULLO5uQVW_nEE,5398
+judgeval/evaluation_run.py,sha256=WGzx-Ug2qhSmunFo8NrmSstBRsOUc5KpKq0Lc51rqsM,6739
+judgeval/judgment_client.py,sha256=FncHkjyFx2vfXv4cu4DzbOO0ideHNOWtHVbc8pSXNxk,29754
 judgeval/rules.py,sha256=B0ZL0pn72D4Jnlr0zMQ6CPHi7D8AQQRariXCVsiCMiI,20542
-judgeval/run_evaluation.py,sha256=
+judgeval/run_evaluation.py,sha256=2Mv1iLthJeFQZSVhjLOcJKRZ52Sy6OxLb2KyQ_yVwnA,28484
 judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
 judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
 judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
-judgeval/common/tracer.py,sha256=
+judgeval/common/tracer.py,sha256=9Qga-7rLFlQK-oM5eK1O_8Mn1SewIrPtFwWbSZFtSII,59651
 judgeval/common/utils.py,sha256=LUQV5JfDr6wj7xHAJoNq-gofNZ6mjXbeKrGKzBME1KM,33533
-judgeval/data/__init__.py,sha256=
-judgeval/data/
-judgeval/data/example.py,sha256=
-judgeval/data/result.py,sha256=
+judgeval/data/__init__.py,sha256=xuKx_KCVHGp6CXvQuVmKl3v7pJp-qDaz0NccKxwjtO0,481
+judgeval/data/custom_example.py,sha256=QRBqiRiZS8UgVeTRHY0r1Jzm6yAYsyg6zmHxQGxdiQs,739
+judgeval/data/example.py,sha256=cJrmPGLel_P2sy1UaRvuVSAi35EnA9XMR11Lhp4aDLo,5930
+judgeval/data/result.py,sha256=Gb9tiSDsk1amXgh0cFG6JmlW_BMKxS2kuTwNA0rrHjA,3184
 judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,3273
+judgeval/data/sequence.py,sha256=DlQUjyWQJB6iNmiftDZ9N6C-nPtrOC1e0JZ57U00zZk,2387
+judgeval/data/sequence_run.py,sha256=GrnYSZBcZmt4tKQYA_1v09MFB8n3ccrkOJd4qyweHMg,1987
 judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
 judgeval/data/datasets/dataset.py,sha256=AFYjksV_wXx5CqFYJsl3aN8yZ6hC50O1myRuOJ8s8_E,12867
-judgeval/data/datasets/eval_dataset_client.py,sha256=
-judgeval/integrations/langgraph.py,sha256=
+judgeval/data/datasets/eval_dataset_client.py,sha256=xzXlBJRBEEmwsB79_eepm0Da-Bz8yRodX7ttk-u-BxU,14986
+judgeval/integrations/langgraph.py,sha256=J-cQfFP52TjJewdSTe-fcsUC4HDvjNbXoxmbmF0SgiE,11743
 judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
 judgeval/judges/base_judge.py,sha256=ch_S7uBB7lyv44Lf1d7mIGFpveOO58zOkkpImKgd9_4,994
 judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6KTg,2424
 judgeval/judges/mixture_of_judges.py,sha256=IJoi4Twk8ze1CJWVEp69k6TSqTCTGrmVYQ0qdffer60,15549
 judgeval/judges/together_judge.py,sha256=l00hhPerAZXg3oYBd8cyMtWsOTNt_0FIqoxhKJKQe3k,2302
 judgeval/judges/utils.py,sha256=9lvUxziGV86ISvVFxYBWc09TWFyAQgUTyPf_a9mD5Rs,2686
-judgeval/scorers/__init__.py,sha256=
+judgeval/scorers/__init__.py,sha256=Z_88Sr45gLFAIbMHzG1BF24TUQGCDiuP9QpmVFvSYJM,1204
 judgeval/scorers/api_scorer.py,sha256=NQ_CrrUPhSUk1k2Q8rKpCG_TU2FT32sFEqvb-Yi54B0,2688
 judgeval/scorers/base_scorer.py,sha256=xdUlY3CnLdCQ1Z5iUeY22Bim5v-OQruZmaVF_4Y1mC0,2183
 judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
-judgeval/scorers/judgeval_scorer.py,sha256=
+judgeval/scorers/judgeval_scorer.py,sha256=79-JJurqHP-qTaWNWInx4SjvQYwXc9lvfPPNgwsh2yA,6773
 judgeval/scorers/prompt_scorer.py,sha256=PaAs2qRolw1P3_I061Xvk9qzvF4O-JR8g_39RqXnHcM,17728
-judgeval/scorers/score.py,sha256=
+judgeval/scorers/score.py,sha256=r9QiT4-LIvivcJ6XxByrbswKSO8eQTtAD1UlXT_lcmo,18741
 judgeval/scorers/utils.py,sha256=iHQVTlIANbmCTXz9kTeSdOytgUZ_T74Re61ajqsk_WQ,6827
-judgeval/scorers/judgeval_scorers/__init__.py,sha256=
-judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=
+judgeval/scorers/judgeval_scorers/__init__.py,sha256=kSmQWKeBvLeZMfLYNQSc2qbJYo1MFIQnf3P-D4ltuSM,6232
+judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=_sDUBxSG536KGqXNi6dFpaYKghjEAadxBxaaxV9HuuE,1764
 judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=Fnd9CVIOZ73sWEWymsU5eBrrZqPFjMZ0BKpeW-PDyTg,711
 judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=oETeN9K0HSIRdL2SDqn82Vskpwh5SlKnZvs5VDm2OBU,658
 judgeval/scorers/judgeval_scorers/api_scorers/comparison.py,sha256=kuzf9OWvpY38yYSwlBgneLkUZwJNM4FQqvbS66keA90,1249
 judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py,sha256=tpSuzFAaW8X9xqA0aLLKwh7qmBK0Pc_bJZMIe_q412U,770
 judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py,sha256=pFVhk4pLtQ-FnNlbI-dFF-SIh69Jza7erHqiPkFWoBo,758
 judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py,sha256=RQ6DZwEhChfecd89Ey-T7ke--7qTaXZlRsNxwH8gaME,823
+judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py,sha256=V9WPuwNMm097V7IknKs8UkmAk0yjnBXTcJha_BHXxTA,475
 judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py,sha256=Pb3CiNF2Ca826B92wJCVAi_68lJjLhqqCKwQKaflSUg,1294
 judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py,sha256=-BwOapqjryYNKNydtdkUiKIij76dY0O1jBmdc6dKazQ,692
 judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py,sha256=ntEEeTANEOsGlcbiTAF_3r6BeSJEaVDns8po8T0L6Vg,692
@@ -87,7 +90,7 @@ judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py
 judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py,sha256=Qk7lwHgRPYeGoxTOyclAh1VfGItfvHJ6l1t7Nk3SWFM,20927
 judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
 judgeval/utils/alerts.py,sha256=O19Xj7DA0YVjl8PWiuH4zfdZeu3yiLVvHfY8ah2wG0g,2759
-judgeval-0.0.
-judgeval-0.0.
-judgeval-0.0.
-judgeval-0.0.
+judgeval-0.0.31.dist-info/METADATA,sha256=g9288fIE7NDwXuqUylqCV0mby5hAY7yEztR8TOn5sNk,5418
+judgeval-0.0.31.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.0.31.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.0.31.dist-info/RECORD,,
judgeval/data/custom_api_example.py
REMOVED
@@ -1,91 +0,0 @@
-from typing import List, Optional, Dict, Any, Union
-from pydantic import BaseModel, ConfigDict, model_validator
-
-from judgeval.data.example import Example
-from judgeval.data.custom_example import CustomExample
-from judgeval.data.scorer_data import ScorerData
-from judgeval.common.logger import debug, error
-
-class ProcessExample(BaseModel):
-    """
-    ProcessExample is an `Example` object that contains intermediate information
-    about an undergoing evaluation on the original `Example`. It is used purely for
-    internal operations and keeping track of the evaluation process.
-    """
-    name: str
-    # input: Optional[str] = None
-    # actual_output: Optional[Union[str, List[str]]] = None
-    # expected_output: Optional[Union[str, List[str]]] = None
-    # context: Optional[list] = None
-    # retrieval_context: Optional[list] = None
-    # tools_called: Optional[list] = None
-    # expected_tools: Optional[list] = None
-
-    # make these optional, not all test cases in a conversation will be evaluated
-    success: Optional[bool] = None
-    scorers_data: Optional[List[ScorerData]] = None
-    run_duration: Optional[float] = None
-    evaluation_cost: Optional[float] = None
-
-    order: Optional[int] = None
-    # These should map 1 to 1 from golden
-    additional_metadata: Optional[Dict] = None
-    comments: Optional[str] = None
-    trace_id: Optional[str] = None
-    model_config = ConfigDict(arbitrary_types_allowed=True)
-
-    def update_scorer_data(self, scorer_data: ScorerData):
-        """
-        Updates scorer data field of test case after the scorers have been
-        evaluated on this test case.
-        """
-        debug(f"Updating scorer data for example '{self.name}' with scorer: {scorer_data}")
-        # self.scorers_data is a list of ScorerData objects that contain the
-        # evaluation results of each scorer on this test case
-        if self.scorers_data is None:
-            self.scorers_data = [scorer_data]
-        else:
-            self.scorers_data.append(scorer_data)
-
-        if self.success is None:
-            # self.success will be None when it is a message
-            # in that case we will be setting success for the first time
-            self.success = scorer_data.success
-        else:
-            if scorer_data.success is False:
-                debug(f"Example '{self.name}' marked as failed due to scorer: {scorer_data}")
-                self.success = False
-
-    def update_run_duration(self, run_duration: float):
-        self.run_duration = run_duration
-
-
-def create_process_custom_example(
-    example: CustomExample,
-) -> ProcessExample:
-    """
-    When an LLM Test Case is executed, we track its progress using an ProcessExample.
-
-    This will track things like the success of the test case, as well as the metadata (such as verdicts and claims in Faithfulness).
-    """
-    success = True
-    if example.name is not None:
-        name = example.name
-    else:
-        name = "Test Case Placeholder"
-        debug(f"No name provided for example, using default name: {name}")
-    order = None
-    scorers_data = []
-
-    debug(f"Creating ProcessExample for: {name}")
-    process_ex = ProcessExample(
-        name=name,
-        success=success,
-        scorers_data=scorers_data,
-        run_duration=None,
-        evaluation_cost=None,
-        order=order,
-        additional_metadata=example.additional_metadata,
-        trace_id=example.trace_id
-    )
-    return process_ex
{judgeval-0.0.29.dist-info → judgeval-0.0.31.dist-info}/WHEEL
File without changes
{judgeval-0.0.29.dist-info → judgeval-0.0.31.dist-info}/licenses/LICENSE.md
File without changes