judgeval 0.0.30__py3-none-any.whl → 0.0.32__py3-none-any.whl
This diff shows the contents of publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- judgeval/__init__.py +3 -1
- judgeval/common/tracer.py +352 -117
- judgeval/constants.py +5 -3
- judgeval/data/__init__.py +4 -0
- judgeval/data/custom_example.py +18 -0
- judgeval/data/datasets/dataset.py +5 -1
- judgeval/data/datasets/eval_dataset_client.py +64 -5
- judgeval/data/example.py +1 -0
- judgeval/data/result.py +7 -6
- judgeval/data/sequence.py +55 -0
- judgeval/data/sequence_run.py +44 -0
- judgeval/evaluation_run.py +12 -7
- judgeval/integrations/langgraph.py +89 -72
- judgeval/judgment_client.py +70 -68
- judgeval/run_evaluation.py +87 -13
- judgeval/scorers/__init__.py +2 -0
- judgeval/scorers/judgeval_scorer.py +3 -0
- judgeval/scorers/judgeval_scorers/__init__.py +7 -0
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -1
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +21 -0
- judgeval/scorers/score.py +6 -5
- judgeval/version_check.py +22 -0
- {judgeval-0.0.30.dist-info → judgeval-0.0.32.dist-info}/METADATA +1 -1
- {judgeval-0.0.30.dist-info → judgeval-0.0.32.dist-info}/RECORD +26 -22
- judgeval/data/custom_api_example.py +0 -91
- {judgeval-0.0.30.dist-info → judgeval-0.0.32.dist-info}/WHEEL +0 -0
- {judgeval-0.0.30.dist-info → judgeval-0.0.32.dist-info}/licenses/LICENSE.md +0 -0
judgeval/run_evaluation.py
CHANGED
@@ -4,14 +4,15 @@ import time
 import sys
 import itertools
 import threading
-from typing import List, Dict, Any
+from typing import List, Dict, Any, Union
 from datetime import datetime
 from rich import print as rprint

 from judgeval.data import (
     ScorerData,
     ScoringResult,
-    Example
+    Example,
+    CustomExample
 )
 from judgeval.scorers import (
     JudgevalScorer,
@@ -22,6 +23,7 @@ from judgeval.scorers.score import a_execute_scoring
 from judgeval.constants import (
     ROOT_API,
     JUDGMENT_EVAL_API_URL,
+    JUDGMENT_SEQUENCE_EVAL_API_URL,
     JUDGMENT_EVAL_LOG_API_URL,
     MAX_CONCURRENT_EVALUATIONS,
     JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL
@@ -34,7 +36,7 @@ from judgeval.common.logger import (
     example_logging_context
 )
 from judgeval.evaluation_run import EvaluationRun
-
+from judgeval.data.sequence_run import SequenceRun

 def send_to_rabbitmq(evaluation_run: EvaluationRun) -> None:
     """
@@ -91,6 +93,36 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
         raise JudgmentAPIError(error_message)
     return response_data

+def execute_api_sequence_eval(sequence_run: SequenceRun) -> List[Dict]:
+    """
+    Executes an evaluation of a list of `Example`s using one or more `JudgmentScorer`s via the Judgment API.
+    """
+
+    try:
+        # submit API request to execute evals
+        payload = sequence_run.model_dump(warnings=False)
+        response = requests.post(
+            JUDGMENT_SEQUENCE_EVAL_API_URL,
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {sequence_run.judgment_api_key}",
+                "X-Organization-Id": sequence_run.organization_id
+            },
+            json=payload,
+            verify=True
+        )
+        response_data = response.json()
+    except Exception as e:
+        error(f"Error: {e}")
+        details = response.json().get("detail", "No details provided")
+        raise JudgmentAPIError("An error occurred while executing the Judgment API request: " + details)
+    # Check if the response status code is not 2XX
+    # Add check for the duplicate eval run name
+    if not response.ok:
+        error_message = response_data.get('detail', 'An unknown error occurred.')
+        error(f"Error: {error_message=}")
+        raise JudgmentAPIError(error_message)
+    return response_data

 def merge_results(api_results: List[ScoringResult], local_results: List[ScoringResult]) -> List[ScoringResult]:
     """
@@ -197,8 +229,8 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
     )

     if response.status_code == 409:
-        error(f"Eval run name '{eval_name}' already exists for this project. Please choose a different name or set the `
-        raise ValueError(f"Eval run name '{eval_name}' already exists for this project. Please choose a different name or set the `
+        error(f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true.")
+        raise ValueError(f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true.")

     if not response.ok:
         response_data = response.json()
@@ -211,7 +243,7 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
         raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")


-def log_evaluation_results(merged_results: List[ScoringResult],
+def log_evaluation_results(merged_results: List[ScoringResult], run: Union[EvaluationRun, SequenceRun]) -> str:
     """
     Logs evaluation results to the Judgment API database.

@@ -228,13 +260,12 @@ def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run:
            JUDGMENT_EVAL_LOG_API_URL,
            headers={
                "Content-Type": "application/json",
-                "Authorization": f"Bearer {
-                "X-Organization-Id":
+                "Authorization": f"Bearer {run.judgment_api_key}",
+                "X-Organization-Id": run.organization_id
            },
            json={
-                "results": [result.
-                "
-                "eval_name": evaluation_run.eval_name,
+                "results": [result.model_dump(warnings=False) for result in merged_results],
+                "run": run.model_dump(warnings=False)
            },
            verify=True
        )
@@ -303,6 +334,42 @@ def check_examples(examples: List[Example], scorers: List[APIJudgmentScorer]) ->
             # Example ID (usually random UUID) does not provide any helpful information for the user but printing the entire example is overdoing it
             print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")

+def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_errors: bool = True, async_execution: bool = False) -> List[ScoringResult]:
+    # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
+    if not override and sequence_run.log_results and not sequence_run.append:
+        check_eval_run_name_exists(
+            sequence_run.eval_name,
+            sequence_run.project_name,
+            sequence_run.judgment_api_key,
+            sequence_run.organization_id
+        )
+
+    # Execute evaluation using Judgment API
+    info("Starting API evaluation")
+    try:  # execute an EvaluationRun with just JudgmentScorers
+        debug("Sending request to Judgment API")
+        response_data: List[Dict] = run_with_spinner("Running Sequence Evaluation: ", execute_api_sequence_eval, sequence_run)
+
+        info(f"Received {len(response_data['results'])} results from API")
+    except JudgmentAPIError as e:
+        error(f"An error occurred while executing the Judgment API request: {str(e)}")
+        raise JudgmentAPIError(f"An error occurred while executing the Judgment API request: {str(e)}")
+    except ValueError as e:
+        raise ValueError(f"Please check your SequenceRun object, one or more fields are invalid: {str(e)}")
+
+    # Convert the response data to `ScoringResult` objects
+    debug("Processing API results")
+    api_results = []
+    for result in response_data["results"]:
+        api_results.append(ScoringResult(**result))
+
+    # TODO: allow for custom scorer on sequences
+    if sequence_run.log_results:
+        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, api_results, sequence_run)
+        rprint(pretty_str)
+
+
+
 def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_errors: bool = True, async_execution: bool = False) -> List[ScoringResult]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s
@@ -329,7 +396,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
     """

     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-    if not override and evaluation_run.log_results:
+    if not override and evaluation_run.log_results and not evaluation_run.append:
        check_eval_run_name_exists(
            evaluation_run.eval_name,
            evaluation_run.project_name,
@@ -373,12 +440,20 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
            local_scorers.append(scorer)
            debug(f"Added local scorer: {type(scorer).__name__}")

+    custom_example_check = [scorer.custom_example for scorer in local_scorers]
+    if any(custom_example_check) and not all(custom_example_check):
+        error("All scorers must be custom scorers if using custom examples")
+        raise ValueError("All scorers must be custom scorers if using custom examples")
+
     debug(f"Found {len(judgment_scorers)} judgment scorers and {len(local_scorers)} local scorers")

     api_results: List[ScoringResult] = []
     local_results: List[ScoringResult] = []

     if async_execution:
+        if len(local_scorers) > 0:
+            error("Local scorers are not supported in async execution")
+
        check_examples(evaluation_run.examples, evaluation_run.scorers)
        info("Starting async evaluation")
        payload = evaluation_run.model_dump(warnings=False)
@@ -396,7 +471,6 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
    else:
        if judgment_scorers:
            # Execute evaluation using Judgment API
-            check_examples(evaluation_run.examples, evaluation_run.scorers)
            info("Starting API evaluation")
            debug(f"Creating API evaluation run with {len(judgment_scorers)} scorers")
            try:  # execute an EvaluationRun with just JudgmentScorers
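For orientation, a minimal sketch of how the new sequence path might be driven directly. Only the SequenceRun fields that this diff actually reads are shown (eval_name, project_name, judgment_api_key, organization_id, log_results, append); the sequence payload itself is defined in judgeval/data/sequence_run.py, which is not included above, so treat the field names and values here as assumptions rather than the documented API.

    import os

    from judgeval.data.sequence_run import SequenceRun
    from judgeval.run_evaluation import run_sequence_eval

    # Hypothetical values; SequenceRun will also need its sequence/scorer
    # payload fields, which this diff does not show.
    sequence_run = SequenceRun(
        eval_name="checkout-flow-run-1",
        project_name="my-project",
        judgment_api_key=os.environ["JUDGMENT_API_KEY"],
        organization_id=os.environ["JUDGMENT_ORG_ID"],
        log_results=True,   # log scoring results to the Judgment backend when done
        append=False,       # when True, the duplicate run-name check is skipped
    )

    run_sequence_eval(sequence_run)  # POSTs to JUDGMENT_SEQUENCE_EVAL_API_URL, then logs results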
judgeval/scorers/__init__.py
CHANGED
@@ -17,6 +17,7 @@ from judgeval.scorers.judgeval_scorers import (
     ComparisonScorer,
     InstructionAdherenceScorer,
     GroundednessScorer,
+    DerailmentScorer,
 )

 __all__ = [
@@ -39,4 +40,5 @@ __all__ = [
     "ComparisonScorer",
     "InstructionAdherenceScorer",
     "GroundednessScorer",
+    "DerailmentScorer",
 ]
judgeval/scorers/judgeval_scorer.py
CHANGED
@@ -34,6 +34,7 @@ class JudgevalScorer:
     async_mode: bool = True  # Whether to run the scorer in async mode
     verbose_mode: bool = True  # Whether to run the scorer in verbose mode
     include_reason: bool = False  # Whether to include the reason in the output
+    custom_example: bool = False  # Whether the scorer corresponds to CustomExamples
     error: Optional[str] = None  # The error message if the scorer failed
     evaluation_cost: Optional[float] = None  # The cost of running the scorer
     verbose_logs: Optional[str] = None  # The verbose logs of the scorer
@@ -52,6 +53,7 @@ class JudgevalScorer:
         async_mode: bool = True,
         verbose_mode: bool = True,
         include_reason: bool = False,
+        custom_example: bool = False,
         error: Optional[str] = None,
         evaluation_cost: Optional[float] = None,
         verbose_logs: Optional[str] = None,
@@ -78,6 +80,7 @@ class JudgevalScorer:
         self.async_mode = async_mode
         self.verbose_mode = verbose_mode
         self.include_reason = include_reason
+        self.custom_example = custom_example
         self.error = error
         self.evaluation_cost = evaluation_cost
         self.verbose_logs = verbose_logs
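The new custom_example flag is how run_eval (above) decides whether a batch of local scorers operates on CustomExamples: mixing flagged and unflagged local scorers now raises a ValueError. A minimal sketch of a local scorer opting in, assuming the usual JudgevalScorer subclassing pattern; the constructor arguments other than custom_example and the score_example hook are not shown in this diff and are assumptions here.

    from judgeval.data import CustomExample
    from judgeval.scorers import JudgevalScorer

    class LengthCheckScorer(JudgevalScorer):
        # Illustrative only: score_type/threshold names mirror judgeval's
        # custom-scorer pattern rather than quoting this release's exact interface.
        def __init__(self, threshold: float = 0.5):
            super().__init__(
                score_type="length_check",
                threshold=threshold,
                custom_example=True,  # new in 0.0.32: scorer consumes CustomExamples
            )

        def score_example(self, example: CustomExample, *args, **kwargs) -> None:
            # toy logic so the sketch is complete; a real scorer would inspect the example
            self.score = 1.0
            self.success = self.score >= self.threshold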
judgeval/scorers/judgeval_scorers/__init__.py
CHANGED
@@ -15,6 +15,7 @@ from judgeval.scorers.judgeval_scorers.api_scorers import (
     ComparisonScorer as APIComparisonScorer,
     InstructionAdherenceScorer as APIInstructionAdherenceScorer,
     GroundednessScorer as APIGroundednessScorer,
+    DerailmentScorer as APIDerailmentScorer,
 )

 from judgeval.scorers.judgeval_scorers.local_implementations import (
@@ -153,6 +154,11 @@ GroundednessScorer = ScorerWrapper(
     api_implementation=APIGroundednessScorer,
 )

+DerailmentScorer = ScorerWrapper(
+    api_implementation=APIDerailmentScorer,
+    local_implementation=LocalInstructionAdherenceScorer  # TODO: add local implementation
+)
+
 __all__ = [
     "ExecutionOrderScorer",
     "JSONCorrectnessScorer",
@@ -166,4 +172,5 @@ __all__ = [
     "Text2SQLScorer",
     "ComparisonScorer",
     "GroundednessScorer",
+    "DerailmentScorer",
 ]
judgeval/scorers/judgeval_scorers/api_scorers/__init__.py
CHANGED
@@ -11,7 +11,7 @@ from judgeval.scorers.judgeval_scorers.api_scorers.answer_correctness import Ans
 from judgeval.scorers.judgeval_scorers.api_scorers.comparison import ComparisonScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import InstructionAdherenceScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import GroundednessScorer
-
+from judgeval.scorers.judgeval_scorers.api_scorers.derailment_scorer import DerailmentScorer
 __all__ = [
     "ExecutionOrderScorer",
     "JSONCorrectnessScorer",
@@ -26,4 +26,5 @@ __all__ = [
     "ComparisonScorer",
     "InstructionAdherenceScorer",
     "GroundednessScorer",
+    "DerailmentScorer",
 ]
judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py
ADDED
@@ -0,0 +1,21 @@
+"""
+`judgeval` answer relevancy scorer
+
+TODO add link to docs page for this scorer
+
+"""
+
+# Internal imports
+from judgeval.scorers.api_scorer import APIJudgmentScorer
+from judgeval.constants import APIScorer
+
+class DerailmentScorer(APIJudgmentScorer):
+    def __init__(self, threshold: float):
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.DERAILMENT,
+        )
+
+    @property
+    def __name__(self):
+        return "Derailment"
judgeval/scorers/score.py
CHANGED
@@ -11,6 +11,7 @@ from rich.progress import Progress, SpinnerColumn, TextColumn

 from judgeval.data import (
     Example,
+    CustomExample,
     ScoringResult,
     generate_scoring_result,
     create_scorer_data,
@@ -240,7 +241,7 @@ async def score_with_indicator(


 async def a_execute_scoring(
-    examples: List[Example],
+    examples: Union[List[Example], List[CustomExample]],
     scorers: List[JudgevalScorer],
     model: Optional[Union[str, List[str], JudgevalJudge]] = None,
     ignore_errors: bool = True,
@@ -256,7 +257,7 @@ async def a_execute_scoring(
     Each `Example` will be evaluated by all of the `JudgevalScorer`s in the `scorers` list.

     Args:
-        examples (List[Example]): A list of `Example` objects to be evaluated.
+        examples (Union[List[Example], List[CustomExample]]): A list of `Example` objects to be evaluated.
         scorers (List[JudgevalScorer]): A list of `JudgevalScorer` objects to evaluate the examples.
         model (Union[str, List[str], JudgevalJudge]): The model to use for evaluation.
         ignore_errors (bool): Whether to ignore errors during evaluation.
@@ -313,7 +314,7 @@ async def a_execute_scoring(
                 debug(f"Scorer threshold: {scorer.threshold}")
                 if hasattr(scorer, 'model'):
                     debug(f"Scorer model: {type(scorer.model).__name__}")
-            if isinstance(ex, Example):
+            if isinstance(ex, Example) or isinstance(ex, CustomExample):
                 if len(scorers) == 0:
                     pbar.update(1)
                     continue
@@ -339,7 +340,7 @@ async def a_execute_scoring(
             await asyncio.gather(*tasks)
     else:
         for i, ex in enumerate(examples):
-            if isinstance(ex, Example):
+            if isinstance(ex, Example) or isinstance(ex, CustomExample):
                 if len(scorers) == 0:
                     continue

@@ -366,7 +367,7 @@ async def a_execute_scoring(

 async def a_eval_examples_helper(
     scorers: List[JudgevalScorer],
-    example: Example,
+    example: Union[Example, CustomExample],
     scoring_results: List[ScoringResult],
     score_index: int,
     ignore_errors: bool,
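With these changes, a_execute_scoring accepts CustomExample lists as long as every local scorer sets custom_example=True (enforced in run_eval above). A hedged sketch of driving it directly, reusing the hypothetical LengthCheckScorer from the judgeval_scorer.py section; CustomExample's fields live in judgeval/data/custom_example.py, which is not part of this diff, so the constructor arguments and model name below are assumptions:

    import asyncio

    from judgeval.data import CustomExample
    from judgeval.scorers.score import a_execute_scoring

    # Field names on CustomExample are assumed; model choice is illustrative.
    examples = [CustomExample(input={"question": "hi"}, actual_output={"answer": "hello"})]

    results = asyncio.run(a_execute_scoring(examples, [LengthCheckScorer()], model="gpt-4o-mini"))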
judgeval/version_check.py
ADDED
@@ -0,0 +1,22 @@
+import importlib.metadata
+import requests
+import threading
+
+def check_latest_version(package_name: str = "judgeval"):
+    def _check():
+        try:
+            current_version = importlib.metadata.version(package_name)
+            response = requests.get(f"https://pypi.org/pypi/{package_name}/json", timeout=2)
+            latest_version = response.json()["info"]["version"]
+
+            if current_version != latest_version:
+                print(
+                    f"\033[93mUPDATE AVAILABLE:\033[0m You are using '{package_name}=={current_version}', "
+                    f"but the latest version is '{latest_version}'. While this version is still supported, "
+                    f"we recommend upgrading to avoid potential issues or missing features: "
+                    f"`pip install --upgrade {package_name}`"
+                )
+        except Exception:
+            pass
+
+    threading.Thread(target=_check, daemon=True).start()
{judgeval-0.0.30.dist-info → judgeval-0.0.32.dist-info}/RECORD
CHANGED
@@ -1,46 +1,50 @@
-judgeval/__init__.py,sha256=
+judgeval/__init__.py,sha256=x9HWt4waJwJMAqTuJSg2MezF9Zg-macEjeU-ajbly-8,330
 judgeval/clients.py,sha256=6VQmEqmfCngUdS2MuPBIpHvtDFqOENm8-_BmMvjLyRQ,944
-judgeval/constants.py,sha256=
-judgeval/evaluation_run.py,sha256=
-judgeval/judgment_client.py,sha256=
+judgeval/constants.py,sha256=_XmVAkebMyGrDvvanAVlMgVd4p6MLHdEVsTQFI0kz1k,5411
+judgeval/evaluation_run.py,sha256=WGzx-Ug2qhSmunFo8NrmSstBRsOUc5KpKq0Lc51rqsM,6739
+judgeval/judgment_client.py,sha256=k0q2s5A0RkhF9ElD9o-KWN10H36t3Of2PrvNF-silf8,26141
 judgeval/rules.py,sha256=B0ZL0pn72D4Jnlr0zMQ6CPHi7D8AQQRariXCVsiCMiI,20542
-judgeval/run_evaluation.py,sha256=
+judgeval/run_evaluation.py,sha256=hnEY8QckEviXYNJutf-6tLFq2DWCzqWV1EVyPvrVXyA,28512
+judgeval/version_check.py,sha256=bvJEidB7rAeXozoUbN9Yb97QOR_s2hgvpvj74jJ5HlY,943
 judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
 judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
 judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
-judgeval/common/tracer.py,sha256=
+judgeval/common/tracer.py,sha256=owRRfIZXPUOVCCn0macygnf18mcp8am1eULGnZXD0Kk,68876
 judgeval/common/utils.py,sha256=LUQV5JfDr6wj7xHAJoNq-gofNZ6mjXbeKrGKzBME1KM,33533
-judgeval/data/__init__.py,sha256=
-judgeval/data/
-judgeval/data/example.py,sha256=
-judgeval/data/result.py,sha256=
+judgeval/data/__init__.py,sha256=xuKx_KCVHGp6CXvQuVmKl3v7pJp-qDaz0NccKxwjtO0,481
+judgeval/data/custom_example.py,sha256=QRBqiRiZS8UgVeTRHY0r1Jzm6yAYsyg6zmHxQGxdiQs,739
+judgeval/data/example.py,sha256=cJrmPGLel_P2sy1UaRvuVSAi35EnA9XMR11Lhp4aDLo,5930
+judgeval/data/result.py,sha256=Gb9tiSDsk1amXgh0cFG6JmlW_BMKxS2kuTwNA0rrHjA,3184
 judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,3273
+judgeval/data/sequence.py,sha256=Fkk2HJGnPboH-Fvwgxub_ryG0eUXa3cbsj7ZD0qkeBo,2204
+judgeval/data/sequence_run.py,sha256=RmYjfWKMWg-pcF5PLeiWfrhuDkjDZi5VEmAIEXN3Ib0,2104
 judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
-judgeval/data/datasets/dataset.py,sha256=
-judgeval/data/datasets/eval_dataset_client.py,sha256=
-judgeval/integrations/langgraph.py,sha256=
+judgeval/data/datasets/dataset.py,sha256=dhLo30hvpmmOK2R6O5wDs_neawUJ4lS8bb4S42SufNQ,13034
+judgeval/data/datasets/eval_dataset_client.py,sha256=xjj66BO9Es9IxXqzQe1RT_e0kpeKlt7OrhRoSuj4KHM,15085
+judgeval/integrations/langgraph.py,sha256=J-cQfFP52TjJewdSTe-fcsUC4HDvjNbXoxmbmF0SgiE,11743
 judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
 judgeval/judges/base_judge.py,sha256=ch_S7uBB7lyv44Lf1d7mIGFpveOO58zOkkpImKgd9_4,994
 judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6KTg,2424
 judgeval/judges/mixture_of_judges.py,sha256=IJoi4Twk8ze1CJWVEp69k6TSqTCTGrmVYQ0qdffer60,15549
 judgeval/judges/together_judge.py,sha256=l00hhPerAZXg3oYBd8cyMtWsOTNt_0FIqoxhKJKQe3k,2302
 judgeval/judges/utils.py,sha256=9lvUxziGV86ISvVFxYBWc09TWFyAQgUTyPf_a9mD5Rs,2686
-judgeval/scorers/__init__.py,sha256=
+judgeval/scorers/__init__.py,sha256=Z_88Sr45gLFAIbMHzG1BF24TUQGCDiuP9QpmVFvSYJM,1204
 judgeval/scorers/api_scorer.py,sha256=NQ_CrrUPhSUk1k2Q8rKpCG_TU2FT32sFEqvb-Yi54B0,2688
 judgeval/scorers/base_scorer.py,sha256=xdUlY3CnLdCQ1Z5iUeY22Bim5v-OQruZmaVF_4Y1mC0,2183
 judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
-judgeval/scorers/judgeval_scorer.py,sha256=
+judgeval/scorers/judgeval_scorer.py,sha256=79-JJurqHP-qTaWNWInx4SjvQYwXc9lvfPPNgwsh2yA,6773
 judgeval/scorers/prompt_scorer.py,sha256=PaAs2qRolw1P3_I061Xvk9qzvF4O-JR8g_39RqXnHcM,17728
-judgeval/scorers/score.py,sha256=
+judgeval/scorers/score.py,sha256=r9QiT4-LIvivcJ6XxByrbswKSO8eQTtAD1UlXT_lcmo,18741
 judgeval/scorers/utils.py,sha256=iHQVTlIANbmCTXz9kTeSdOytgUZ_T74Re61ajqsk_WQ,6827
-judgeval/scorers/judgeval_scorers/__init__.py,sha256=
-judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=
+judgeval/scorers/judgeval_scorers/__init__.py,sha256=kSmQWKeBvLeZMfLYNQSc2qbJYo1MFIQnf3P-D4ltuSM,6232
+judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=_sDUBxSG536KGqXNi6dFpaYKghjEAadxBxaaxV9HuuE,1764
 judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=Fnd9CVIOZ73sWEWymsU5eBrrZqPFjMZ0BKpeW-PDyTg,711
 judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=oETeN9K0HSIRdL2SDqn82Vskpwh5SlKnZvs5VDm2OBU,658
 judgeval/scorers/judgeval_scorers/api_scorers/comparison.py,sha256=kuzf9OWvpY38yYSwlBgneLkUZwJNM4FQqvbS66keA90,1249
 judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py,sha256=tpSuzFAaW8X9xqA0aLLKwh7qmBK0Pc_bJZMIe_q412U,770
 judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py,sha256=pFVhk4pLtQ-FnNlbI-dFF-SIh69Jza7erHqiPkFWoBo,758
 judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py,sha256=RQ6DZwEhChfecd89Ey-T7ke--7qTaXZlRsNxwH8gaME,823
+judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py,sha256=V9WPuwNMm097V7IknKs8UkmAk0yjnBXTcJha_BHXxTA,475
 judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py,sha256=Pb3CiNF2Ca826B92wJCVAi_68lJjLhqqCKwQKaflSUg,1294
 judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py,sha256=-BwOapqjryYNKNydtdkUiKIij76dY0O1jBmdc6dKazQ,692
 judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py,sha256=ntEEeTANEOsGlcbiTAF_3r6BeSJEaVDns8po8T0L6Vg,692
@@ -87,7 +91,7 @@ judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py
 judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py,sha256=Qk7lwHgRPYeGoxTOyclAh1VfGItfvHJ6l1t7Nk3SWFM,20927
 judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
 judgeval/utils/alerts.py,sha256=O19Xj7DA0YVjl8PWiuH4zfdZeu3yiLVvHfY8ah2wG0g,2759
-judgeval-0.0.
-judgeval-0.0.
-judgeval-0.0.
-judgeval-0.0.
+judgeval-0.0.32.dist-info/METADATA,sha256=RJzqlHJwfYiOXEcyEEO5WQBM0DC1zQDuoN-Plix6U38,5418
+judgeval-0.0.32.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.0.32.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.0.32.dist-info/RECORD,,
judgeval/data/custom_api_example.py
DELETED
@@ -1,91 +0,0 @@
-from typing import List, Optional, Dict, Any, Union
-from pydantic import BaseModel, ConfigDict, model_validator
-
-from judgeval.data.example import Example
-from judgeval.data.custom_example import CustomExample
-from judgeval.data.scorer_data import ScorerData
-from judgeval.common.logger import debug, error
-
-class ProcessExample(BaseModel):
-    """
-    ProcessExample is an `Example` object that contains intermediate information
-    about an undergoing evaluation on the original `Example`. It is used purely for
-    internal operations and keeping track of the evaluation process.
-    """
-    name: str
-    # input: Optional[str] = None
-    # actual_output: Optional[Union[str, List[str]]] = None
-    # expected_output: Optional[Union[str, List[str]]] = None
-    # context: Optional[list] = None
-    # retrieval_context: Optional[list] = None
-    # tools_called: Optional[list] = None
-    # expected_tools: Optional[list] = None
-
-    # make these optional, not all test cases in a conversation will be evaluated
-    success: Optional[bool] = None
-    scorers_data: Optional[List[ScorerData]] = None
-    run_duration: Optional[float] = None
-    evaluation_cost: Optional[float] = None
-
-    order: Optional[int] = None
-    # These should map 1 to 1 from golden
-    additional_metadata: Optional[Dict] = None
-    comments: Optional[str] = None
-    trace_id: Optional[str] = None
-    model_config = ConfigDict(arbitrary_types_allowed=True)
-
-    def update_scorer_data(self, scorer_data: ScorerData):
-        """
-        Updates scorer data field of test case after the scorers have been
-        evaluated on this test case.
-        """
-        debug(f"Updating scorer data for example '{self.name}' with scorer: {scorer_data}")
-        # self.scorers_data is a list of ScorerData objects that contain the
-        # evaluation results of each scorer on this test case
-        if self.scorers_data is None:
-            self.scorers_data = [scorer_data]
-        else:
-            self.scorers_data.append(scorer_data)
-
-        if self.success is None:
-            # self.success will be None when it is a message
-            # in that case we will be setting success for the first time
-            self.success = scorer_data.success
-        else:
-            if scorer_data.success is False:
-                debug(f"Example '{self.name}' marked as failed due to scorer: {scorer_data}")
-                self.success = False
-
-    def update_run_duration(self, run_duration: float):
-        self.run_duration = run_duration
-
-
-def create_process_custom_example(
-    example: CustomExample,
-) -> ProcessExample:
-    """
-    When an LLM Test Case is executed, we track its progress using an ProcessExample.
-
-    This will track things like the success of the test case, as well as the metadata (such as verdicts and claims in Faithfulness).
-    """
-    success = True
-    if example.name is not None:
-        name = example.name
-    else:
-        name = "Test Case Placeholder"
-        debug(f"No name provided for example, using default name: {name}")
-    order = None
-    scorers_data = []
-
-    debug(f"Creating ProcessExample for: {name}")
-    process_ex = ProcessExample(
-        name=name,
-        success=success,
-        scorers_data=scorers_data,
-        run_duration=None,
-        evaluation_cost=None,
-        order=order,
-        additional_metadata=example.additional_metadata,
-        trace_id=example.trace_id
-    )
-    return process_ex
{judgeval-0.0.30.dist-info → judgeval-0.0.32.dist-info}/WHEEL
File without changes
{judgeval-0.0.30.dist-info → judgeval-0.0.32.dist-info}/licenses/LICENSE.md
File without changes