judgeval 0.0.39__py3-none-any.whl → 0.0.41__py3-none-any.whl
This diff shows the changes between publicly available package versions released to one of the supported registries. The information is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- judgeval/clients.py +6 -4
- judgeval/common/tracer.py +504 -257
- judgeval/common/utils.py +5 -1
- judgeval/constants.py +2 -0
- judgeval/data/__init__.py +2 -1
- judgeval/data/datasets/dataset.py +12 -6
- judgeval/data/datasets/eval_dataset_client.py +3 -1
- judgeval/data/example.py +7 -7
- judgeval/data/tool.py +29 -1
- judgeval/data/trace.py +31 -39
- judgeval/data/trace_run.py +2 -1
- judgeval/evaluation_run.py +4 -7
- judgeval/judgment_client.py +34 -7
- judgeval/run_evaluation.py +67 -19
- judgeval/scorers/__init__.py +4 -1
- judgeval/scorers/judgeval_scorer.py +12 -1
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +4 -0
- judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +124 -0
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +20 -0
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +1 -1
- judgeval/scorers/prompt_scorer.py +8 -164
- judgeval/scorers/score.py +15 -15
- judgeval-0.0.41.dist-info/METADATA +1450 -0
- {judgeval-0.0.39.dist-info → judgeval-0.0.41.dist-info}/RECORD +26 -24
- judgeval-0.0.39.dist-info/METADATA +0 -247
- {judgeval-0.0.39.dist-info → judgeval-0.0.41.dist-info}/WHEEL +0 -0
- {judgeval-0.0.39.dist-info → judgeval-0.0.41.dist-info}/licenses/LICENSE.md +0 -0
judgeval/run_evaluation.py
CHANGED
@@ -1,6 +1,7 @@
 import asyncio
 import requests
 import time
+import json
 import sys
 import itertools
 import threading
@@ -204,9 +205,9 @@ def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResul
     )
     return results

-def check_experiment_type(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str,
+def check_experiment_type(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str, is_trace: bool) -> None:
     """
-    Checks if the current experiment, if one exists, has the same type (examples of
+    Checks if the current experiment, if one exists, has the same type (examples of traces)
     """
     try:
         response = requests.post(
@@ -220,7 +221,7 @@ def check_experiment_type(eval_name: str, project_name: str, judgment_api_key: s
                 "eval_name": eval_name,
                 "project_name": project_name,
                 "judgment_api_key": judgment_api_key,
-                "
+                "is_trace": is_trace
             },
             verify=True
         )
@@ -362,14 +363,26 @@ def check_examples(examples: List[Example], scorers: List[Union[APIJudgmentScore
     """
     Checks if the example contains the necessary parameters for the scorer.
     """
+    prompt_user = False
     for scorer in scorers:
         for example in examples:
             missing_params = []
             for param in scorer.required_params:
                 if getattr(example, param.value) is None:
-                    missing_params.append(f"
+                    missing_params.append(f"{param.value}")
             if missing_params:
-
+                rprint(f"[yellow]⚠️ WARNING:[/yellow] Example is missing required parameters for scorer [bold]{scorer.score_type.value}[/bold]")
+                rprint(f"Missing parameters: {', '.join(missing_params)}")
+                rprint(f"Example: {json.dumps(example.model_dump(), indent=2)}")
+                rprint("-"*40)
+                prompt_user = True
+
+    if prompt_user:
+        user_input = input("Do you want to continue? (y/n)")
+        if user_input.lower() != "y":
+            sys.exit(0)
+        else:
+            rprint("[green]Continuing...[/green]")

 def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: bool = True, function: Optional[Callable] = None, tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None, examples: Optional[List[Example]] = None) -> List[ScoringResult]:
     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
@@ -382,7 +395,7 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
         )

     if trace_run.append:
-        # Check that the current experiment, if one exists, has the same type (examples
+        # Check that the current experiment, if one exists, has the same type (examples or traces)
         check_experiment_type(
             trace_run.eval_name,
             trace_run.project_name,
@@ -390,21 +403,27 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
             trace_run.organization_id,
             True
         )
-
     if function and tracer:
         new_traces: List[Trace] = []
         tracer.offline_mode = True
+        tracer.traces = []
         for example in examples:
             if example.input:
-
+                if isinstance(example.input, str):
+                    result = run_with_spinner("Running agent function: ", function, example.input)
+                elif isinstance(example.input, dict):
+                    result = run_with_spinner("Running agent function: ", function, **example.input)
+                else:
+                    raise ValueError(f"Input must be string or dict, got {type(example.input)}")
             else:
                 result = run_with_spinner("Running agent function: ", function)
         for i, trace in enumerate(tracer.traces):
             # We set the root-level trace span with the expected tools of the Trace
             trace = Trace(**trace)
-            trace.
+            trace.trace_spans[0].expected_tools = examples[i].expected_tools
             new_traces.append(trace)
         trace_run.traces = new_traces
+        tracer.traces = []

     # Execute evaluation using Judgment API
     info("Starting API evaluation")
@@ -423,7 +442,7 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
     debug("Processing API results")
     # TODO: allow for custom scorer on traces
     if trace_run.log_results:
-        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["
+        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["agent_results"], trace_run)
         rprint(pretty_str)

     return scoring_results
@@ -504,7 +523,8 @@ async def _poll_evaluation_until_complete(eval_name: str, project_name: str, jud
         info(f"Polling for evaluation '{eval_name}' in project '{project_name}' (attempt {poll_count})")

         # Check status
-        response =
+        response = await asyncio.to_thread(
+            requests.get,
             JUDGMENT_GET_EVAL_STATUS_API_URL,
             headers={
                 "Content-Type": "application/json",
@@ -531,7 +551,8 @@ async def _poll_evaluation_until_complete(eval_name: str, project_name: str, jud
         # If complete, get results and return
         if status == "completed" or status == "complete":
             info(f"Evaluation '{eval_name}' reported as completed, fetching and verifying results...")
-            results_response =
+            results_response = await asyncio.to_thread(
+                requests.post,
                 JUDGMENT_EVAL_FETCH_API_URL,
                 headers={
                     "Content-Type": "application/json",
@@ -723,7 +744,18 @@ class SpinnerWrappedTask:

     def __await__(self):
         async def _spin_and_await():
-
+            # self.task resolves to (scoring_results, pretty_str_to_print)
+            task_result_tuple = await await_with_spinner(self.task, self.message)
+
+            # Unpack the tuple
+            scoring_results, pretty_str_to_print = task_result_tuple
+
+            # Print the pretty string if it exists, after spinner is cleared
+            if pretty_str_to_print:
+                rprint(pretty_str_to_print)
+
+            # Return only the scoring_results to the original awaiter
+            return scoring_results
         return _spin_and_await().__await__()

     # Proxy all Task attributes and methods to the underlying task
@@ -756,7 +788,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
         )

     if evaluation_run.append:
-        # Check that the current experiment, if one exists, has the same type (examples of
+        # Check that the current experiment, if one exists, has the same type (examples of traces)
         check_experiment_type(
             evaluation_run.eval_name,
             evaluation_run.project_name,
@@ -769,8 +801,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
     debug("Initializing examples with IDs and timestamps")
     for idx, example in enumerate(evaluation_run.examples):
         example.example_index = idx # Set numeric index
-        example.
-        with example_logging_context(example.timestamp, example.example_id):
+        with example_logging_context(example.created_at, example.example_id):
             debug(f"Initialized example {example.example_id} (index: {example.example_index})")
             debug(f"Input: {example.input}")
             debug(f"Actual output: {example.actual_output}")
@@ -824,7 +855,8 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
            payload = evaluation_run.model_dump(warnings=False)

            # Send the evaluation to the queue
-            response =
+            response = await asyncio.to_thread(
+                requests.post,
                JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
                headers={
                    "Content-Type": "application/json",
@@ -843,13 +875,28 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
            info(f"Successfully added evaluation '{evaluation_run.eval_name}' to queue")

            # Poll until the evaluation is complete
-
+            results = await _poll_evaluation_until_complete(
                eval_name=evaluation_run.eval_name,
                project_name=evaluation_run.project_name,
                judgment_api_key=evaluation_run.judgment_api_key,
                organization_id=evaluation_run.organization_id,
                original_examples=evaluation_run.examples # Pass the original examples
            )
+
+            pretty_str_to_print = None
+            if evaluation_run.log_results and results: # Ensure results exist before logging
+                send_results = [scoring_result.model_dump(warnings=False) for scoring_result in results]
+                try:
+                    # Run the blocking log_evaluation_results in a separate thread
+                    pretty_str_to_print = await asyncio.to_thread(
+                        log_evaluation_results,
+                        send_results,
+                        evaluation_run
+                    )
+                except Exception as e:
+                    error(f"Error logging results after async evaluation: {str(e)}")
+
+            return results, pretty_str_to_print

        # Create a regular task
        task = asyncio.create_task(_async_evaluation_workflow())
@@ -860,6 +907,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
            f"Processing evaluation '{evaluation_run.eval_name}': "
        )
    else:
+        check_examples(evaluation_run.examples, evaluation_run.scorers)
        if judgment_scorers:
            # Execute evaluation using Judgment API
            info("Starting API evaluation")
@@ -895,7 +943,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
        # We should be removing local scorers soon
        info("Starting local evaluation")
        for example in evaluation_run.examples:
-            with example_logging_context(example.
+            with example_logging_context(example.created_at, example.example_id):
                debug(f"Processing example {example.example_id}: {example.input}")

        results: List[ScoringResult] = asyncio.run(
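The recurring change in this file is that blocking `requests` calls made from async code (status polling, result fetching, queueing) are now wrapped in `asyncio.to_thread(...)` so they run in a worker thread instead of stalling the event loop. Below is a minimal, self-contained sketch of that pattern; the URL and payload are placeholders, not judgeval's real endpoints.

```python
import asyncio
import requests

async def poll_status(url: str, params: dict, interval: float = 2.0) -> dict:
    """Poll a blocking HTTP endpoint from async code without blocking the event loop."""
    while True:
        # requests.get is synchronous; asyncio.to_thread (Python 3.9+) runs it in a worker thread
        response = await asyncio.to_thread(requests.get, url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()
        if data.get("status") in ("completed", "complete"):
            return data
        await asyncio.sleep(interval)  # yield control between polls

# Hypothetical usage:
# result = asyncio.run(poll_status("https://api.example.com/status", {"eval_name": "demo"}))
```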
judgeval/scorers/__init__.py
CHANGED
@@ -1,6 +1,6 @@
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.scorers.judgeval_scorer import JudgevalScorer
-from judgeval.scorers.prompt_scorer import PromptScorer
+from judgeval.scorers.prompt_scorer import PromptScorer
 from judgeval.scorers.judgeval_scorers.api_scorers import (
     ExecutionOrderScorer,
     JSONCorrectnessScorer,
@@ -17,6 +17,8 @@ from judgeval.scorers.judgeval_scorers.api_scorers import (
     GroundednessScorer,
     DerailmentScorer,
     ToolOrderScorer,
+    ClassifierScorer,
+    ToolDependencyScorer,
 )
 from judgeval.scorers.judgeval_scorers.classifiers import (
     Text2SQLScorer,
@@ -43,4 +45,5 @@ __all__ = [
     "GroundednessScorer",
     "DerailmentScorer",
     "ToolOrderScorer",
+    "ToolDependencyScorer",
 ]
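With these re-exports in place, both new scorers resolve from the package root; a minimal sketch:

```python
# Both additions are importable alongside the existing scorers
from judgeval.scorers import ClassifierScorer, ToolDependencyScorer
```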
judgeval/scorers/judgeval_scorer.py
CHANGED
@@ -12,7 +12,7 @@ from judgeval.common.logger import debug, info, warning, error
 from judgeval.judges import JudgevalJudge
 from judgeval.judges.utils import create_judge
 from judgeval.constants import UNBOUNDED_SCORERS
-
+from judgeval.data.example import ExampleParams
 class JudgevalScorer:
     """
     Base class for scorers in `judgeval`.
@@ -39,6 +39,9 @@ class JudgevalScorer:
     evaluation_cost: Optional[float] = None # The cost of running the scorer
     verbose_logs: Optional[str] = None # The verbose logs of the scorer
     additional_metadata: Optional[Dict] = None # Additional metadata for the scorer
+    required_params: Optional[List[ExampleParams]] = None # The required parameters for the scorer
+    error: Optional[str] = None
+    success: Optional[bool] = None

     def __init__(
         self,
@@ -49,6 +52,7 @@ class JudgevalScorer:
         reason: Optional[str] = None,
         success: Optional[bool] = None,
         evaluation_model: Optional[str] = None,
+        required_params: Optional[List[ExampleParams]] = None,
         strict_mode: bool = False,
         async_mode: bool = True,
         verbose_mode: bool = True,
@@ -85,6 +89,7 @@ class JudgevalScorer:
         self.evaluation_cost = evaluation_cost
         self.verbose_logs = verbose_logs
         self.additional_metadata = additional_metadata
+        self.required_params = required_params

     def _add_model(self, model: Optional[Union[str, List[str], JudgevalJudge]] = None):
         """
@@ -145,3 +150,9 @@ class JudgevalScorer:
             "additional_metadata": self.additional_metadata,
         }
         return f"JudgevalScorer({attributes})"
+
+    def to_dict(self):
+        return {
+            "score_type": str(self.score_type), # Convert enum to string for serialization
+            "threshold": self.threshold
+        }
judgeval/scorers/judgeval_scorers/api_scorers/__init__.py
CHANGED
@@ -13,6 +13,8 @@ from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import
 from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import GroundednessScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.derailment_scorer import DerailmentScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.tool_order import ToolOrderScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.classifier_scorer import ClassifierScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.tool_dependency import ToolDependencyScorer
 __all__ = [
     "ExecutionOrderScorer",
     "JSONCorrectnessScorer",
@@ -29,4 +31,6 @@ __all__ = [
     "GroundednessScorer",
     "DerailmentScorer",
     "ToolOrderScorer",
+    "ClassifierScorer",
+    "ToolDependencyScorer",
 ]
judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py
ADDED
@@ -0,0 +1,124 @@
+from judgeval.scorers.api_scorer import APIJudgmentScorer
+from judgeval.constants import APIScorer
+from typing import List, Mapping, Optional, Dict
+from pydantic import model_serializer
+
+class ClassifierScorer(APIJudgmentScorer):
+    """
+    In the Judgment backend, this scorer is implemented as a PromptScorer that takes
+    1. a system role that may involve the Example object
+    2. options for scores on the example
+
+    and uses a judge to execute the evaluation from the system role and classify into one of the options
+
+    ex:
+    system_role = "You are a judge that evaluates whether the response is positive or negative. The response is: {example.actual_output}"
+    options = {"positive": 1, "negative": 0}
+
+    Args:
+        name (str): The name of the scorer
+        slug (str): A unique identifier for the scorer
+        conversation (List[dict]): The conversation template with placeholders (e.g., {{actual_output}})
+        options (Mapping[str, float]): A mapping of classification options to their corresponding scores
+        threshold (float): The threshold for determining success (default: 0.5)
+        include_reason (bool): Whether to include reasoning in the response (default: True)
+        strict_mode (bool): Whether to use strict mode (default: False)
+        verbose_mode (bool): Whether to include verbose logging (default: False)
+    """
+    name: Optional[str] = None
+    slug: Optional[str] = None
+    conversation: Optional[List[dict]] = None
+    options: Optional[Mapping[str, float]] = None
+    verbose_mode: bool = False
+    strict_mode: bool = False
+    include_reason: bool = True,
+    async_mode: bool = True,
+    threshold: float = 0.5
+
+    def __init__(
+        self,
+        name: str,
+        slug: str,
+        conversation: List[dict],
+        options: Mapping[str, float],
+        threshold: float = 0.5,
+        include_reason: bool = True,
+        strict_mode: bool = False,
+        verbose_mode: bool = False,
+        async_mode: bool = True,
+    ):
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.CLASSIFIER,
+        )
+        self.name = name
+        self.verbose_mode = verbose_mode
+        self.strict_mode = strict_mode
+        self.include_reason = include_reason
+        self.slug = slug
+        self.conversation = conversation
+        self.options = options
+        self.async_mode = async_mode
+
+    def update_name(self, name: str):
+        """
+        Updates the name of the scorer.
+        """
+        self.name = name
+
+    def update_threshold(self, threshold: float):
+        """
+        Updates the threshold of the scorer.
+        """
+        self.threshold = threshold
+
+    def update_conversation(self, conversation: List[dict]):
+        """
+        Updates the conversation with the new conversation.
+
+        Sample conversation:
+        [{'role': 'system', 'content': "Did the chatbot answer the user's question in a kind way?: {{actual_output}}."}]
+        """
+        self.conversation = conversation
+
+    def update_options(self, options: Mapping[str, float]):
+        """
+        Updates the options with the new options.
+
+        Sample options:
+        {"yes": 1, "no": 0}
+        """
+        self.options = options
+
+    def __str__(self):
+        return f"ClassifierScorer(name={self.name}, slug={self.slug}, conversation={self.conversation}, threshold={self.threshold}, options={self.options})"
+
+    # @model_serializer
+    # def serialize_model(self) -> dict:
+    #     """
+    #     Defines how the ClassifierScorer should be serialized when model_dump() is called.
+    #     """
+    #     return {
+    #         "name": self.name,
+    #         "score_type": self.name,
+    #         "conversation": self.conversation,
+    #         "options": self.options,
+    #         "threshold": self.threshold,
+    #         "include_reason": self.include_reason,
+    #         "async_mode": self.async_mode,
+    #         "strict_mode": self.strict_mode,
+    #         "verbose_mode": self.verbose_mode,
+    #     }
+
+    def to_dict(self) -> dict:
+        return {
+            "name": self.name,
+            "score_type": self.name,
+            "conversation": self.conversation,
+            "options": self.options,
+            "threshold": self.threshold,
+            "include_reason": self.include_reason,
+            "async_mode": self.async_mode,
+            "strict_mode": self.strict_mode,
+            "verbose_mode": self.verbose_mode,
+        }
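The docstring above maps directly to construction: a conversation template with `{{...}}` placeholders plus an options-to-score mapping. A small sketch built from the samples in `update_conversation`/`update_options`; the slug is a made-up placeholder:

```python
from judgeval.scorers import ClassifierScorer

kindness_scorer = ClassifierScorer(
    name="Kindness",
    slug="kindness-demo",  # hypothetical slug
    conversation=[{
        "role": "system",
        "content": "Did the chatbot answer the user's question in a kind way?: {{actual_output}}.",
    }],
    options={"yes": 1.0, "no": 0.0},
    threshold=1.0,
)

# Templates and options can be swapped after construction
kindness_scorer.update_options({"yes": 1.0, "somewhat": 0.5, "no": 0.0})
```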
judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py
ADDED
@@ -0,0 +1,20 @@
+"""
+`judgeval` tool dependency scorer
+"""
+
+# Internal imports
+from judgeval.scorers.api_scorer import APIJudgmentScorer
+from judgeval.constants import APIScorer
+from typing import Optional, Dict
+class ToolDependencyScorer(APIJudgmentScorer):
+    kwargs: Optional[Dict] = None
+    def __init__(self, threshold: float=1.0, enable_param_checking: bool = True):
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.TOOL_DEPENDENCY
+        )
+        self.kwargs = {"enable_param_checking": enable_param_checking}
+
+    @property
+    def __name__(self):
+        return "Tool Dependency"
judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py
CHANGED
@@ -7,7 +7,7 @@ Determines if the LLM-generated SQL query is valid and works for the natural lan
 from judgeval.scorers import ClassifierScorer

 Text2SQLScorer = ClassifierScorer(
-    "Text to SQL",
+    name="Text to SQL",
     slug="text2sql-1010101010",
     threshold=1.0,
     conversation=[{
judgeval/scorers/prompt_scorer.py
CHANGED
@@ -30,6 +30,7 @@ from typing import List, Optional, Tuple, Any, Mapping
 from pydantic import BaseModel, model_serializer, Field

 from judgeval.data import Example
+from judgeval.data.example import ExampleParams
 from judgeval.scorers import JudgevalScorer
 from judgeval.scorers.utils import (
     scorer_progress_meter,
@@ -37,6 +38,7 @@ from judgeval.scorers.utils import (
     get_or_create_event_loop,
     create_verbose_logs
 )
+from judgeval.judges import JudgevalJudge


 class ReasonScore(BaseModel):
@@ -49,7 +51,8 @@ class PromptScorer(JudgevalScorer, BaseModel):
     score_type: str
     threshold: float = Field(default=0.5)
     using_native_model: bool = Field(default=True)
-
+    model: Optional[JudgevalJudge] = Field(default=None)
+    skipped: bool = Field(default=False)
     # DO NOT SET THESE FIELDS MANUALLY, THEY ARE SET BY THE SCORE_EXAMPLE METHOD
     _response: Optional[dict] = None
     _result: Optional[float] = None
@@ -62,6 +65,7 @@ class PromptScorer(JudgevalScorer, BaseModel):
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
+        required_params: Optional[List[ExampleParams]] = None,
     ):
         # Initialize BaseModel first
         BaseModel.__init__(
@@ -83,6 +87,7 @@ class PromptScorer(JudgevalScorer, BaseModel):
             async_mode=async_mode,
             strict_mode=strict_mode,
             verbose_mode=verbose_mode,
+            required_params=required_params,
         )

     def score_example(
@@ -276,166 +281,5 @@ class PromptScorer(JudgevalScorer, BaseModel):
     def __name__(self):
         return self.name

-
-
-
-    """
-    This is a PromptScorer that takes
-    1. a system role that may involve the Example object
-    2. options for scores on the example
-
-    and uses a judge to execute the evaluation from the system role and classify into one of the options
-
-    ex:
-    system_role = "You are a judge that evaluates whether the response is positive or negative. The response is: {example.actual_output}"
-    options = {"positive": 1, "negative": 0}
-    """
-
-    conversation: List[dict]
-    options: Mapping[str, float]
-
-    def __init__(self, name: str, slug: str, conversation: List[dict], options: Mapping[str, float],
-                 threshold: float = 0.5, include_reason: bool = True,
-                 async_mode: bool = True, strict_mode: bool = False, verbose_mode: bool = False):
-        # Initialize BaseModel first with all fields
-        BaseModel.__init__(
-            self,
-            name=name,
-            slug=slug,
-            score_type=name,
-            conversation=conversation,
-            options=options,
-            threshold=threshold,
-            include_reason=include_reason,
-            async_mode=async_mode,
-            strict_mode=strict_mode,
-            verbose_mode=verbose_mode,
-        )
-        # Then initialize JudgevalScorer
-        JudgevalScorer.__init__(
-            self,
-            score_type=name,
-            threshold=threshold,
-            include_reason=include_reason,
-            async_mode=async_mode,
-            strict_mode=strict_mode,
-            verbose_mode=verbose_mode,
-        )
-
-    def _build_measure_prompt(self, example: Example) -> List[dict]:
-        """
-        Builds the measure prompt for the classifier scorer.
-
-        Args:
-            example (Example): The example to build the prompt for
-
-        Returns:
-            List[dict]: The measure prompt for the classifier scorer
-        """
-        replacement_words = {
-            "{{actual_output}}": example.actual_output,
-            "{{expected_output}}": example.expected_output,
-            "{{context}}": example.context,
-            "{{retrieval_context}}": example.retrieval_context,
-            "{{tools_called}}": example.tools_called,
-            "{{expected_tools}}": example.expected_tools,
-        }
-        # Make a copy of the conversation to avoid modifying the original
-        conversation_copy = [dict(message) for message in self.conversation]
-
-        # Only replace if double brackets are found in the content
-        for message in conversation_copy:
-            content = message["content"]
-            if "{{" in content:
-                for key, value in replacement_words.items():
-                    if key in content:
-                        message["content"] = content.replace(key, str(value))
-        return conversation_copy
-
-    def _build_schema(self) -> dict:
-        return self.options
-
-    def _enforce_prompt_format(self, judge_prompt: List[dict], schema: dict) -> List[dict]:
-        """
-        Enforces the judge model to choose an option from the schema.
-
-        We want the model to choose an option from the schema and a reason for the choice.
-        """
-        options = list(schema.keys())
-        options_str = ", ".join(options)
-
-        system_role = judge_prompt[0]["content"]
-        system_role += (
-            f"\n\nYou must choose one of the following options: {options_str}. "
-            "Format your response as a JSON object with two fields:\n"
-            "1. 'choice': Your selected option (must be one of the provided choices)\n"
-            "2. 'reason': A brief explanation for why you made this choice\n\n"
-            "Example response format:\n"
-            "{\n"
-            ' "choice": "<one of the valid options>",\n'
-            ' "reason": "<your explanation>"\n'
-            "}"
-        )
-
-        judge_prompt[0]["content"] = system_role
-        return judge_prompt
-
-    def _process_response(self, response: dict) -> Tuple[float, str]:
-        choice = response.get("choice")
-        if choice not in self.options:
-            raise ValueError(f"Invalid choice: {choice}. Expected one of: {self.options.keys()}")
-        reason = response.get("reason", "No reason could be found in model response.")
-        return self.options[choice], reason
-
-    def _success_check(self, **kwargs) -> bool:
-        return self.score >= self.threshold
-
-    def update_name(self, name: str):
-        """
-        Updates the name of the scorer.
-        """
-        self.name = name
-
-    def update_threshold(self, threshold: float):
-        """
-        Updates the threshold of the scorer.
-        """
-        self.threshold = threshold
-
-    def update_conversation(self, conversation: List[dict]):
-        """
-        Updates the conversation with the new conversation.
-
-        Sample conversation:
-        [{'role': 'system', 'content': "Did the chatbot answer the user's question in a kind way?: {{actual_output}}."}]
-        """
-        self.conversation = conversation
-
-    def update_options(self, options: Mapping[str, float]):
-        """
-        Updates the options with the new options.
-
-        Sample options:
-        {"yes": 1, "no": 0}
-        """
-        self.options = options
-
-    def __str__(self):
-        return f"ClassifierScorer(name={self.name}, slug={self.slug}, conversation={self.conversation}, threshold={self.threshold}, options={self.options})"
-
-    @model_serializer
-    def serialize_model(self) -> dict:
-        """
-        Defines how the ClassifierScorer should be serialized when model_dump() is called.
-        """
-        return {
-            "name": self.name,
-            "score_type": self.score_type,
-            "conversation": self.conversation,
-            "options": self.options,
-            "threshold": self.threshold,
-            "include_reason": self.include_reason,
-            "async_mode": self.async_mode,
-            "strict_mode": self.strict_mode,
-            "verbose_mode": self.verbose_mode,
-        }
+    class Config:
+        arbitrary_types_allowed = True