judgeval 0.0.39__py3-none-any.whl → 0.0.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,7 @@
  import asyncio
  import requests
  import time
+ import json
  import sys
  import itertools
  import threading
@@ -204,9 +205,9 @@ def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResul
  )
  return results

- def check_experiment_type(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str, is_sequence: bool) -> None:
+ def check_experiment_type(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str, is_trace: bool) -> None:
  """
- Checks if the current experiment, if one exists, has the same type (examples of sequences)
+ Checks if the current experiment, if one exists, has the same type (examples of traces)
  """
  try:
  response = requests.post(
@@ -220,7 +221,7 @@ def check_experiment_type(eval_name: str, project_name: str, judgment_api_key: s
  "eval_name": eval_name,
  "project_name": project_name,
  "judgment_api_key": judgment_api_key,
- "is_sequence": is_sequence
+ "is_trace": is_trace
  },
  verify=True
  )
@@ -362,14 +363,26 @@ def check_examples(examples: List[Example], scorers: List[Union[APIJudgmentScore
  """
  Checks if the example contains the necessary parameters for the scorer.
  """
+ prompt_user = False
  for scorer in scorers:
  for example in examples:
  missing_params = []
  for param in scorer.required_params:
  if getattr(example, param.value) is None:
- missing_params.append(f"'{param.value}'")
+ missing_params.append(f"{param.value}")
  if missing_params:
- print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
+ rprint(f"[yellow]⚠️ WARNING:[/yellow] Example is missing required parameters for scorer [bold]{scorer.score_type.value}[/bold]")
+ rprint(f"Missing parameters: {', '.join(missing_params)}")
+ rprint(f"Example: {json.dumps(example.model_dump(), indent=2)}")
+ rprint("-"*40)
+ prompt_user = True
+
+ if prompt_user:
+ user_input = input("Do you want to continue? (y/n)")
+ if user_input.lower() != "y":
+ sys.exit(0)
+ else:
+ rprint("[green]Continuing...[/green]")

  def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: bool = True, function: Optional[Callable] = None, tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None, examples: Optional[List[Example]] = None) -> List[ScoringResult]:
  # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
@@ -382,7 +395,7 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
  )

  if trace_run.append:
- # Check that the current experiment, if one exists, has the same type (examples of sequences)
+ # Check that the current experiment, if one exists, has the same type (examples or traces)
  check_experiment_type(
  trace_run.eval_name,
  trace_run.project_name,
@@ -390,21 +403,27 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
  trace_run.organization_id,
  True
  )
-
  if function and tracer:
  new_traces: List[Trace] = []
  tracer.offline_mode = True
+ tracer.traces = []
  for example in examples:
  if example.input:
- result = run_with_spinner("Running agent function: ", function, **example.input)
+ if isinstance(example.input, str):
+ result = run_with_spinner("Running agent function: ", function, example.input)
+ elif isinstance(example.input, dict):
+ result = run_with_spinner("Running agent function: ", function, **example.input)
+ else:
+ raise ValueError(f"Input must be string or dict, got {type(example.input)}")
  else:
  result = run_with_spinner("Running agent function: ", function)
  for i, trace in enumerate(tracer.traces):
  # We set the root-level trace span with the expected tools of the Trace
  trace = Trace(**trace)
- trace.entries[0].expected_tools = examples[i].expected_tools
+ trace.trace_spans[0].expected_tools = examples[i].expected_tools
  new_traces.append(trace)
  trace_run.traces = new_traces
+ tracer.traces = []

  # Execute evaluation using Judgment API
  info("Starting API evaluation")
@@ -423,7 +442,7 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
  debug("Processing API results")
  # TODO: allow for custom scorer on traces
  if trace_run.log_results:
- pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["results"], trace_run)
+ pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["agent_results"], trace_run)
  rprint(pretty_str)

  return scoring_results
@@ -504,7 +523,8 @@ async def _poll_evaluation_until_complete(eval_name: str, project_name: str, jud
  info(f"Polling for evaluation '{eval_name}' in project '{project_name}' (attempt {poll_count})")

  # Check status
- response = requests.get(
+ response = await asyncio.to_thread(
+ requests.get,
  JUDGMENT_GET_EVAL_STATUS_API_URL,
  headers={
  "Content-Type": "application/json",
@@ -531,7 +551,8 @@ async def _poll_evaluation_until_complete(eval_name: str, project_name: str, jud
  # If complete, get results and return
  if status == "completed" or status == "complete":
  info(f"Evaluation '{eval_name}' reported as completed, fetching and verifying results...")
- results_response = requests.post(
+ results_response = await asyncio.to_thread(
+ requests.post,
  JUDGMENT_EVAL_FETCH_API_URL,
  headers={
  "Content-Type": "application/json",
@@ -723,7 +744,18 @@ class SpinnerWrappedTask:
 
  def __await__(self):
  async def _spin_and_await():
- return await await_with_spinner(self.task, self.message)
+ # self.task resolves to (scoring_results, pretty_str_to_print)
+ task_result_tuple = await await_with_spinner(self.task, self.message)
+
+ # Unpack the tuple
+ scoring_results, pretty_str_to_print = task_result_tuple
+
+ # Print the pretty string if it exists, after spinner is cleared
+ if pretty_str_to_print:
+ rprint(pretty_str_to_print)
+
+ # Return only the scoring_results to the original awaiter
+ return scoring_results
  return _spin_and_await().__await__()

  # Proxy all Task attributes and methods to the underlying task
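With this change, the wrapped task resolves to a `(scoring_results, pretty_str_to_print)` tuple: the formatted table is printed once the spinner clears, and only the scoring results are handed back to the awaiter. A toy stand-in for that unpacking pattern (not the real `SpinnerWrappedTask`; a dummy coroutine stands in for the evaluation workflow):

```python
import asyncio
from typing import List, Optional, Tuple

async def _workflow() -> Tuple[List[str], Optional[str]]:
    # Stand-in for the evaluation workflow: returns (scoring_results, pretty_str_to_print).
    return ["scoring-result"], "formatted results table"

class ResultUnwrapper:
    """Toy wrapper mirroring the tuple-unpacking behaviour shown in the hunk."""

    def __init__(self, task: "asyncio.Task"):
        self._task = task

    def __await__(self):
        async def _unwrap():
            results, pretty = await self._task
            if pretty:
                print(pretty)   # side effect: show the table after the spinner is cleared
            return results      # the awaiter only ever sees the scoring results
        return _unwrap().__await__()

async def main():
    results = await ResultUnwrapper(asyncio.create_task(_workflow()))
    assert results == ["scoring-result"]

asyncio.run(main())
```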
@@ -756,7 +788,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
  )

  if evaluation_run.append:
- # Check that the current experiment, if one exists, has the same type (examples of sequences)
+ # Check that the current experiment, if one exists, has the same type (examples of traces)
  check_experiment_type(
  evaluation_run.eval_name,
  evaluation_run.project_name,
@@ -769,8 +801,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
  debug("Initializing examples with IDs and timestamps")
  for idx, example in enumerate(evaluation_run.examples):
  example.example_index = idx # Set numeric index
- example.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
- with example_logging_context(example.timestamp, example.example_id):
+ with example_logging_context(example.created_at, example.example_id):
  debug(f"Initialized example {example.example_id} (index: {example.example_index})")
  debug(f"Input: {example.input}")
  debug(f"Actual output: {example.actual_output}")
@@ -824,7 +855,8 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
  payload = evaluation_run.model_dump(warnings=False)

  # Send the evaluation to the queue
- response = requests.post(
+ response = await asyncio.to_thread(
+ requests.post,
  JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
  headers={
  "Content-Type": "application/json",
@@ -843,13 +875,28 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
  info(f"Successfully added evaluation '{evaluation_run.eval_name}' to queue")

  # Poll until the evaluation is complete
- return await _poll_evaluation_until_complete(
+ results = await _poll_evaluation_until_complete(
  eval_name=evaluation_run.eval_name,
  project_name=evaluation_run.project_name,
  judgment_api_key=evaluation_run.judgment_api_key,
  organization_id=evaluation_run.organization_id,
  original_examples=evaluation_run.examples # Pass the original examples
  )
+
+ pretty_str_to_print = None
+ if evaluation_run.log_results and results: # Ensure results exist before logging
+ send_results = [scoring_result.model_dump(warnings=False) for scoring_result in results]
+ try:
+ # Run the blocking log_evaluation_results in a separate thread
+ pretty_str_to_print = await asyncio.to_thread(
+ log_evaluation_results,
+ send_results,
+ evaluation_run
+ )
+ except Exception as e:
+ error(f"Error logging results after async evaluation: {str(e)}")
+
+ return results, pretty_str_to_print

  # Create a regular task
  task = asyncio.create_task(_async_evaluation_workflow())
@@ -860,6 +907,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
  f"Processing evaluation '{evaluation_run.eval_name}': "
  )
  else:
+ check_examples(evaluation_run.examples, evaluation_run.scorers)
  if judgment_scorers:
  # Execute evaluation using Judgment API
  info("Starting API evaluation")
@@ -895,7 +943,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
  # We should be removing local scorers soon
  info("Starting local evaluation")
  for example in evaluation_run.examples:
- with example_logging_context(example.timestamp, example.example_id):
+ with example_logging_context(example.created_at, example.example_id):
  debug(f"Processing example {example.example_id}: {example.input}")

  results: List[ScoringResult] = asyncio.run(
@@ -1,6 +1,6 @@
  from judgeval.scorers.api_scorer import APIJudgmentScorer
  from judgeval.scorers.judgeval_scorer import JudgevalScorer
- from judgeval.scorers.prompt_scorer import PromptScorer, ClassifierScorer
+ from judgeval.scorers.prompt_scorer import PromptScorer
  from judgeval.scorers.judgeval_scorers.api_scorers import (
  ExecutionOrderScorer,
  JSONCorrectnessScorer,
@@ -17,6 +17,8 @@ from judgeval.scorers.judgeval_scorers.api_scorers import (
  GroundednessScorer,
  DerailmentScorer,
  ToolOrderScorer,
+ ClassifierScorer,
+ ToolDependencyScorer,
  )
  from judgeval.scorers.judgeval_scorers.classifiers import (
  Text2SQLScorer,
@@ -43,4 +45,5 @@ __all__ = [
  "GroundednessScorer",
  "DerailmentScorer",
  "ToolOrderScorer",
+ "ToolDependencyScorer",
  ]
@@ -12,7 +12,7 @@ from judgeval.common.logger import debug, info, warning, error
  from judgeval.judges import JudgevalJudge
  from judgeval.judges.utils import create_judge
  from judgeval.constants import UNBOUNDED_SCORERS
-
+ from judgeval.data.example import ExampleParams
  class JudgevalScorer:
  """
  Base class for scorers in `judgeval`.
@@ -39,6 +39,9 @@ class JudgevalScorer:
  evaluation_cost: Optional[float] = None # The cost of running the scorer
  verbose_logs: Optional[str] = None # The verbose logs of the scorer
  additional_metadata: Optional[Dict] = None # Additional metadata for the scorer
+ required_params: Optional[List[ExampleParams]] = None # The required parameters for the scorer
+ error: Optional[str] = None
+ success: Optional[bool] = None

  def __init__(
  self,
@@ -49,6 +52,7 @@ class JudgevalScorer:
  reason: Optional[str] = None,
  success: Optional[bool] = None,
  evaluation_model: Optional[str] = None,
+ required_params: Optional[List[ExampleParams]] = None,
  strict_mode: bool = False,
  async_mode: bool = True,
  verbose_mode: bool = True,
@@ -85,6 +89,7 @@ class JudgevalScorer:
  self.evaluation_cost = evaluation_cost
  self.verbose_logs = verbose_logs
  self.additional_metadata = additional_metadata
+ self.required_params = required_params

  def _add_model(self, model: Optional[Union[str, List[str], JudgevalJudge]] = None):
  """
@@ -145,3 +150,9 @@ class JudgevalScorer:
  "additional_metadata": self.additional_metadata,
  }
  return f"JudgevalScorer({attributes})"
+
+ def to_dict(self):
+ return {
+ "score_type": str(self.score_type), # Convert enum to string for serialization
+ "threshold": self.threshold
+ }
@@ -13,6 +13,8 @@ from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import
  from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import GroundednessScorer
  from judgeval.scorers.judgeval_scorers.api_scorers.derailment_scorer import DerailmentScorer
  from judgeval.scorers.judgeval_scorers.api_scorers.tool_order import ToolOrderScorer
+ from judgeval.scorers.judgeval_scorers.api_scorers.classifier_scorer import ClassifierScorer
+ from judgeval.scorers.judgeval_scorers.api_scorers.tool_dependency import ToolDependencyScorer
  __all__ = [
  "ExecutionOrderScorer",
  "JSONCorrectnessScorer",
@@ -29,4 +31,6 @@ __all__ = [
  "GroundednessScorer",
  "DerailmentScorer",
  "ToolOrderScorer",
+ "ClassifierScorer",
+ "ToolDependencyScorer",
  ]
@@ -0,0 +1,124 @@
+ from judgeval.scorers.api_scorer import APIJudgmentScorer
+ from judgeval.constants import APIScorer
+ from typing import List, Mapping, Optional, Dict
+ from pydantic import model_serializer
+
+ class ClassifierScorer(APIJudgmentScorer):
+ """
+ In the Judgment backend, this scorer is implemented as a PromptScorer that takes
+ 1. a system role that may involve the Example object
+ 2. options for scores on the example
+
+ and uses a judge to execute the evaluation from the system role and classify into one of the options
+
+ ex:
+ system_role = "You are a judge that evaluates whether the response is positive or negative. The response is: {example.actual_output}"
+ options = {"positive": 1, "negative": 0}
+
+ Args:
+ name (str): The name of the scorer
+ slug (str): A unique identifier for the scorer
+ conversation (List[dict]): The conversation template with placeholders (e.g., {{actual_output}})
+ options (Mapping[str, float]): A mapping of classification options to their corresponding scores
+ threshold (float): The threshold for determining success (default: 0.5)
+ include_reason (bool): Whether to include reasoning in the response (default: True)
+ strict_mode (bool): Whether to use strict mode (default: False)
+ verbose_mode (bool): Whether to include verbose logging (default: False)
+ """
+ name: Optional[str] = None
+ slug: Optional[str] = None
+ conversation: Optional[List[dict]] = None
+ options: Optional[Mapping[str, float]] = None
+ verbose_mode: bool = False
+ strict_mode: bool = False
+ include_reason: bool = True,
+ async_mode: bool = True,
+ threshold: float = 0.5
+
+ def __init__(
+ self,
+ name: str,
+ slug: str,
+ conversation: List[dict],
+ options: Mapping[str, float],
+ threshold: float = 0.5,
+ include_reason: bool = True,
+ strict_mode: bool = False,
+ verbose_mode: bool = False,
+ async_mode: bool = True,
+ ):
+ super().__init__(
+ threshold=threshold,
+ score_type=APIScorer.CLASSIFIER,
+ )
+ self.name = name
+ self.verbose_mode = verbose_mode
+ self.strict_mode = strict_mode
+ self.include_reason = include_reason
+ self.slug = slug
+ self.conversation = conversation
+ self.options = options
+ self.async_mode = async_mode
+
+ def update_name(self, name: str):
+ """
+ Updates the name of the scorer.
+ """
+ self.name = name
+
+ def update_threshold(self, threshold: float):
+ """
+ Updates the threshold of the scorer.
+ """
+ self.threshold = threshold
+
+ def update_conversation(self, conversation: List[dict]):
+ """
+ Updates the conversation with the new conversation.
+
+ Sample conversation:
+ [{'role': 'system', 'content': "Did the chatbot answer the user's question in a kind way?: {{actual_output}}."}]
+ """
+ self.conversation = conversation
+
+ def update_options(self, options: Mapping[str, float]):
+ """
+ Updates the options with the new options.
+
+ Sample options:
+ {"yes": 1, "no": 0}
+ """
+ self.options = options
+
+ def __str__(self):
+ return f"ClassifierScorer(name={self.name}, slug={self.slug}, conversation={self.conversation}, threshold={self.threshold}, options={self.options})"
+
+ # @model_serializer
+ # def serialize_model(self) -> dict:
+ # """
+ # Defines how the ClassifierScorer should be serialized when model_dump() is called.
+ # """
+ # return {
+ # "name": self.name,
+ # "score_type": self.name,
+ # "conversation": self.conversation,
+ # "options": self.options,
+ # "threshold": self.threshold,
+ # "include_reason": self.include_reason,
+ # "async_mode": self.async_mode,
+ # "strict_mode": self.strict_mode,
+ # "verbose_mode": self.verbose_mode,
+ # }
+
+ def to_dict(self) -> dict:
+ return {
+ "name": self.name,
+ "score_type": self.name,
+ "conversation": self.conversation,
+ "options": self.options,
+ "threshold": self.threshold,
+ "include_reason": self.include_reason,
+ "async_mode": self.async_mode,
+ "strict_mode": self.strict_mode,
+ "verbose_mode": self.verbose_mode,
+ }
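For reference, a hedged usage sketch of the new API-side `ClassifierScorer`, mirroring the constructor above and the `Text2SQLScorer` instantiation later in this diff; the name, slug, prompt, and options are illustrative, not part of the package:

```python
from judgeval.scorers import ClassifierScorer

# Hypothetical scorer: classifies whether a response is positive in tone.
tone_scorer = ClassifierScorer(
    name="Positive Tone",
    slug="positive-tone-0000000000",   # illustrative slug
    conversation=[{
        "role": "system",
        "content": "Is the following response positive in tone? {{actual_output}}",
    }],
    options={"yes": 1.0, "no": 0.0},
    threshold=1.0,
)
```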
@@ -0,0 +1,20 @@
+ """
+ `judgeval` tool dependency scorer
+ """
+
+ # Internal imports
+ from judgeval.scorers.api_scorer import APIJudgmentScorer
+ from judgeval.constants import APIScorer
+ from typing import Optional, Dict
+ class ToolDependencyScorer(APIJudgmentScorer):
+ kwargs: Optional[Dict] = None
+ def __init__(self, threshold: float=1.0, enable_param_checking: bool = True):
+ super().__init__(
+ threshold=threshold,
+ score_type=APIScorer.TOOL_DEPENDENCY
+ )
+ self.kwargs = {"enable_param_checking": enable_param_checking}
+
+ @property
+ def __name__(self):
+ return "Tool Dependency"
@@ -7,7 +7,7 @@ Determines if the LLM-generated SQL query is valid and works for the natural lan
  from judgeval.scorers import ClassifierScorer

  Text2SQLScorer = ClassifierScorer(
- "Text to SQL",
+ name="Text to SQL",
  slug="text2sql-1010101010",
  threshold=1.0,
  conversation=[{
@@ -30,6 +30,7 @@ from typing import List, Optional, Tuple, Any, Mapping
  from pydantic import BaseModel, model_serializer, Field

  from judgeval.data import Example
+ from judgeval.data.example import ExampleParams
  from judgeval.scorers import JudgevalScorer
  from judgeval.scorers.utils import (
  scorer_progress_meter,
@@ -37,6 +38,7 @@ from judgeval.scorers.utils import (
  get_or_create_event_loop,
  create_verbose_logs
  )
+ from judgeval.judges import JudgevalJudge


  class ReasonScore(BaseModel):
@@ -49,7 +51,8 @@ class PromptScorer(JudgevalScorer, BaseModel):
  score_type: str
  threshold: float = Field(default=0.5)
  using_native_model: bool = Field(default=True)
-
+ model: Optional[JudgevalJudge] = Field(default=None)
+ skipped: bool = Field(default=False)
  # DO NOT SET THESE FIELDS MANUALLY, THEY ARE SET BY THE SCORE_EXAMPLE METHOD
  _response: Optional[dict] = None
  _result: Optional[float] = None
@@ -62,6 +65,7 @@ class PromptScorer(JudgevalScorer, BaseModel):
  async_mode: bool = True,
  strict_mode: bool = False,
  verbose_mode: bool = False,
+ required_params: Optional[List[ExampleParams]] = None,
  ):
  # Initialize BaseModel first
  BaseModel.__init__(
@@ -83,6 +87,7 @@ class PromptScorer(JudgevalScorer, BaseModel):
  async_mode=async_mode,
  strict_mode=strict_mode,
  verbose_mode=verbose_mode,
+ required_params=required_params,
  )

  def score_example(
@@ -276,166 +281,5 @@ class PromptScorer(JudgevalScorer, BaseModel):
  def __name__(self):
  return self.name

-
- class ClassifierScorer(PromptScorer):
-
- """
- This is a PromptScorer that takes
- 1. a system role that may involve the Example object
- 2. options for scores on the example
-
- and uses a judge to execute the evaluation from the system role and classify into one of the options
-
- ex:
- system_role = "You are a judge that evaluates whether the response is positive or negative. The response is: {example.actual_output}"
- options = {"positive": 1, "negative": 0}
- """
-
- conversation: List[dict]
- options: Mapping[str, float]
-
- def __init__(self, name: str, slug: str, conversation: List[dict], options: Mapping[str, float],
- threshold: float = 0.5, include_reason: bool = True,
- async_mode: bool = True, strict_mode: bool = False, verbose_mode: bool = False):
- # Initialize BaseModel first with all fields
- BaseModel.__init__(
- self,
- name=name,
- slug=slug,
- score_type=name,
- conversation=conversation,
- options=options,
- threshold=threshold,
- include_reason=include_reason,
- async_mode=async_mode,
- strict_mode=strict_mode,
- verbose_mode=verbose_mode,
- )
- # Then initialize JudgevalScorer
- JudgevalScorer.__init__(
- self,
- score_type=name,
- threshold=threshold,
- include_reason=include_reason,
- async_mode=async_mode,
- strict_mode=strict_mode,
- verbose_mode=verbose_mode,
- )
-
- def _build_measure_prompt(self, example: Example) -> List[dict]:
- """
- Builds the measure prompt for the classifier scorer.
-
- Args:
- example (Example): The example to build the prompt for
-
- Returns:
- List[dict]: The measure prompt for the classifier scorer
- """
- replacement_words = {
- "{{actual_output}}": example.actual_output,
- "{{expected_output}}": example.expected_output,
- "{{context}}": example.context,
- "{{retrieval_context}}": example.retrieval_context,
- "{{tools_called}}": example.tools_called,
- "{{expected_tools}}": example.expected_tools,
- }
- # Make a copy of the conversation to avoid modifying the original
- conversation_copy = [dict(message) for message in self.conversation]
-
- # Only replace if double brackets are found in the content
- for message in conversation_copy:
- content = message["content"]
- if "{{" in content:
- for key, value in replacement_words.items():
- if key in content:
- message["content"] = content.replace(key, str(value))
- return conversation_copy
-
- def _build_schema(self) -> dict:
- return self.options
-
- def _enforce_prompt_format(self, judge_prompt: List[dict], schema: dict) -> List[dict]:
- """
- Enforces the judge model to choose an option from the schema.
-
- We want the model to choose an option from the schema and a reason for the choice.
- """
- options = list(schema.keys())
- options_str = ", ".join(options)
-
- system_role = judge_prompt[0]["content"]
- system_role += (
- f"\n\nYou must choose one of the following options: {options_str}. "
- "Format your response as a JSON object with two fields:\n"
- "1. 'choice': Your selected option (must be one of the provided choices)\n"
- "2. 'reason': A brief explanation for why you made this choice\n\n"
- "Example response format:\n"
- "{\n"
- ' "choice": "<one of the valid options>",\n'
- ' "reason": "<your explanation>"\n'
- "}"
- )
-
- judge_prompt[0]["content"] = system_role
- return judge_prompt
-
- def _process_response(self, response: dict) -> Tuple[float, str]:
- choice = response.get("choice")
- if choice not in self.options:
- raise ValueError(f"Invalid choice: {choice}. Expected one of: {self.options.keys()}")
- reason = response.get("reason", "No reason could be found in model response.")
- return self.options[choice], reason
-
- def _success_check(self, **kwargs) -> bool:
- return self.score >= self.threshold
-
- def update_name(self, name: str):
- """
- Updates the name of the scorer.
- """
- self.name = name
-
- def update_threshold(self, threshold: float):
- """
- Updates the threshold of the scorer.
- """
- self.threshold = threshold
-
- def update_conversation(self, conversation: List[dict]):
- """
- Updates the conversation with the new conversation.
-
- Sample conversation:
- [{'role': 'system', 'content': "Did the chatbot answer the user's question in a kind way?: {{actual_output}}."}]
- """
- self.conversation = conversation
-
- def update_options(self, options: Mapping[str, float]):
- """
- Updates the options with the new options.
-
- Sample options:
- {"yes": 1, "no": 0}
- """
- self.options = options
-
- def __str__(self):
- return f"ClassifierScorer(name={self.name}, slug={self.slug}, conversation={self.conversation}, threshold={self.threshold}, options={self.options})"
-
- @model_serializer
- def serialize_model(self) -> dict:
- """
- Defines how the ClassifierScorer should be serialized when model_dump() is called.
- """
- return {
- "name": self.name,
- "score_type": self.score_type,
- "conversation": self.conversation,
- "options": self.options,
- "threshold": self.threshold,
- "include_reason": self.include_reason,
- "async_mode": self.async_mode,
- "strict_mode": self.strict_mode,
- "verbose_mode": self.verbose_mode,
- }
+ class Config:
+ arbitrary_types_allowed = True