judgeval 0.0.39__py3-none-any.whl → 0.0.40__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
judgeval/constants.py CHANGED
@@ -28,6 +28,8 @@ class APIScorer(str, Enum):
  GROUNDEDNESS = "groundedness"
  DERAILMENT = "derailment"
  TOOL_ORDER = "tool_order"
+ CLASSIFIER = "classifier"
+ TOOL_DEPENDENCY = "tool_dependency"
  @classmethod
  def _missing_(cls, value):
  # Handle case-insensitive lookup
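Usage note (not part of the diff): a minimal sketch of looking up the two new enum members. The direct value lookups follow from the definitions above; the mixed-case lookup assumes `_missing_` performs the case-insensitive match its comment describes.

    from judgeval.constants import APIScorer

    # Direct value lookups for the members added in 0.0.40
    assert APIScorer("classifier") is APIScorer.CLASSIFIER
    assert APIScorer("tool_dependency") is APIScorer.TOOL_DEPENDENCY

    # Assumed to resolve via _missing_'s case-insensitive handling
    assert APIScorer("Tool_Dependency") is APIScorer.TOOL_DEPENDENCY
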
judgeval/data/__init__.py CHANGED
@@ -2,7 +2,7 @@ from judgeval.data.example import Example, ExampleParams
  from judgeval.data.custom_example import CustomExample
  from judgeval.data.scorer_data import ScorerData, create_scorer_data
  from judgeval.data.result import ScoringResult, generate_scoring_result
- from judgeval.data.trace import Trace, TraceSpan
+ from judgeval.data.trace import Trace, TraceSpan, TraceUsage


  __all__ = [
@@ -15,4 +15,5 @@ __all__ = [
  "generate_scoring_result",
  "Trace",
  "TraceSpan",
+ "TraceUsage"
  ]
judgeval/data/example.py CHANGED
@@ -36,15 +36,15 @@ class Example(BaseModel):
  name: Optional[str] = None
  example_id: str = Field(default_factory=lambda: str(uuid4()))
  example_index: Optional[int] = None
- timestamp: Optional[str] = None
+ created_at: Optional[str] = None
  trace_id: Optional[str] = None

  def __init__(self, **data):
  if 'example_id' not in data:
  data['example_id'] = str(uuid4())
  # Set timestamp if not provided
- if 'timestamp' not in data:
- data['timestamp'] = datetime.now().strftime("%Y%m%d_%H%M%S")
+ if 'created_at' not in data:
+ data['created_at'] = datetime.now().isoformat()
  super().__init__(**data)

  @field_validator('input', mode='before')
@@ -123,9 +123,9 @@
  raise ValueError(f"Example index must be an integer or None but got {v} of type {type(v)}")
  return v

- @field_validator('timestamp', mode='before')
+ @field_validator('created_at', mode='before')
  @classmethod
- def validate_timestamp(cls, v):
+ def validate_created_at(cls, v):
  if v is not None and not isinstance(v, str):
  raise ValueError(f"Timestamp must be a string or None but got {v} of type {type(v)}")
  return v
@@ -150,7 +150,7 @@
  "name": self.name,
  "example_id": self.example_id,
  "example_index": self.example_index,
- "timestamp": self.timestamp,
+ "created_at": self.created_at,
  }

  def __str__(self):
@@ -166,5 +166,5 @@
  f"name={self.name}, "
  f"example_id={self.example_id}, "
  f"example_index={self.example_index}, "
- f"timestamp={self.timestamp}, "
+ f"created_at={self.created_at}, "
  )
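Usage note (not part of the diff): a minimal sketch of the renamed field. `Example` now auto-fills `created_at` with an ISO-8601 string instead of the old `%Y%m%d_%H%M%S`-formatted `timestamp`. The input/output values below are placeholders.

    from judgeval.data import Example

    ex = Example(input="What is the capital of France?", actual_output="Paris")
    print(ex.example_id)   # auto-generated UUID string
    print(ex.created_at)   # ISO-8601 string, e.g. "2025-05-01T12:34:56.789012"
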
judgeval/data/tool.py CHANGED
@@ -1,10 +1,14 @@
  from pydantic import BaseModel, field_validator
- from typing import Dict, Any, Optional
+ from typing import Dict, Any, Optional, List
  import warnings

  class Tool(BaseModel):
  tool_name: str
  parameters: Optional[Dict[str, Any]] = None
+ agent_name: Optional[str] = None
+ result_dependencies: Optional[List[Dict[str, Any]]] = None
+ action_dependencies: Optional[List[Dict[str, Any]]] = None
+ require_all: Optional[bool] = None

  @field_validator('tool_name')
  def validate_tool_name(cls, v):
@@ -16,4 +20,28 @@ class Tool(BaseModel):
  def validate_parameters(cls, v):
  if v is not None and not isinstance(v, dict):
  warnings.warn(f"Parameters should be a dictionary, got {type(v)}", UserWarning)
+ return v
+
+ @field_validator('agent_name')
+ def validate_agent_name(cls, v):
+ if v is not None and not isinstance(v, str):
+ warnings.warn(f"Agent name should be a string, got {type(v)}", UserWarning)
+ return v
+
+ @field_validator('result_dependencies')
+ def validate_result_dependencies(cls, v):
+ if v is not None and not isinstance(v, list):
+ warnings.warn(f"Result dependencies should be a list, got {type(v)}", UserWarning)
+ return v
+
+ @field_validator('action_dependencies')
+ def validate_action_dependencies(cls, v):
+ if v is not None and not isinstance(v, list):
+ warnings.warn(f"Action dependencies should be a list, got {type(v)}", UserWarning)
+ return v
+
+ @field_validator('require_all')
+ def validate_require_all(cls, v):
+ if v is not None and not isinstance(v, bool):
+ warnings.warn(f"Require all should be a boolean, got {type(v)}", UserWarning)
  return v
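Usage note (not part of the diff): a minimal sketch of the extended `Tool` model. The field names and types come from the diff above; the concrete values and the shape of each dependency dict are hypothetical.

    from judgeval.data.tool import Tool

    expected_tool = Tool(
        tool_name="search_flights",
        parameters={"origin": "SFO", "destination": "JFK"},
        agent_name="travel_agent",                          # new in 0.0.40
        result_dependencies=[{"tool_name": "get_dates"}],   # new in 0.0.40; dict shape assumed
        action_dependencies=[{"tool_name": "login"}],       # new in 0.0.40; dict shape assumed
        require_all=True,                                   # new in 0.0.40
    )
    # The new validators only emit a UserWarning on unexpected types; they do not raise
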
judgeval/data/trace.py CHANGED
@@ -5,36 +5,52 @@ from judgeval.data.tool import Tool
  import json
  from datetime import datetime, timezone

+ class TraceUsage(BaseModel):
+ prompt_tokens: Optional[int] = None
+ completion_tokens: Optional[int] = None
+ total_tokens: Optional[int] = None
+ prompt_tokens_cost_usd: Optional[float] = None
+ completion_tokens_cost_usd: Optional[float] = None
+ total_cost_usd: Optional[float] = None
+ model_name: Optional[str] = None
+
  class TraceSpan(BaseModel):
  span_id: str
  trace_id: str
- function: Optional[str] = None
+ function: str
  depth: int
  created_at: Optional[Any] = None
  parent_span_id: Optional[str] = None
  span_type: Optional[str] = "span"
  inputs: Optional[Dict[str, Any]] = None
+ error: Optional[Dict[str, Any]] = None
  output: Optional[Any] = None
+ usage: Optional[TraceUsage] = None
  duration: Optional[float] = None
  annotation: Optional[List[Dict[str, Any]]] = None
  evaluation_runs: Optional[List[EvaluationRun]] = []
  expected_tools: Optional[List[Tool]] = None
  additional_metadata: Optional[Dict[str, Any]] = None
+ has_evaluation: Optional[bool] = False
+ agent_name: Optional[str] = None

  def model_dump(self, **kwargs):
  return {
  "span_id": self.span_id,
  "trace_id": self.trace_id,
  "depth": self.depth,
- # "created_at": datetime.fromtimestamp(self.created_at).isoformat(),
  "created_at": datetime.fromtimestamp(self.created_at, tz=timezone.utc).isoformat(),
- "inputs": self._serialize_inputs(),
- "output": self._serialize_output(),
+ "inputs": self._serialize_value(self.inputs),
+ "output": self._serialize_value(self.output),
+ "error": self._serialize_value(self.error),
  "evaluation_runs": [run.model_dump() for run in self.evaluation_runs] if self.evaluation_runs else [],
  "parent_span_id": self.parent_span_id,
  "function": self.function,
  "duration": self.duration,
- "span_type": self.span_type
+ "span_type": self.span_type,
+ "usage": self.usage.model_dump() if self.usage else None,
+ "has_evaluation": self.has_evaluation,
+ "agent_name": self.agent_name
  }

  def print_span(self):
@@ -42,30 +58,6 @@ class TraceSpan(BaseModel):
  indent = " " * self.depth
  parent_info = f" (parent_id: {self.parent_span_id})" if self.parent_span_id else ""
  print(f"{indent}→ {self.function} (id: {self.span_id}){parent_info}")
-
- def _serialize_inputs(self) -> dict:
- """Helper method to serialize input data safely."""
- if self.inputs is None:
- return {}
-
- serialized_inputs = {}
- for key, value in self.inputs.items():
- if isinstance(value, BaseModel):
- serialized_inputs[key] = value.model_dump()
- elif isinstance(value, (list, tuple)):
- # Handle lists/tuples of arguments
- serialized_inputs[key] = [
- item.model_dump() if isinstance(item, BaseModel)
- else None if not self._is_json_serializable(item)
- else item
- for item in value
- ]
- else:
- if self._is_json_serializable(value):
- serialized_inputs[key] = value
- else:
- serialized_inputs[key] = self.safe_stringify(value, self.function)
- return serialized_inputs

  def _is_json_serializable(self, obj: Any) -> bool:
  """Helper method to check if an object is JSON serializable."""
@@ -88,15 +80,11 @@ class TraceSpan(BaseModel):
  return repr(output)
  except (TypeError, OverflowError, ValueError):
  pass
-
- warnings.warn(
- f"Output for function {function_name} is not JSON serializable and could not be converted to string. Setting to None."
- )
  return None

- def _serialize_output(self) -> Any:
- """Helper method to serialize output data safely."""
- if self.output is None:
+ def _serialize_value(self, value: Any) -> Any:
+ """Helper method to deep serialize a value safely supporting Pydantic Models / regular PyObjects."""
+ if value is None:
  return None

  def serialize_value(value):
@@ -117,8 +105,8 @@ class TraceSpan(BaseModel):
  # Fallback to safe stringification
  return self.safe_stringify(value, self.function)

- # Start serialization with the top-level output
- return serialize_value(self.output)
+ # Start serialization with the top-level value
+ return serialize_value(value)

  class Trace(BaseModel):
  trace_id: str
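Usage note (not part of the diff): a minimal sketch of the new `TraceUsage` model and the extended `TraceSpan.model_dump()`; both classes are exported from `judgeval.data` per the `__init__` diff above. All values are placeholders.

    import time
    from judgeval.data import TraceSpan, TraceUsage

    usage = TraceUsage(
        prompt_tokens=120,
        completion_tokens=45,
        total_tokens=165,
        total_cost_usd=0.0021,
        model_name="gpt-4.1",
    )
    span = TraceSpan(
        span_id="span-1",
        trace_id="trace-1",
        function="call_llm",     # now a required str rather than Optional
        depth=0,
        created_at=time.time(),  # model_dump() renders this as a UTC ISO timestamp
        usage=usage,
    )
    dumped = span.model_dump()
    # dumped now also carries "usage", "error", "has_evaluation" and "agent_name" keys
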
@@ -1,4 +1,3 @@
-
  from pydantic import BaseModel
  from typing import List, Optional, Dict, Any, Union, Callable
  from judgeval.data import Trace
@@ -22,6 +21,7 @@ class TraceRun(BaseModel):
  judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
  rules (Optional[List[Rule]]): Rules to evaluate against scoring results
  append (Optional[bool]): Whether to append to existing evaluation results
+ tools (Optional[List[Dict[str, Any]]]): List of tools to use for evaluation
  """

  # The user will specify whether they want log_results when they call run_eval
@@ -40,6 +40,7 @@ class TraceRun(BaseModel):
  judgment_api_key: Optional[str] = ""
  override: Optional[bool] = False
  rules: Optional[List[Rule]] = None
+ tools: Optional[List[Dict[str, Any]]] = None

  class Config:
  arbitrary_types_allowed = True
@@ -1,5 +1,5 @@
  from typing import List, Optional, Dict, Any, Union
- from pydantic import BaseModel, field_validator
+ from pydantic import BaseModel, field_validator, Field

  from judgeval.data import Example, CustomExample
  from judgeval.scorers import JudgevalScorer, APIJudgmentScorer
@@ -27,12 +27,12 @@ class EvaluationRun(BaseModel):
  # The user will specify whether they want log_results when they call run_eval
  log_results: bool = False # NOTE: log_results has to be set first because it is used to validate project_name and eval_name
  organization_id: Optional[str] = None
- project_name: Optional[str] = None
- eval_name: Optional[str] = None
+ project_name: Optional[str] = Field(default=None, validate_default=True)
+ eval_name: Optional[str] = Field(default=None, validate_default=True)
  examples: Union[List[Example], List[CustomExample]]
  scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
  model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1"
- aggregator: Optional[str] = None
+ aggregator: Optional[str] = Field(default=None, validate_default=True)
  metadata: Optional[Dict[str, Any]] = None
  trace_span_id: Optional[str] = None
  # API Key will be "" until user calls client.run_eval(), then API Key will be set
@@ -96,9 +96,6 @@ class EvaluationRun(BaseModel):
  def validate_scorers(cls, v):
  if not v:
  raise ValueError("Scorers cannot be empty.")
- for s in v:
- if not isinstance(s, APIJudgmentScorer) and not isinstance(s, JudgevalScorer):
- raise ValueError(f"Invalid type for Scorer: {type(s)}")
  return v

  @field_validator('model')
@@ -5,6 +5,7 @@ import os
  from uuid import uuid4
  from typing import Optional, List, Dict, Any, Union, Callable
  import requests
+ import asyncio

  from judgeval.constants import ROOT_API
  from judgeval.data.datasets import EvalDataset, EvalDatasetClient
@@ -121,7 +122,8 @@ class JudgmentClient(metaclass=SingletonMeta):
  ignore_errors: bool = True,
  rules: Optional[List[Rule]] = None,
  function: Optional[Callable] = None,
- tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None
+ tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
+ tools: Optional[List[Dict[str, Any]]] = None
  ) -> List[ScoringResult]:
  try:

@@ -151,6 +153,7 @@ class JudgmentClient(metaclass=SingletonMeta):
  append=append,
  judgment_api_key=self.judgment_api_key,
  organization_id=self.organization_id,
+ tools=tools
  )
  return run_trace_eval(trace_run, override, ignore_errors, function, tracer, examples)
  except ValueError as e:
@@ -173,7 +176,7 @@ class JudgmentClient(metaclass=SingletonMeta):
  ignore_errors: bool = True,
  async_execution: bool = False,
  rules: Optional[List[Rule]] = None
- ) -> List[ScoringResult]:
+ ) -> Union[List[ScoringResult], asyncio.Task]:
  """
  Executes an evaluation of `Example`s using one or more `Scorer`s

@@ -480,7 +483,7 @@ class JudgmentClient(metaclass=SingletonMeta):

  return response.json()["slug"]

- async def assert_test(
+ def assert_test(
  self,
  scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
  examples: Optional[List[Example]] = None,
@@ -495,6 +498,7 @@ class JudgmentClient(metaclass=SingletonMeta):
  rules: Optional[List[Rule]] = None,
  function: Optional[Callable] = None,
  tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
+ tools: Optional[List[Dict[str, Any]]] = None,
  async_execution: bool = False
  ) -> None:
  """
@@ -513,6 +517,14 @@ class JudgmentClient(metaclass=SingletonMeta):
  override (bool): Whether to override an existing evaluation run with the same name
  rules (Optional[List[Rule]]): Rules to evaluate against scoring results
  """
+
+ # Check for enable_param_checking and tools
+ for scorer in scorers:
+ if hasattr(scorer, "kwargs") and scorer.kwargs is not None:
+ if scorer.kwargs.get("enable_param_checking") is True:
+ if not tools:
+ raise ValueError(f"You must provide the 'tools' argument to assert_test when using a scorer with enable_param_checking=True. If you do not want to do param checking, explicitly set enable_param_checking=False for the {scorer.__name__} scorer.")
+
  # Validate that exactly one of examples or test_file is provided
  if (examples is None and test_file is None) or (examples is not None and test_file is not None):
  raise ValueError("Exactly one of 'examples' or 'test_file' must be provided, but not both")
@@ -530,10 +542,11 @@ class JudgmentClient(metaclass=SingletonMeta):
  rules=rules,
  function=function,
  tracer=tracer,
- test_file=test_file
+ test_file=test_file,
+ tools=tools
  )
  else:
- results = await self.run_evaluation(
+ results = self.run_evaluation(
  examples=examples,
  scorers=scorers,
  model=model,
@@ -547,4 +560,10 @@ class JudgmentClient(metaclass=SingletonMeta):
  async_execution=async_execution
  )

- assert_test(results)
+ if async_execution:
+ # 'results' is an asyncio.Task here, awaiting it gives List[ScoringResult]
+ actual_results = asyncio.run(results)
+ assert_test(actual_results) # Call the synchronous imported function
+ else:
+ # 'results' is already List[ScoringResult] here (synchronous path)
+ assert_test(results) # Call the synchronous imported function
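Usage note (not part of the diff): a minimal sketch of the now-synchronous `assert_test` with the new `tools` argument. The `scorers`, `examples`, and `tools` parameters appear in the diff above; the import path, environment-based credentials, and the scorer's constructor defaults are assumptions.

    from judgeval import JudgmentClient          # import path assumed
    from judgeval.data import Example
    from judgeval.scorers import ToolDependencyScorer

    client = JudgmentClient()                    # assumes the API key / org id come from the environment
    client.assert_test(                          # plain call; no await needed in 0.0.40
        examples=[Example(input="Book a flight to JFK", actual_output="Booked flight AA123")],
        scorers=[ToolDependencyScorer()],        # constructor defaults assumed
        # Required when a scorer is configured with enable_param_checking=True
        tools=[{"tool_name": "search_flights", "parameters": {"origin": "SFO"}}],
    )
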
@@ -204,9 +204,9 @@ def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResul
  )
  return results

- def check_experiment_type(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str, is_sequence: bool) -> None:
+ def check_experiment_type(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str, is_trace: bool) -> None:
  """
- Checks if the current experiment, if one exists, has the same type (examples of sequences)
+ Checks if the current experiment, if one exists, has the same type (examples of traces)
  """
  try:
  response = requests.post(
@@ -220,7 +220,7 @@ def check_experiment_type(eval_name: str, project_name: str, judgment_api_key: s
  "eval_name": eval_name,
  "project_name": project_name,
  "judgment_api_key": judgment_api_key,
- "is_sequence": is_sequence
+ "is_trace": is_trace
  },
  verify=True
  )
@@ -382,7 +382,7 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
  )

  if trace_run.append:
- # Check that the current experiment, if one exists, has the same type (examples of sequences)
+ # Check that the current experiment, if one exists, has the same type (examples or traces)
  check_experiment_type(
  trace_run.eval_name,
  trace_run.project_name,
@@ -390,13 +390,18 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
  trace_run.organization_id,
  True
  )
-
  if function and tracer:
  new_traces: List[Trace] = []
  tracer.offline_mode = True
+ tracer.traces = []
  for example in examples:
  if example.input:
- result = run_with_spinner("Running agent function: ", function, **example.input)
+ if isinstance(example.input, str):
+ result = run_with_spinner("Running agent function: ", function, example.input)
+ elif isinstance(example.input, dict):
+ result = run_with_spinner("Running agent function: ", function, **example.input)
+ else:
+ raise ValueError(f"Input must be string or dict, got {type(example.input)}")
  else:
  result = run_with_spinner("Running agent function: ", function)
  for i, trace in enumerate(tracer.traces):
@@ -405,6 +410,7 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
  trace.entries[0].expected_tools = examples[i].expected_tools
  new_traces.append(trace)
  trace_run.traces = new_traces
+ tracer.traces = []

  # Execute evaluation using Judgment API
  info("Starting API evaluation")
@@ -423,7 +429,7 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
  debug("Processing API results")
  # TODO: allow for custom scorer on traces
  if trace_run.log_results:
- pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["results"], trace_run)
+ pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["agent_results"], trace_run)
  rprint(pretty_str)

  return scoring_results
@@ -504,7 +510,8 @@ async def _poll_evaluation_until_complete(eval_name: str, project_name: str, jud
  info(f"Polling for evaluation '{eval_name}' in project '{project_name}' (attempt {poll_count})")

  # Check status
- response = requests.get(
+ response = await asyncio.to_thread(
+ requests.get,
  JUDGMENT_GET_EVAL_STATUS_API_URL,
  headers={
  "Content-Type": "application/json",
@@ -531,7 +538,8 @@ async def _poll_evaluation_until_complete(eval_name: str, project_name: str, jud
  # If complete, get results and return
  if status == "completed" or status == "complete":
  info(f"Evaluation '{eval_name}' reported as completed, fetching and verifying results...")
- results_response = requests.post(
+ results_response = await asyncio.to_thread(
+ requests.post,
  JUDGMENT_EVAL_FETCH_API_URL,
  headers={
  "Content-Type": "application/json",
@@ -723,7 +731,18 @@ class SpinnerWrappedTask:

  def __await__(self):
  async def _spin_and_await():
- return await await_with_spinner(self.task, self.message)
+ # self.task resolves to (scoring_results, pretty_str_to_print)
+ task_result_tuple = await await_with_spinner(self.task, self.message)
+
+ # Unpack the tuple
+ scoring_results, pretty_str_to_print = task_result_tuple
+
+ # Print the pretty string if it exists, after spinner is cleared
+ if pretty_str_to_print:
+ rprint(pretty_str_to_print)
+
+ # Return only the scoring_results to the original awaiter
+ return scoring_results
  return _spin_and_await().__await__()

  # Proxy all Task attributes and methods to the underlying task
@@ -756,7 +775,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
  )

  if evaluation_run.append:
- # Check that the current experiment, if one exists, has the same type (examples of sequences)
+ # Check that the current experiment, if one exists, has the same type (examples of traces)
  check_experiment_type(
  evaluation_run.eval_name,
  evaluation_run.project_name,
@@ -769,8 +788,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
  debug("Initializing examples with IDs and timestamps")
  for idx, example in enumerate(evaluation_run.examples):
  example.example_index = idx # Set numeric index
- example.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
- with example_logging_context(example.timestamp, example.example_id):
+ with example_logging_context(example.created_at, example.example_id):
  debug(f"Initialized example {example.example_id} (index: {example.example_index})")
  debug(f"Input: {example.input}")
  debug(f"Actual output: {example.actual_output}")
@@ -824,7 +842,8 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
  payload = evaluation_run.model_dump(warnings=False)

  # Send the evaluation to the queue
- response = requests.post(
+ response = await asyncio.to_thread(
+ requests.post,
  JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
  headers={
  "Content-Type": "application/json",
@@ -843,13 +862,28 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
  info(f"Successfully added evaluation '{evaluation_run.eval_name}' to queue")

  # Poll until the evaluation is complete
- return await _poll_evaluation_until_complete(
+ results = await _poll_evaluation_until_complete(
  eval_name=evaluation_run.eval_name,
  project_name=evaluation_run.project_name,
  judgment_api_key=evaluation_run.judgment_api_key,
  organization_id=evaluation_run.organization_id,
  original_examples=evaluation_run.examples # Pass the original examples
  )
+
+ pretty_str_to_print = None
+ if evaluation_run.log_results and results: # Ensure results exist before logging
+ send_results = [scoring_result.model_dump(warnings=False) for scoring_result in results]
+ try:
+ # Run the blocking log_evaluation_results in a separate thread
+ pretty_str_to_print = await asyncio.to_thread(
+ log_evaluation_results,
+ send_results,
+ evaluation_run
+ )
+ except Exception as e:
+ error(f"Error logging results after async evaluation: {str(e)}")
+
+ return results, pretty_str_to_print

  # Create a regular task
  task = asyncio.create_task(_async_evaluation_workflow())
@@ -895,7 +929,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
  # We should be removing local scorers soon
  info("Starting local evaluation")
  for example in evaluation_run.examples:
- with example_logging_context(example.timestamp, example.example_id):
+ with example_logging_context(example.created_at, example.example_id):
  debug(f"Processing example {example.example_id}: {example.input}")

  results: List[ScoringResult] = asyncio.run(
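Implementation note (not part of the diff): the async polling path above now pushes blocking `requests` calls onto worker threads via `asyncio.to_thread` so the event loop keeps running. A standalone sketch of the same pattern, with a placeholder URL and key:

    import asyncio
    import requests

    async def fetch_status(url: str, api_key: str) -> dict:
        # Run the blocking HTTP call in a thread so the event loop is not blocked
        response = await asyncio.to_thread(
            requests.get,
            url,
            headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
        )
        response.raise_for_status()
        return response.json()

    # asyncio.run(fetch_status("https://api.example.com/status", "sk-placeholder"))
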
@@ -1,6 +1,6 @@
  from judgeval.scorers.api_scorer import APIJudgmentScorer
  from judgeval.scorers.judgeval_scorer import JudgevalScorer
- from judgeval.scorers.prompt_scorer import PromptScorer, ClassifierScorer
+ from judgeval.scorers.prompt_scorer import PromptScorer
  from judgeval.scorers.judgeval_scorers.api_scorers import (
  ExecutionOrderScorer,
  JSONCorrectnessScorer,
@@ -17,6 +17,8 @@ from judgeval.scorers.judgeval_scorers.api_scorers import (
  GroundednessScorer,
  DerailmentScorer,
  ToolOrderScorer,
+ ClassifierScorer,
+ ToolDependencyScorer,
  )
  from judgeval.scorers.judgeval_scorers.classifiers import (
  Text2SQLScorer,
@@ -43,4 +45,5 @@ __all__ = [
  "GroundednessScorer",
  "DerailmentScorer",
  "ToolOrderScorer",
+ "ToolDependencyScorer",
  ]
@@ -39,6 +39,8 @@ class JudgevalScorer:
  evaluation_cost: Optional[float] = None # The cost of running the scorer
  verbose_logs: Optional[str] = None # The verbose logs of the scorer
  additional_metadata: Optional[Dict] = None # Additional metadata for the scorer
+ error: Optional[str] = None
+ success: Optional[bool] = None

  def __init__(
  self,
@@ -145,3 +147,9 @@
  "additional_metadata": self.additional_metadata,
  }
  return f"JudgevalScorer({attributes})"
+
+ def to_dict(self):
+ return {
+ "score_type": str(self.score_type), # Convert enum to string for serialization
+ "threshold": self.threshold
+ }
@@ -13,6 +13,8 @@ from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import
  from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import GroundednessScorer
  from judgeval.scorers.judgeval_scorers.api_scorers.derailment_scorer import DerailmentScorer
  from judgeval.scorers.judgeval_scorers.api_scorers.tool_order import ToolOrderScorer
+ from judgeval.scorers.judgeval_scorers.api_scorers.classifier_scorer import ClassifierScorer
+ from judgeval.scorers.judgeval_scorers.api_scorers.tool_dependency import ToolDependencyScorer
  __all__ = [
  "ExecutionOrderScorer",
  "JSONCorrectnessScorer",
@@ -29,4 +31,6 @@ __all__ = [
  "GroundednessScorer",
  "DerailmentScorer",
  "ToolOrderScorer",
+ "ClassifierScorer",
+ "ToolDependencyScorer",
  ]
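Usage note (not part of the diff): `ClassifierScorer` is now re-exported from the API scorers package instead of `prompt_scorer`, and `ToolDependencyScorer` is new. A quick import check, assuming judgeval 0.0.40 is installed:

    from judgeval.scorers import ClassifierScorer, ToolDependencyScorer

    # Both names resolve to the judgeval.scorers.judgeval_scorers.api_scorers implementations per this diff
    print(ClassifierScorer, ToolDependencyScorer)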