judgeval 0.10.1__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. judgeval/__init__.py +4 -4
  2. judgeval/api/__init__.py +17 -9
  3. judgeval/api/api_types.py +20 -18
  4. judgeval/data/evaluation_run.py +10 -11
  5. judgeval/data/judgment_types.py +25 -14
  6. judgeval/data/result.py +1 -0
  7. judgeval/data/scorer_data.py +1 -26
  8. judgeval/dataset/__init__.py +17 -16
  9. judgeval/env.py +11 -2
  10. judgeval/evaluation/__init__.py +20 -63
  11. judgeval/integrations/langgraph/__init__.py +2 -1
  12. judgeval/scorers/__init__.py +0 -4
  13. judgeval/scorers/agent_scorer.py +15 -15
  14. judgeval/scorers/api_scorer.py +0 -8
  15. judgeval/scorers/base_scorer.py +2 -2
  16. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -2
  17. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
  18. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -2
  19. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -2
  20. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +3 -5
  21. judgeval/scorers/score.py +1 -1
  22. judgeval/tracer/__init__.py +7 -10
  23. judgeval/tracer/local_eval_queue.py +11 -7
  24. judgeval/tracer/utils.py +2 -2
  25. judgeval/trainer/config.py +1 -1
  26. judgeval/trainer/trainable_model.py +1 -1
  27. judgeval/trainer/trainer.py +8 -6
  28. judgeval/utils/async_utils.py +7 -3
  29. judgeval/utils/testing.py +0 -4
  30. {judgeval-0.10.1.dist-info → judgeval-0.11.0.dist-info}/METADATA +1 -1
  31. {judgeval-0.10.1.dist-info → judgeval-0.11.0.dist-info}/RECORD +34 -35
  32. judgeval/data/tool.py +0 -5
  33. {judgeval-0.10.1.dist-info → judgeval-0.11.0.dist-info}/WHEEL +0 -0
  34. {judgeval-0.10.1.dist-info → judgeval-0.11.0.dist-info}/entry_points.txt +0 -0
  35. {judgeval-0.10.1.dist-info → judgeval-0.11.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/evaluation/__init__.py CHANGED
@@ -3,14 +3,11 @@ from __future__ import annotations
  import asyncio
  import concurrent.futures
  import time
- import orjson
- import sys
  import threading
- from typing import List, Dict, Union, Tuple, TYPE_CHECKING
+ from typing import List, Tuple, TYPE_CHECKING
  from rich import print as rprint

- from judgeval.data import ScorerData, ScoringResult, Example
- from judgeval.scorers import BaseScorer, ExampleAPIScorerConfig
+ from judgeval.data import ScorerData, ScoringResult
  from judgeval.scorers.score import a_execute_scoring
  from judgeval.api import JudgmentSyncClient
  from judgeval.env import (
@@ -19,9 +16,10 @@ from judgeval.env import (
  from judgeval.exceptions import JudgmentAPIError, JudgmentRuntimeError
  from judgeval.logger import judgeval_logger

+ from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID

  if TYPE_CHECKING:
- from judgeval.data.evaluation_run import EvaluationRun
+ from judgeval.data.evaluation_run import ExampleEvaluationRun


  def safe_run_async(coro):
@@ -49,8 +47,7 @@ def safe_run_async(coro):

  def log_evaluation_results(
  scoring_results: List[ScoringResult],
- run: EvaluationRun,
- judgment_api_key: str,
+ run: ExampleEvaluationRun,
  ) -> str:
  """
  Logs evaluation results to the Judgment API database.
@@ -65,10 +62,10 @@
  ValueError: If there's a validation error with the results
  """
  try:
- if not judgment_api_key or not run.organization_id:
+ if not JUDGMENT_API_KEY or not JUDGMENT_ORG_ID:
  raise ValueError("API key and organization ID are required")

- api_client = JudgmentSyncClient(judgment_api_key, run.organization_id)
+ api_client = JudgmentSyncClient(JUDGMENT_API_KEY, JUDGMENT_ORG_ID)
  response = api_client.log_eval_results(
  {
  "results": scoring_results, # type: ignore
@@ -85,41 +82,8 @@
  )


- def check_examples(
- examples: List[Example], scorers: List[Union[ExampleAPIScorerConfig, BaseScorer]]
- ) -> None:
- """
- Checks if the example contains the necessary parameters for the scorer.
- """
- prompt_user = False
- for scorer in scorers:
- for example in examples:
- missing_params = []
- for param in scorer.required_params:
- if getattr(example, param.value) is None:
- missing_params.append(f"{param.value}")
- if missing_params:
- rprint(
- f"[yellow]⚠️ WARNING:[/yellow] Example is missing required parameters for scorer [bold]{scorer.score_type.value}[/bold]"
- )
- rprint(f"Missing parameters: {', '.join(missing_params)}")
- rprint(
- f"Example: {orjson.dumps(example.model_dump(), option=orjson.OPT_INDENT_2).decode('utf-8')}"
- )
- rprint("-" * 40)
- prompt_user = True
-
- if prompt_user:
- user_input = input("Do you want to continue? (y/n)")
- if user_input.lower() != "y":
- sys.exit(0)
- else:
- rprint("[green]Continuing...[/green]")
-
-
  def _poll_evaluation_until_complete(
- evaluation_run: EvaluationRun,
- judgment_api_key: str,
+ evaluation_run: ExampleEvaluationRun,
  expected_scorer_data_count: int,
  poll_interval_seconds: float = 5,
  max_failures: int = 5,
@@ -140,13 +104,15 @@
  Returns:
  List[ScoringResult]: The evaluation results
  """
- organization_id = evaluation_run.organization_id
  project_name = evaluation_run.project_name
  experiment_run_id = evaluation_run.id

+ if not project_name or not experiment_run_id:
+ raise ValueError("Project name and experiment run ID are required")
+
  poll_count = 0
  exception_count = 0
- api_client = JudgmentSyncClient(judgment_api_key, organization_id)
+ api_client = JudgmentSyncClient(JUDGMENT_API_KEY, JUDGMENT_ORG_ID)
  while poll_count < max_poll_count:
  poll_count += 1
  try:
@@ -213,14 +179,13 @@ def progress_logger(stop_event, msg="Working...", interval=5):


  def run_eval(
- evaluation_run: EvaluationRun,
- judgment_api_key: str,
+ evaluation_run: ExampleEvaluationRun,
  ) -> List[ScoringResult]:
  """
  Executes an evaluation of `Example`s using one or more `Scorer`s

  Args:
- evaluation_run (EvaluationRun): Stores example and evaluation together for running
+ evaluation_run (ExampleEvaluationRun): Stores example and evaluation together for running

  Returns:
  List[ScoringResult]: A list of ScoringResult objects
@@ -258,16 +223,13 @@
  judgeval_logger.error(error_msg)
  raise ValueError(error_msg)

- check_examples(evaluation_run.examples, evaluation_run.judgment_scorers)
  stop_event = threading.Event()
  t = threading.Thread(
  target=progress_logger, args=(stop_event, "Running evaluation...")
  )
  t.start()
  try:
- api_client = JudgmentSyncClient(
- judgment_api_key, evaluation_run.organization_id
- )
+ api_client = JudgmentSyncClient(JUDGMENT_API_KEY, JUDGMENT_ORG_ID)
  response = api_client.add_to_run_eval_queue_examples(
  evaluation_run.model_dump(warnings=False) # type: ignore
  )
@@ -286,7 +248,6 @@
  )
  results, url = _poll_evaluation_until_complete(
  evaluation_run=evaluation_run,
- judgment_api_key=judgment_api_key,
  expected_scorer_data_count=(num_scorers * len(evaluation_run.examples)),
  )
  finally:
@@ -306,7 +267,7 @@
  send_results = [
  scoring_result.model_dump(warnings=False) for scoring_result in results
  ]
- url = log_evaluation_results(send_results, evaluation_run, judgment_api_key)
+ url = log_evaluation_results(send_results, evaluation_run)
  rprint(
  f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
  )
@@ -323,27 +284,23 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
  Returns:
  None. Raises exceptions for any failed test cases.
  """
- failed_cases: List[ScorerData] = []
+ failed_cases: List[List[ScorerData]] = []

  for result in scoring_results:
  if not result.success:
  # Create a test case context with all relevant fields
- test_case: Dict = {"failed_scorers": []}
+ test_case: List[ScorerData] = []
  if result.scorers_data:
  # If the result was not successful, check each scorer_data
  for scorer_data in result.scorers_data:
  if not scorer_data.success:
- if scorer_data.name == "Tool Order":
- # Remove threshold, evaluation model for Tool Order scorer
- scorer_data.threshold = None
- scorer_data.evaluation_model = None
- test_case["failed_scorers"].append(scorer_data)
+ test_case.append(scorer_data)
  failed_cases.append(test_case)

  if failed_cases:
  error_msg = "The following test cases failed: \n"
  for fail_case in failed_cases:
- for fail_scorer in fail_case["failed_scorers"]:
+ for fail_scorer in fail_case:
  error_msg += (
  f"\nScorer Name: {fail_scorer.name}\n"
  f"Threshold: {fail_scorer.threshold}\n"
judgeval/integrations/langgraph/__init__.py CHANGED
@@ -507,6 +507,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
  )

  # Extract response content
+ output: Any
  if response.generations:
  last_generation = response.generations[-1][-1]
  if (
@@ -547,7 +548,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
  for key, value in usage_attrs.items():
  span.set_attribute(key, value)

- self._end_span(run_id=run_id, outputs=output, **usage_attrs)
+ self._end_span(run_id=run_id, outputs=output, **usage_attrs) # type: ignore

  except Exception as e:
  judgeval_logger.exception(f"Error in on_llm_end: {e}")
judgeval/scorers/__init__.py CHANGED
@@ -1,7 +1,5 @@
  from judgeval.scorers.api_scorer import (
  APIScorerConfig,
- ExampleAPIScorerConfig,
- TraceAPIScorerConfig,
  )
  from judgeval.scorers.base_scorer import BaseScorer
  from judgeval.scorers.judgeval_scorers.api_scorers import (
@@ -15,8 +13,6 @@ from judgeval.scorers.judgeval_scorers.api_scorers import (

  __all__ = [
  "APIScorerConfig",
- "ExampleAPIScorerConfig",
- "TraceAPIScorerConfig",
  "BaseScorer",
  "TracePromptScorer",
  "PromptScorer",
judgeval/scorers/agent_scorer.py CHANGED
@@ -1,17 +1,17 @@
- from judgeval.scorers.base_scorer import BaseScorer
- from judgeval.data.judgment_types import Trace as JudgmentTrace
- from typing import List, Optional
- from abc import abstractmethod
+ # from judgeval.scorers.base_scorer import BaseScorer
+ # from judgeval.data.judgment_types import Trace as JudgmentTrace
+ # from typing import List, Optional
+ # from abc import abstractmethod


- class TraceScorer(BaseScorer):
- @abstractmethod
- async def a_score_trace(
- self, trace: JudgmentTrace, tools: Optional[List] = None, *args, **kwargs
- ) -> float:
- """
- Asynchronously measures the score on a trace
- """
- raise NotImplementedError(
- "You must implement the `a_score_trace` method in your custom scorer"
- )
+ # class TraceScorer(BaseScorer):
+ # @abstractmethod
+ # async def a_score_trace(
+ # self, trace: JudgmentTrace, tools: Optional[List] = None, *args, **kwargs
+ # ) -> float:
+ # """
+ # Asynchronously measures the score on a trace
+ # """
+ # raise NotImplementedError(
+ # "You must implement the `a_score_trace` method in your custom scorer"
+ # )
judgeval/scorers/api_scorer.py CHANGED
@@ -63,11 +63,3 @@ class APIScorerConfig(BaseModel):

  def __str__(self):
  return f"JudgmentScorer(score_type={self.score_type.value}, threshold={self.threshold})"
-
-
- class ExampleAPIScorerConfig(APIScorerConfig):
- pass
-
-
- class TraceAPIScorerConfig(APIScorerConfig):
- pass
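For downstream code that subclassed the classes removed here, the migration implied by this file (and by the bundled scorers later in this diff) is a one-line base-class swap. A hedged sketch; `MyScorer` is a hypothetical user-defined scorer, not part of judgeval:

```python
from typing import List

from judgeval.scorers.api_scorer import APIScorerConfig
from judgeval.constants import APIScorerType
from judgeval.data import ExampleParams


# 0.10.1 (class removed in 0.11.0):
# class MyScorer(ExampleAPIScorerConfig): ...

# 0.11.0: subclass APIScorerConfig directly, mirroring the bundled scorers below.
class MyScorer(APIScorerConfig):
    score_type: APIScorerType = APIScorerType.ANSWER_RELEVANCY  # reuse of a known enum member
    required_params: List[ExampleParams] = [ExampleParams.INPUT]
```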
judgeval/scorers/base_scorer.py CHANGED
@@ -27,7 +27,7 @@ class BaseScorer(BaseModel):
  threshold: float = 0.5

  # name of your scorer (Faithfulness, PromptScorer-randomslug)
- name: Optional[str] = None
+ name: str = ""

  # The name of the class of the scorer
  class_name: Optional[str] = None
@@ -42,7 +42,7 @@
  using_native_model: Optional[bool] = None

  # Whether the test case passed or failed
- success: Optional[bool] = None
+ success: bool = False

  # The name of the model used to evaluate the test case
  model: Optional[str] = None
judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py CHANGED
@@ -6,13 +6,13 @@ TODO add link to docs page for this scorer
  """

  # Internal imports
- from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
+ from judgeval.scorers.api_scorer import APIScorerConfig
  from judgeval.constants import APIScorerType
  from judgeval.data import ExampleParams
  from typing import List


- class AnswerCorrectnessScorer(ExampleAPIScorerConfig):
+ class AnswerCorrectnessScorer(APIScorerConfig):
  score_type: APIScorerType = APIScorerType.ANSWER_CORRECTNESS
  required_params: List[ExampleParams] = [
  ExampleParams.INPUT,
judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py CHANGED
@@ -1,10 +1,10 @@
- from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
+ from judgeval.scorers.api_scorer import APIScorerConfig
  from judgeval.constants import APIScorerType
  from judgeval.data import ExampleParams
  from typing import List


- class AnswerRelevancyScorer(ExampleAPIScorerConfig):
+ class AnswerRelevancyScorer(APIScorerConfig):
  score_type: APIScorerType = APIScorerType.ANSWER_RELEVANCY
  required_params: List[ExampleParams] = [
  ExampleParams.INPUT,
judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py CHANGED
@@ -6,13 +6,13 @@ TODO add link to docs page for this scorer
  """

  # Internal imports
- from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
+ from judgeval.scorers.api_scorer import APIScorerConfig
  from judgeval.constants import APIScorerType
  from judgeval.data import ExampleParams
  from typing import List


- class FaithfulnessScorer(ExampleAPIScorerConfig):
+ class FaithfulnessScorer(APIScorerConfig):
  score_type: APIScorerType = APIScorerType.FAITHFULNESS
  required_params: List[ExampleParams] = [
  ExampleParams.INPUT,
judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py CHANGED
@@ -6,12 +6,12 @@ TODO add link to docs page for this scorer
  """

  # Internal imports
- from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
+ from judgeval.scorers.api_scorer import APIScorerConfig
  from judgeval.constants import APIScorerType
  from judgeval.data import ExampleParams


- class InstructionAdherenceScorer(ExampleAPIScorerConfig):
+ class InstructionAdherenceScorer(APIScorerConfig):
  def __init__(self, threshold: float):
  super().__init__(
  threshold=threshold,
judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py CHANGED
@@ -1,7 +1,5 @@
  from judgeval.scorers.api_scorer import (
  APIScorerConfig,
- ExampleAPIScorerConfig,
- TraceAPIScorerConfig,
  )
  from judgeval.constants import APIScorerType
  from typing import Dict, Any, Optional
@@ -55,7 +53,7 @@ def fetch_prompt_scorer(
  ):
  client = JudgmentSyncClient(judgment_api_key, organization_id)
  try:
- scorer_config = client.fetch_scorer({"name": name})["scorer"]
+ scorer_config = client.fetch_scorers({"names": [name]})["scorers"][0]
  scorer_config.pop("created_at")
  scorer_config.pop("updated_at")
  return scorer_config
@@ -284,9 +282,9 @@ class BasePromptScorer(ABC, APIScorerConfig):
  return base


- class PromptScorer(BasePromptScorer, ExampleAPIScorerConfig):
+ class PromptScorer(BasePromptScorer, APIScorerConfig):
  pass


- class TracePromptScorer(BasePromptScorer, TraceAPIScorerConfig):
+ class TracePromptScorer(BasePromptScorer, APIScorerConfig):
  pass
judgeval/scorers/score.py CHANGED
@@ -21,7 +21,7 @@ from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL


  async def safe_a_score_example(
- scorer: ExampleScorer,
+ scorer: Union[ExampleScorer],
  example: Example,
  ):
  """
judgeval/tracer/__init__.py CHANGED
@@ -43,8 +43,8 @@ from judgeval.env import (
  JUDGMENT_ORG_ID,
  )
  from judgeval.logger import judgeval_logger
- from judgeval.scorers.api_scorer import ExampleAPIScorerConfig, TraceAPIScorerConfig
- from judgeval.scorers.base_scorer import BaseScorer
+ from judgeval.scorers.api_scorer import APIScorerConfig
+ from judgeval.scorers.example_scorer import ExampleScorer
  from judgeval.tracer.constants import JUDGEVAL_TRACER_INSTRUMENTING_MODULE_NAME
  from judgeval.tracer.managers import (
  sync_span_context,
@@ -328,7 +328,7 @@ class Tracer:
  run_condition = scorer_config.run_condition
  sampling_rate = scorer_config.sampling_rate

- if not isinstance(scorer, (TraceAPIScorerConfig)):
+ if not isinstance(scorer, (APIScorerConfig)):
  judgeval_logger.error(
  "Scorer must be an instance of TraceAPIScorerConfig, got %s, skipping evaluation."
  % type(scorer)
@@ -358,7 +358,6 @@
  eval_run_name = f"async_trace_evaluate_{span_id}"

  eval_run = TraceEvaluationRun(
- organization_id=self.organization_id,
  project_name=self.project_name,
  eval_name=eval_run_name,
  scorers=[scorer],
@@ -862,7 +861,7 @@
  self,
  /,
  *,
- scorer: Union[ExampleAPIScorerConfig, BaseScorer],
+ scorer: Union[APIScorerConfig, ExampleScorer],
  example: Example,
  model: str = JUDGMENT_DEFAULT_GPT_MODEL,
  sampling_rate: float = 1.0,
@@ -871,7 +870,7 @@
  judgeval_logger.info("Evaluation is not enabled, skipping evaluation")
  return

- if not isinstance(scorer, (ExampleAPIScorerConfig, BaseScorer)):
+ if not isinstance(scorer, (APIScorerConfig, ExampleScorer)):
  judgeval_logger.error(
  "Scorer must be an instance of ExampleAPIScorerConfig or BaseScorer, got %s, skipping evaluation."
  % type(scorer)
@@ -902,13 +901,12 @@
  span_context = self.get_current_span().get_span_context()
  trace_id = format(span_context.trace_id, "032x")
  span_id = format(span_context.span_id, "016x")
- hosted_scoring = isinstance(scorer, ExampleAPIScorerConfig) or (
- isinstance(scorer, BaseScorer) and scorer.server_hosted
+ hosted_scoring = isinstance(scorer, APIScorerConfig) or (
+ isinstance(scorer, ExampleScorer) and scorer.server_hosted
  )
  eval_run_name = f"async_evaluate_{span_id}" # note this name doesnt matter because we don't save the experiment only the example and scorer_data
  if hosted_scoring:
  eval_run = ExampleEvaluationRun(
- organization_id=self.organization_id,
  project_name=self.project_name,
  eval_name=eval_run_name,
  examples=[example],
@@ -923,7 +921,6 @@
  else:
  # Handle custom scorers using local evaluation queue
  eval_run = ExampleEvaluationRun(
- organization_id=self.organization_id,
  project_name=self.project_name,
  eval_name=eval_run_name,
  examples=[example],
judgeval/tracer/local_eval_queue.py CHANGED
@@ -13,7 +13,7 @@ import time
  from judgeval.logger import judgeval_logger
  from judgeval.env import JUDGMENT_MAX_CONCURRENT_EVALUATIONS
  from judgeval.data import ScoringResult
- from judgeval.data.evaluation_run import EvaluationRun
+ from judgeval.data.evaluation_run import ExampleEvaluationRun
  from judgeval.utils.async_utils import safe_run_async
  from judgeval.scorers.score import a_execute_scoring
  from judgeval.api import JudgmentSyncClient
@@ -34,7 +34,7 @@ class LocalEvaluationQueue:
  ):
  if num_workers <= 0:
  raise ValueError("num_workers must be a positive integer.")
- self._queue: queue.Queue[Optional[EvaluationRun]] = queue.Queue()
+ self._queue: queue.Queue[Optional[ExampleEvaluationRun]] = queue.Queue()
  self._max_concurrent = max_concurrent
  self._num_workers = num_workers # Number of worker threads
  self._worker_threads: List[threading.Thread] = []
@@ -44,11 +44,11 @@
  organization_id=JUDGMENT_ORG_ID,
  )

- def enqueue(self, evaluation_run: EvaluationRun) -> None:
+ def enqueue(self, evaluation_run: ExampleEvaluationRun) -> None:
  """Add evaluation run to the queue."""
  self._queue.put(evaluation_run)

- def _process_run(self, evaluation_run: EvaluationRun) -> List[ScoringResult]:
+ def _process_run(self, evaluation_run: ExampleEvaluationRun) -> List[ScoringResult]:
  """Execute evaluation run locally and return results."""

  if not evaluation_run.custom_scorers:
@@ -70,7 +70,9 @@

  def run_all(
  self,
- callback: Optional[Callable[[EvaluationRun, List[ScoringResult]], None]] = None,
+ callback: Optional[
+ Callable[[ExampleEvaluationRun, List[ScoringResult]], None]
+ ] = None,
  ) -> None:
  """Process all queued runs synchronously.

@@ -134,7 +136,9 @@

  def start_worker(
  self,
- callback: Optional[Callable[[EvaluationRun, List[ScoringResult]], None]] = None,
+ callback: Optional[
+ Callable[[ExampleEvaluationRun, List[ScoringResult]], None]
+ ] = None,
  ) -> Optional[threading.Thread]:
  """Start a single background thread to process runs (backward compatibility).

@@ -144,7 +148,7 @@
  Returns:
  The started thread, or None if no threads were started.
  """
- threads = self.start_workers(callback)
+ threads = self.start_workers()
  return threads[0] if threads else None

  def wait_for_completion(self, timeout: Optional[float] = None) -> bool:
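Callbacks passed to `LocalEvaluationQueue.run_all` and `start_worker` are now typed against `ExampleEvaluationRun` instead of `EvaluationRun`. A sketch of a conforming callback, assuming `ExampleEvaluationRun.eval_name` and `ScoringResult.success` exist as the other hunks in this diff suggest:

```python
from typing import List

from judgeval.data import ScoringResult
from judgeval.data.evaluation_run import ExampleEvaluationRun


def on_run_finished(run: ExampleEvaluationRun, results: List[ScoringResult]) -> None:
    # Invoked once per processed run; report how many results passed.
    passed = sum(1 for result in results if result.success)
    print(f"{run.eval_name}: {passed}/{len(results)} results passed")


# Hypothetical usage:
# queue = LocalEvaluationQueue()
# queue.run_all(callback=on_run_finished)
```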
judgeval/tracer/utils.py CHANGED
@@ -2,7 +2,7 @@ from typing import Any
  from opentelemetry.trace import Span
  from pydantic import BaseModel
  from typing import Callable, Optional
- from judgeval.scorers.api_scorer import TraceAPIScorerConfig
+ from judgeval.scorers.api_scorer import APIScorerConfig
  from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL


@@ -14,7 +14,7 @@ def set_span_attribute(span: Span, name: str, value: Any):


  class TraceScorerConfig(BaseModel):
- scorer: TraceAPIScorerConfig
+ scorer: APIScorerConfig
  model: str = JUDGMENT_DEFAULT_GPT_MODEL
  sampling_rate: float = 1.0
  run_condition: Optional[Callable[..., bool]] = None
judgeval/trainer/config.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional, Dict, Any, TYPE_CHECKING
  import json

  if TYPE_CHECKING:
- from fireworks.llm.llm_reinforcement_step import ReinforcementAcceleratorTypeLiteral
+ from fireworks.llm.llm_reinforcement_step import ReinforcementAcceleratorTypeLiteral # type: ignore[import-not-found]


  @dataclass
judgeval/trainer/trainable_model.py CHANGED
@@ -1,4 +1,4 @@
- from fireworks import LLM
+ from fireworks import LLM # type: ignore[import-not-found]
  from .config import TrainerConfig, ModelConfig
  from typing import Optional, Dict, Any, Callable
  from .console import _model_spinner_progress, _print_model_progress
judgeval/trainer/trainer.py CHANGED
@@ -2,7 +2,7 @@ import asyncio
  import json
  import time
  from typing import Optional, Callable, Any, List, Union, Dict
- from fireworks import Dataset
+ from fireworks import Dataset # type: ignore[import-not-found]
  from .config import TrainerConfig, ModelConfig
  from .trainable_model import TrainableModel
  from judgeval.tracer import Tracer
@@ -10,7 +10,7 @@ from judgeval.tracer.exporters.store import SpanStore
  from judgeval.tracer.exporters import InMemorySpanExporter
  from judgeval.tracer.keys import AttributeKeys
  from judgeval import JudgmentClient
- from judgeval.scorers import BaseScorer, ExampleAPIScorerConfig
+ from judgeval.scorers import BaseScorer, APIScorerConfig
  from judgeval.data import Example
  from .console import _spinner_progress, _print_progress, _print_progress_update
  from judgeval.exceptions import JudgmentRuntimeError
@@ -85,7 +85,9 @@ class JudgmentTrainer:
  if not first_found and span_attributes.get(
  AttributeKeys.JUDGMENT_INPUT
  ):
- input_data = span_attributes.get(AttributeKeys.JUDGMENT_INPUT, {})
+ input_data: Any = span_attributes.get(
+ AttributeKeys.JUDGMENT_INPUT, {}
+ )
  if isinstance(input_data, dict) and "messages" in input_data:
  input_messages = input_data["messages"]
  if input_messages:
@@ -154,7 +156,7 @@
  async def generate_rollouts_and_rewards(
  self,
  agent_function: Callable[[Any], Any],
- scorers: List[Union[ExampleAPIScorerConfig, BaseScorer]],
+ scorers: List[Union[APIScorerConfig, BaseScorer]],
  prompts: List[Any],
  num_prompts_per_step: Optional[int] = None,
  num_generations_per_prompt: Optional[int] = None,
@@ -264,7 +266,7 @@
  async def run_reinforcement_learning(
  self,
  agent_function: Callable[[Any], Any],
- scorers: List[Union[ExampleAPIScorerConfig, BaseScorer]],
+ scorers: List[Union[APIScorerConfig, BaseScorer]],
  prompts: List[Any],
  ) -> ModelConfig:
  """
@@ -370,7 +372,7 @@
  async def train(
  self,
  agent_function: Callable[[Any], Any],
- scorers: List[Union[ExampleAPIScorerConfig, BaseScorer]],
+ scorers: List[Union[APIScorerConfig, BaseScorer]],
  prompts: List[Any],
  rft_provider: Optional[str] = None,
  ) -> ModelConfig:
judgeval/utils/async_utils.py CHANGED
@@ -2,13 +2,13 @@

  import asyncio
  import concurrent.futures
- from typing import Awaitable, TypeVar
+ from typing import Awaitable, TypeVar, Coroutine


  T = TypeVar("T")


- def safe_run_async(coro: Awaitable[T]) -> T: # type: ignore[type-var]
+ def safe_run_async(coro: Awaitable[T]) -> T:
  """Safely execute an async *coro* from synchronous code.

  This helper handles two common situations:
@@ -24,6 +24,8 @@ def safe_run_async(coro: Awaitable[T]) -> T: # type: ignore[type-var]
  Returns:
  The result returned by *coro*.
  """
+ if not isinstance(coro, Coroutine):
+ raise TypeError("The provided awaitable must be a coroutine.")

  try:
  asyncio.get_running_loop()
@@ -31,5 +33,7 @@ def safe_run_async(coro: Awaitable[T]) -> T: # type: ignore[type-var]
  return asyncio.run(coro)

  with concurrent.futures.ThreadPoolExecutor() as executor:
- future = executor.submit(lambda: asyncio.run(coro))
+ future: concurrent.futures.Future[T] = executor.submit(
+ lambda: asyncio.run(coro)
+ )
  return future.result()
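The annotation on `safe_run_async` stays `Awaitable[T]`, but 0.11.0 adds a runtime guard so only real coroutine objects are accepted. A small usage sketch, assuming the module path implied by the file list above:

```python
import asyncio

from judgeval.utils.async_utils import safe_run_async


async def compute_score() -> float:
    await asyncio.sleep(0)
    return 1.0


# When called from plain synchronous code there is no running event loop,
# so the helper falls through to asyncio.run(coro).
print(safe_run_async(compute_score()))  # -> 1.0

# A non-coroutine awaitable (e.g. an asyncio.Future) should now raise TypeError
# because of the isinstance(coro, Coroutine) check added above.
```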
judgeval/utils/testing.py CHANGED
@@ -26,10 +26,6 @@ def assert_test_results(scoring_results: List[ScoringResult]) -> None:
  # If the result was not successful, check each scorer_data
  for scorer_data in result.scorers_data:
  if not scorer_data.success:
- if scorer_data.name == "Tool Order":
- # Remove threshold, evaluation model for Tool Order scorer
- scorer_data.threshold = None
- scorer_data.evaluation_model = None
  test_case.append(scorer_data)
  failed_cases.append(test_case)