judgeval 0.10.1__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +4 -4
- judgeval/api/__init__.py +17 -9
- judgeval/api/api_types.py +20 -18
- judgeval/data/evaluation_run.py +10 -11
- judgeval/data/judgment_types.py +25 -14
- judgeval/data/result.py +1 -0
- judgeval/data/scorer_data.py +1 -26
- judgeval/dataset/__init__.py +17 -16
- judgeval/env.py +11 -2
- judgeval/evaluation/__init__.py +20 -63
- judgeval/integrations/langgraph/__init__.py +2 -1
- judgeval/scorers/__init__.py +0 -4
- judgeval/scorers/agent_scorer.py +15 -15
- judgeval/scorers/api_scorer.py +0 -8
- judgeval/scorers/base_scorer.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +3 -5
- judgeval/scorers/score.py +1 -1
- judgeval/tracer/__init__.py +7 -10
- judgeval/tracer/local_eval_queue.py +11 -7
- judgeval/tracer/utils.py +2 -2
- judgeval/trainer/config.py +1 -1
- judgeval/trainer/trainable_model.py +1 -1
- judgeval/trainer/trainer.py +8 -6
- judgeval/utils/async_utils.py +7 -3
- judgeval/utils/testing.py +0 -4
- {judgeval-0.10.1.dist-info → judgeval-0.11.0.dist-info}/METADATA +1 -1
- {judgeval-0.10.1.dist-info → judgeval-0.11.0.dist-info}/RECORD +34 -35
- judgeval/data/tool.py +0 -5
- {judgeval-0.10.1.dist-info → judgeval-0.11.0.dist-info}/WHEEL +0 -0
- {judgeval-0.10.1.dist-info → judgeval-0.11.0.dist-info}/entry_points.txt +0 -0
- {judgeval-0.10.1.dist-info → judgeval-0.11.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/evaluation/__init__.py
CHANGED
@@ -3,14 +3,11 @@ from __future__ import annotations
 import asyncio
 import concurrent.futures
 import time
-import orjson
-import sys
 import threading
-from typing import List,
+from typing import List, Tuple, TYPE_CHECKING
 from rich import print as rprint
 
-from judgeval.data import ScorerData, ScoringResult
-from judgeval.scorers import BaseScorer, ExampleAPIScorerConfig
+from judgeval.data import ScorerData, ScoringResult
 from judgeval.scorers.score import a_execute_scoring
 from judgeval.api import JudgmentSyncClient
 from judgeval.env import (
@@ -19,9 +16,10 @@ from judgeval.env import (
 from judgeval.exceptions import JudgmentAPIError, JudgmentRuntimeError
 from judgeval.logger import judgeval_logger
 
+from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID
 
 if TYPE_CHECKING:
-    from judgeval.data.evaluation_run import
+    from judgeval.data.evaluation_run import ExampleEvaluationRun
 
 
 def safe_run_async(coro):
@@ -49,8 +47,7 @@ def safe_run_async(coro):
 
 def log_evaluation_results(
     scoring_results: List[ScoringResult],
-    run:
-    judgment_api_key: str,
+    run: ExampleEvaluationRun,
 ) -> str:
     """
     Logs evaluation results to the Judgment API database.
@@ -65,10 +62,10 @@
         ValueError: If there's a validation error with the results
     """
     try:
-        if not
+        if not JUDGMENT_API_KEY or not JUDGMENT_ORG_ID:
             raise ValueError("API key and organization ID are required")
 
-        api_client = JudgmentSyncClient(
+        api_client = JudgmentSyncClient(JUDGMENT_API_KEY, JUDGMENT_ORG_ID)
         response = api_client.log_eval_results(
             {
                 "results": scoring_results, # type: ignore
@@ -85,41 +82,8 @@
         )
 
 
-def check_examples(
-    examples: List[Example], scorers: List[Union[ExampleAPIScorerConfig, BaseScorer]]
-) -> None:
-    """
-    Checks if the example contains the necessary parameters for the scorer.
-    """
-    prompt_user = False
-    for scorer in scorers:
-        for example in examples:
-            missing_params = []
-            for param in scorer.required_params:
-                if getattr(example, param.value) is None:
-                    missing_params.append(f"{param.value}")
-            if missing_params:
-                rprint(
-                    f"[yellow]⚠️ WARNING:[/yellow] Example is missing required parameters for scorer [bold]{scorer.score_type.value}[/bold]"
-                )
-                rprint(f"Missing parameters: {', '.join(missing_params)}")
-                rprint(
-                    f"Example: {orjson.dumps(example.model_dump(), option=orjson.OPT_INDENT_2).decode('utf-8')}"
-                )
-                rprint("-" * 40)
-                prompt_user = True
-
-    if prompt_user:
-        user_input = input("Do you want to continue? (y/n)")
-        if user_input.lower() != "y":
-            sys.exit(0)
-        else:
-            rprint("[green]Continuing...[/green]")
-
-
 def _poll_evaluation_until_complete(
-    evaluation_run:
-    judgment_api_key: str,
+    evaluation_run: ExampleEvaluationRun,
     expected_scorer_data_count: int,
     poll_interval_seconds: float = 5,
     max_failures: int = 5,
@@ -140,13 +104,15 @@ def _poll_evaluation_until_complete(
     Returns:
         List[ScoringResult]: The evaluation results
     """
-    organization_id = evaluation_run.organization_id
     project_name = evaluation_run.project_name
    experiment_run_id = evaluation_run.id
 
+    if not project_name or not experiment_run_id:
+        raise ValueError("Project name and experiment run ID are required")
+
     poll_count = 0
     exception_count = 0
-    api_client = JudgmentSyncClient(
+    api_client = JudgmentSyncClient(JUDGMENT_API_KEY, JUDGMENT_ORG_ID)
     while poll_count < max_poll_count:
         poll_count += 1
         try:
@@ -213,14 +179,13 @@ def progress_logger(stop_event, msg="Working...", interval=5):
 
 
 def run_eval(
-    evaluation_run:
-    judgment_api_key: str,
+    evaluation_run: ExampleEvaluationRun,
 ) -> List[ScoringResult]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s
 
     Args:
-        evaluation_run (
+        evaluation_run (ExampleEvaluationRun): Stores example and evaluation together for running
 
     Returns:
         List[ScoringResult]: A list of ScoringResult objects
@@ -258,16 +223,13 @@
         judgeval_logger.error(error_msg)
         raise ValueError(error_msg)
 
-    check_examples(evaluation_run.examples, evaluation_run.judgment_scorers)
     stop_event = threading.Event()
     t = threading.Thread(
         target=progress_logger, args=(stop_event, "Running evaluation...")
     )
     t.start()
     try:
-        api_client = JudgmentSyncClient(
-            judgment_api_key, evaluation_run.organization_id
-        )
+        api_client = JudgmentSyncClient(JUDGMENT_API_KEY, JUDGMENT_ORG_ID)
         response = api_client.add_to_run_eval_queue_examples(
             evaluation_run.model_dump(warnings=False) # type: ignore
         )
@@ -286,7 +248,6 @@
         )
         results, url = _poll_evaluation_until_complete(
             evaluation_run=evaluation_run,
-            judgment_api_key=judgment_api_key,
             expected_scorer_data_count=(num_scorers * len(evaluation_run.examples)),
         )
     finally:
@@ -306,7 +267,7 @@
     send_results = [
         scoring_result.model_dump(warnings=False) for scoring_result in results
     ]
-    url = log_evaluation_results(send_results, evaluation_run
+    url = log_evaluation_results(send_results, evaluation_run)
     rprint(
         f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
     )
@@ -323,27 +284,23 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
     Returns:
         None. Raises exceptions for any failed test cases.
     """
-    failed_cases: List[ScorerData] = []
+    failed_cases: List[List[ScorerData]] = []
 
     for result in scoring_results:
         if not result.success:
             # Create a test case context with all relevant fields
-            test_case:
+            test_case: List[ScorerData] = []
             if result.scorers_data:
                 # If the result was not successful, check each scorer_data
                 for scorer_data in result.scorers_data:
                     if not scorer_data.success:
-
-                        # Remove threshold, evaluation model for Tool Order scorer
-                        scorer_data.threshold = None
-                        scorer_data.evaluation_model = None
-                        test_case["failed_scorers"].append(scorer_data)
+                        test_case.append(scorer_data)
             failed_cases.append(test_case)
 
     if failed_cases:
         error_msg = "The following test cases failed: \n"
         for fail_case in failed_cases:
-            for fail_scorer in fail_case
+            for fail_scorer in fail_case:
                 error_msg += (
                     f"\nScorer Name: {fail_scorer.name}\n"
                     f"Threshold: {fail_scorer.threshold}\n"
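Net effect for callers of this module: run_eval now takes only an ExampleEvaluationRun, credentials are read from the JUDGMENT_API_KEY / JUDGMENT_ORG_ID environment variables instead of being passed in, and the interactive check_examples prompt is gone. A minimal sketch under those assumptions follows; the Example field names and the judgment_scorers keyword are inferred from other hunks in this diff rather than from verified 0.11.0 documentation.

# Hedged sketch of calling the 0.11.0 run_eval entry point.
# Assumes JUDGMENT_API_KEY and JUDGMENT_ORG_ID are exported in the environment.
from judgeval.data import Example
from judgeval.data.evaluation_run import ExampleEvaluationRun
from judgeval.evaluation import run_eval, assert_test
from judgeval.scorers.judgeval_scorers.api_scorers.faithfulness import FaithfulnessScorer

run = ExampleEvaluationRun(
    project_name="my-project",
    eval_name="smoke-test",
    examples=[Example(input="What is 2 + 2?", actual_output="4")],  # field names assumed
    judgment_scorers=[FaithfulnessScorer(threshold=0.7)],  # keyword inferred from evaluation_run.judgment_scorers above
)
results = run_eval(run)  # credentials come from the environment, not a parameter
assert_test(results)     # raises if any scorer reported a failure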
judgeval/integrations/langgraph/__init__.py
CHANGED
@@ -507,6 +507,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
             )
 
             # Extract response content
+            output: Any
             if response.generations:
                 last_generation = response.generations[-1][-1]
                 if (
@@ -547,7 +548,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
             for key, value in usage_attrs.items():
                 span.set_attribute(key, value)
 
-            self._end_span(run_id=run_id, outputs=output, **usage_attrs)
+            self._end_span(run_id=run_id, outputs=output, **usage_attrs) # type: ignore
 
         except Exception as e:
             judgeval_logger.exception(f"Error in on_llm_end: {e}")
judgeval/scorers/__init__.py
CHANGED
@@ -1,7 +1,5 @@
 from judgeval.scorers.api_scorer import (
     APIScorerConfig,
-    ExampleAPIScorerConfig,
-    TraceAPIScorerConfig,
 )
 from judgeval.scorers.base_scorer import BaseScorer
 from judgeval.scorers.judgeval_scorers.api_scorers import (
@@ -15,8 +13,6 @@ from judgeval.scorers.judgeval_scorers.api_scorers import (
 
 __all__ = [
     "APIScorerConfig",
-    "ExampleAPIScorerConfig",
-    "TraceAPIScorerConfig",
     "BaseScorer",
     "TracePromptScorer",
     "PromptScorer",
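For downstream imports, the practical effect of this hunk is that ExampleAPIScorerConfig and TraceAPIScorerConfig can no longer be imported from judgeval.scorers (both classes are deleted outright in api_scorer.py below). A hedged sketch of the adjustment:

# 0.10.1 style, no longer valid in 0.11.0:
#   from judgeval.scorers import ExampleAPIScorerConfig, TraceAPIScorerConfig

# 0.11.0 style: the single base config class remains exported.
from judgeval.scorers import APIScorerConfig


def is_api_scorer(scorer: object) -> bool:
    # Example- and trace-style API scorers now share the one APIScorerConfig type.
    return isinstance(scorer, APIScorerConfig)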
judgeval/scorers/agent_scorer.py
CHANGED
@@ -1,17 +1,17 @@
-from judgeval.scorers.base_scorer import BaseScorer
-from judgeval.data.judgment_types import Trace as JudgmentTrace
-from typing import List, Optional
-from abc import abstractmethod
+# from judgeval.scorers.base_scorer import BaseScorer
+# from judgeval.data.judgment_types import Trace as JudgmentTrace
+# from typing import List, Optional
+# from abc import abstractmethod
 
 
-class TraceScorer(BaseScorer):
-    @abstractmethod
-    async def a_score_trace(
-        self, trace: JudgmentTrace, tools: Optional[List] = None, *args, **kwargs
-    ) -> float:
-        """
-        Asynchronously measures the score on a trace
-        """
-        raise NotImplementedError(
-            "You must implement the `a_score_trace` method in your custom scorer"
-        )
+# class TraceScorer(BaseScorer):
+#     @abstractmethod
+#     async def a_score_trace(
+#         self, trace: JudgmentTrace, tools: Optional[List] = None, *args, **kwargs
+#     ) -> float:
+#         """
+#         Asynchronously measures the score on a trace
+#         """
+#         raise NotImplementedError(
+#             "You must implement the `a_score_trace` method in your custom scorer"
+#         )
judgeval/scorers/api_scorer.py
CHANGED
@@ -63,11 +63,3 @@ class APIScorerConfig(BaseModel):
 
     def __str__(self):
         return f"JudgmentScorer(score_type={self.score_type.value}, threshold={self.threshold})"
-
-
-class ExampleAPIScorerConfig(APIScorerConfig):
-    pass
-
-
-class TraceAPIScorerConfig(APIScorerConfig):
-    pass
judgeval/scorers/base_scorer.py
CHANGED
@@ -27,7 +27,7 @@ class BaseScorer(BaseModel):
     threshold: float = 0.5
 
     # name of your scorer (Faithfulness, PromptScorer-randomslug)
-    name:
+    name: str = ""
 
     # The name of the class of the scorer
     class_name: Optional[str] = None
@@ -42,7 +42,7 @@ class BaseScorer(BaseModel):
     using_native_model: Optional[bool] = None
 
     # Whether the test case passed or failed
-    success:
+    success: bool = False
 
     # The name of the model used to evaluate the test case
     model: Optional[str] = None
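The two BaseScorer hunks give name and success concrete defaults ("" and False). A minimal sketch of what that permits, assuming the remaining BaseScorer fields (not shown in these hunks) also carry defaults:

from judgeval.scorers import BaseScorer


class LengthScorer(BaseScorer):
    """Toy subclass: relies on the new defaults, so name/success need not be declared."""
    threshold: float = 0.5


scorer = LengthScorer()
print(scorer.name)      # "" under the new default
print(scorer.success)   # False under the new default
scorer.name = "Length"  # both remain plain mutable pydantic fields
scorer.success = True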
judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py
CHANGED
@@ -6,13 +6,13 @@ TODO add link to docs page for this scorer
 """
 
 # Internal imports
-from judgeval.scorers.api_scorer import
+from judgeval.scorers.api_scorer import APIScorerConfig
 from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
 from typing import List
 
 
-class AnswerCorrectnessScorer(
+class AnswerCorrectnessScorer(APIScorerConfig):
     score_type: APIScorerType = APIScorerType.ANSWER_CORRECTNESS
     required_params: List[ExampleParams] = [
         ExampleParams.INPUT,

judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py
CHANGED
@@ -1,10 +1,10 @@
-from judgeval.scorers.api_scorer import
+from judgeval.scorers.api_scorer import APIScorerConfig
 from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
 from typing import List
 
 
-class AnswerRelevancyScorer(
+class AnswerRelevancyScorer(APIScorerConfig):
     score_type: APIScorerType = APIScorerType.ANSWER_RELEVANCY
     required_params: List[ExampleParams] = [
         ExampleParams.INPUT,

judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py
CHANGED
@@ -6,13 +6,13 @@ TODO add link to docs page for this scorer
 """
 
 # Internal imports
-from judgeval.scorers.api_scorer import
+from judgeval.scorers.api_scorer import APIScorerConfig
 from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
 from typing import List
 
 
-class FaithfulnessScorer(
+class FaithfulnessScorer(APIScorerConfig):
     score_type: APIScorerType = APIScorerType.FAITHFULNESS
     required_params: List[ExampleParams] = [
         ExampleParams.INPUT,

judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py
CHANGED
@@ -6,12 +6,12 @@ TODO add link to docs page for this scorer
 """
 
 # Internal imports
-from judgeval.scorers.api_scorer import
+from judgeval.scorers.api_scorer import APIScorerConfig
 from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
 
 
-class InstructionAdherenceScorer(
+class InstructionAdherenceScorer(APIScorerConfig):
     def __init__(self, threshold: float):
         super().__init__(
             threshold=threshold,

judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py
CHANGED
@@ -1,7 +1,5 @@
 from judgeval.scorers.api_scorer import (
     APIScorerConfig,
-    ExampleAPIScorerConfig,
-    TraceAPIScorerConfig,
 )
 from judgeval.constants import APIScorerType
 from typing import Dict, Any, Optional
@@ -55,7 +53,7 @@ def fetch_prompt_scorer(
 ):
     client = JudgmentSyncClient(judgment_api_key, organization_id)
     try:
-        scorer_config = client.
+        scorer_config = client.fetch_scorers({"names": [name]})["scorers"][0]
         scorer_config.pop("created_at")
         scorer_config.pop("updated_at")
         return scorer_config
@@ -284,9 +282,9 @@ class BasePromptScorer(ABC, APIScorerConfig):
         return base
 
 
-class PromptScorer(BasePromptScorer,
+class PromptScorer(BasePromptScorer, APIScorerConfig):
     pass
 
 
-class TracePromptScorer(BasePromptScorer,
+class TracePromptScorer(BasePromptScorer, APIScorerConfig):
     pass
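The fetch_prompt_scorer hunk pins down the client call shape used in 0.11.0: fetch_scorers takes a {"names": [...]} payload and returns a mapping with a "scorers" list whose entries carry created_at / updated_at timestamps that get stripped before use. A standalone sketch of that call; the key and organization values here are placeholders, and valid credentials are assumed:

from judgeval.api import JudgmentSyncClient

client = JudgmentSyncClient("my-api-key", "my-org-id")  # positional (api key, org id), as in this diff
payload = client.fetch_scorers({"names": ["my-prompt-scorer"]})
scorer_config = payload["scorers"][0]
scorer_config.pop("created_at")   # timestamps are dropped before the config is used, as above
scorer_config.pop("updated_at")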
judgeval/scorers/score.py
CHANGED
judgeval/tracer/__init__.py
CHANGED
@@ -43,8 +43,8 @@ from judgeval.env import (
     JUDGMENT_ORG_ID,
 )
 from judgeval.logger import judgeval_logger
-from judgeval.scorers.api_scorer import
-from judgeval.scorers.
+from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.scorers.example_scorer import ExampleScorer
 from judgeval.tracer.constants import JUDGEVAL_TRACER_INSTRUMENTING_MODULE_NAME
 from judgeval.tracer.managers import (
     sync_span_context,
@@ -328,7 +328,7 @@ class Tracer:
         run_condition = scorer_config.run_condition
         sampling_rate = scorer_config.sampling_rate
 
-        if not isinstance(scorer, (
+        if not isinstance(scorer, (APIScorerConfig)):
             judgeval_logger.error(
                 "Scorer must be an instance of TraceAPIScorerConfig, got %s, skipping evaluation."
                 % type(scorer)
@@ -358,7 +358,6 @@ class Tracer:
         eval_run_name = f"async_trace_evaluate_{span_id}"
 
         eval_run = TraceEvaluationRun(
-            organization_id=self.organization_id,
             project_name=self.project_name,
             eval_name=eval_run_name,
             scorers=[scorer],
@@ -862,7 +861,7 @@ class Tracer:
         self,
         /,
         *,
-        scorer: Union[
+        scorer: Union[APIScorerConfig, ExampleScorer],
         example: Example,
         model: str = JUDGMENT_DEFAULT_GPT_MODEL,
         sampling_rate: float = 1.0,
@@ -871,7 +870,7 @@ class Tracer:
             judgeval_logger.info("Evaluation is not enabled, skipping evaluation")
             return
 
-        if not isinstance(scorer, (
+        if not isinstance(scorer, (APIScorerConfig, ExampleScorer)):
             judgeval_logger.error(
                 "Scorer must be an instance of ExampleAPIScorerConfig or BaseScorer, got %s, skipping evaluation."
                 % type(scorer)
@@ -902,13 +901,12 @@ class Tracer:
         span_context = self.get_current_span().get_span_context()
         trace_id = format(span_context.trace_id, "032x")
         span_id = format(span_context.span_id, "016x")
-        hosted_scoring = isinstance(scorer,
-            isinstance(scorer,
+        hosted_scoring = isinstance(scorer, APIScorerConfig) or (
+            isinstance(scorer, ExampleScorer) and scorer.server_hosted
         )
         eval_run_name = f"async_evaluate_{span_id}" # note this name doesnt matter because we don't save the experiment only the example and scorer_data
         if hosted_scoring:
             eval_run = ExampleEvaluationRun(
-                organization_id=self.organization_id,
                 project_name=self.project_name,
                 eval_name=eval_run_name,
                 examples=[example],
@@ -923,7 +921,6 @@ class Tracer:
         else:
             # Handle custom scorers using local evaluation queue
             eval_run = ExampleEvaluationRun(
-                organization_id=self.organization_id,
                 project_name=self.project_name,
                 eval_name=eval_run_name,
                 examples=[example],
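The hosted_scoring hunk is the routing rule this release introduces: API scorer configs, plus custom ExampleScorers flagged as server hosted, take the hosted evaluation path, while anything else falls back to the local evaluation queue. A standalone restatement of that predicate, using only names that appear in the hunks above:

from judgeval.scorers.api_scorer import APIScorerConfig
from judgeval.scorers.example_scorer import ExampleScorer


def is_hosted(scorer: object) -> bool:
    # Mirrors the 0.11.0 Tracer logic: hosted if it is an API scorer config,
    # or a custom ExampleScorer explicitly marked server_hosted.
    return isinstance(scorer, APIScorerConfig) or (
        isinstance(scorer, ExampleScorer) and scorer.server_hosted
    )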
judgeval/tracer/local_eval_queue.py
CHANGED
@@ -13,7 +13,7 @@ import time
 from judgeval.logger import judgeval_logger
 from judgeval.env import JUDGMENT_MAX_CONCURRENT_EVALUATIONS
 from judgeval.data import ScoringResult
-from judgeval.data.evaluation_run import
+from judgeval.data.evaluation_run import ExampleEvaluationRun
 from judgeval.utils.async_utils import safe_run_async
 from judgeval.scorers.score import a_execute_scoring
 from judgeval.api import JudgmentSyncClient
@@ -34,7 +34,7 @@ class LocalEvaluationQueue:
     ):
         if num_workers <= 0:
             raise ValueError("num_workers must be a positive integer.")
-        self._queue: queue.Queue[Optional[
+        self._queue: queue.Queue[Optional[ExampleEvaluationRun]] = queue.Queue()
         self._max_concurrent = max_concurrent
         self._num_workers = num_workers # Number of worker threads
         self._worker_threads: List[threading.Thread] = []
@@ -44,11 +44,11 @@ class LocalEvaluationQueue:
             organization_id=JUDGMENT_ORG_ID,
         )
 
-    def enqueue(self, evaluation_run:
+    def enqueue(self, evaluation_run: ExampleEvaluationRun) -> None:
         """Add evaluation run to the queue."""
         self._queue.put(evaluation_run)
 
-    def _process_run(self, evaluation_run:
+    def _process_run(self, evaluation_run: ExampleEvaluationRun) -> List[ScoringResult]:
         """Execute evaluation run locally and return results."""
 
         if not evaluation_run.custom_scorers:
@@ -70,7 +70,9 @@ class LocalEvaluationQueue:
 
     def run_all(
         self,
-        callback: Optional[
+        callback: Optional[
+            Callable[[ExampleEvaluationRun, List[ScoringResult]], None]
+        ] = None,
     ) -> None:
         """Process all queued runs synchronously.
 
@@ -134,7 +136,9 @@ class LocalEvaluationQueue:
 
     def start_worker(
         self,
-        callback: Optional[
+        callback: Optional[
+            Callable[[ExampleEvaluationRun, List[ScoringResult]], None]
+        ] = None,
     ) -> Optional[threading.Thread]:
         """Start a single background thread to process runs (backward compatibility).
 
@@ -144,7 +148,7 @@ class LocalEvaluationQueue:
         Returns:
             The started thread, or None if no threads were started.
         """
-        threads = self.start_workers(
+        threads = self.start_workers()
         return threads[0] if threads else None
 
     def wait_for_completion(self, timeout: Optional[float] = None) -> bool:
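A hedged usage sketch of the queue with its newly typed API; the method names, the num_workers check, and the callback signature come from the hunks above, while the Example field names and the custom_scorers keyword are assumptions carried over from the earlier sketches:

from typing import List

from judgeval.data import Example, ScoringResult
from judgeval.data.evaluation_run import ExampleEvaluationRun
from judgeval.tracer.local_eval_queue import LocalEvaluationQueue


def on_done(run: ExampleEvaluationRun, results: List[ScoringResult]) -> None:
    # The callback now receives a typed ExampleEvaluationRun in 0.11.0.
    print(run.eval_name, [r.success for r in results])


run = ExampleEvaluationRun(
    project_name="my-project",
    eval_name="local-run",
    examples=[Example(input="ping", actual_output="pong")],  # field names assumed
    custom_scorers=[],  # keyword taken from _process_run above; your BaseScorer subclasses go here
)

queue = LocalEvaluationQueue(num_workers=2)  # num_workers must be positive, per the check above
queue.enqueue(run)
queue.start_worker(callback=on_done)  # or start_workers() for the full worker pool
queue.wait_for_completion(timeout=60.0)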
judgeval/tracer/utils.py
CHANGED
@@ -2,7 +2,7 @@ from typing import Any
 from opentelemetry.trace import Span
 from pydantic import BaseModel
 from typing import Callable, Optional
-from judgeval.scorers.api_scorer import
+from judgeval.scorers.api_scorer import APIScorerConfig
 from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
 
 
@@ -14,7 +14,7 @@ def set_span_attribute(span: Span, name: str, value: Any):
 
 
 class TraceScorerConfig(BaseModel):
-    scorer:
+    scorer: APIScorerConfig
     model: str = JUDGMENT_DEFAULT_GPT_MODEL
     sampling_rate: float = 1.0
     run_condition: Optional[Callable[..., bool]] = None
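With this change the scorer field of TraceScorerConfig is typed as APIScorerConfig, so only API scorer configs fit. A short hedged sketch; only the field names are confirmed by the hunk, and the FaithfulnessScorer construction is an assumption:

from judgeval.scorers.judgeval_scorers.api_scorers.faithfulness import FaithfulnessScorer
from judgeval.tracer.utils import TraceScorerConfig

config = TraceScorerConfig(
    scorer=FaithfulnessScorer(threshold=0.7),  # must be an APIScorerConfig instance
    sampling_rate=0.25,  # model and run_condition keep their defaults
)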
judgeval/trainer/config.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional, Dict, Any, TYPE_CHECKING
 import json
 
 if TYPE_CHECKING:
-    from fireworks.llm.llm_reinforcement_step import ReinforcementAcceleratorTypeLiteral
+    from fireworks.llm.llm_reinforcement_step import ReinforcementAcceleratorTypeLiteral # type: ignore[import-not-found]
 
 
 @dataclass
judgeval/trainer/trainer.py
CHANGED
@@ -2,7 +2,7 @@ import asyncio
 import json
 import time
 from typing import Optional, Callable, Any, List, Union, Dict
-from fireworks import Dataset
+from fireworks import Dataset # type: ignore[import-not-found]
 from .config import TrainerConfig, ModelConfig
 from .trainable_model import TrainableModel
 from judgeval.tracer import Tracer
@@ -10,7 +10,7 @@ from judgeval.tracer.exporters.store import SpanStore
 from judgeval.tracer.exporters import InMemorySpanExporter
 from judgeval.tracer.keys import AttributeKeys
 from judgeval import JudgmentClient
-from judgeval.scorers import BaseScorer,
+from judgeval.scorers import BaseScorer, APIScorerConfig
 from judgeval.data import Example
 from .console import _spinner_progress, _print_progress, _print_progress_update
 from judgeval.exceptions import JudgmentRuntimeError
@@ -85,7 +85,9 @@ class JudgmentTrainer:
                 if not first_found and span_attributes.get(
                     AttributeKeys.JUDGMENT_INPUT
                 ):
-                    input_data = span_attributes.get(
+                    input_data: Any = span_attributes.get(
+                        AttributeKeys.JUDGMENT_INPUT, {}
+                    )
                     if isinstance(input_data, dict) and "messages" in input_data:
                         input_messages = input_data["messages"]
                         if input_messages:
@@ -154,7 +156,7 @@ class JudgmentTrainer:
     async def generate_rollouts_and_rewards(
         self,
         agent_function: Callable[[Any], Any],
-        scorers: List[Union[
+        scorers: List[Union[APIScorerConfig, BaseScorer]],
         prompts: List[Any],
         num_prompts_per_step: Optional[int] = None,
         num_generations_per_prompt: Optional[int] = None,
@@ -264,7 +266,7 @@ class JudgmentTrainer:
     async def run_reinforcement_learning(
         self,
         agent_function: Callable[[Any], Any],
-        scorers: List[Union[
+        scorers: List[Union[APIScorerConfig, BaseScorer]],
         prompts: List[Any],
     ) -> ModelConfig:
         """
@@ -370,7 +372,7 @@ class JudgmentTrainer:
     async def train(
         self,
         agent_function: Callable[[Any], Any],
-        scorers: List[Union[
+        scorers: List[Union[APIScorerConfig, BaseScorer]],
         prompts: List[Any],
         rft_provider: Optional[str] = None,
     ) -> ModelConfig:
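The trainer's public coroutines now annotate scorers as List[Union[APIScorerConfig, BaseScorer]]. A small, heavily hedged sketch of building arguments that satisfy those annotations; the AnswerRelevancyScorer construction is an assumption, and the JudgmentTrainer constructor is not part of this diff, so no trainer is instantiated here:

from typing import Any, Callable, List, Union

from judgeval.scorers import APIScorerConfig, BaseScorer
from judgeval.scorers.judgeval_scorers.api_scorers.answer_relevancy import AnswerRelevancyScorer

# The scorer lists passed to generate_rollouts_and_rewards / run_reinforcement_learning / train
# are now typed like this:
Scorers = List[Union[APIScorerConfig, BaseScorer]]

scorers: Scorers = [AnswerRelevancyScorer(threshold=0.5)]
agent_function: Callable[[Any], Any] = lambda prompt: f"echo: {prompt}"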
judgeval/utils/async_utils.py
CHANGED
@@ -2,13 +2,13 @@
 
 import asyncio
 import concurrent.futures
-from typing import Awaitable, TypeVar
+from typing import Awaitable, TypeVar, Coroutine
 
 
 T = TypeVar("T")
 
 
-def safe_run_async(coro: Awaitable[T]) -> T:
+def safe_run_async(coro: Awaitable[T]) -> T:
     """Safely execute an async *coro* from synchronous code.
 
     This helper handles two common situations:
@@ -24,6 +24,8 @@ def safe_run_async(coro: Awaitable[T]) -> T: # type: ignore[type-var]
     Returns:
         The result returned by *coro*.
     """
+    if not isinstance(coro, Coroutine):
+        raise TypeError("The provided awaitable must be a coroutine.")
 
     try:
         asyncio.get_running_loop()
@@ -31,5 +33,7 @@ def safe_run_async(coro: Awaitable[T]) -> T: # type: ignore[type-var]
         return asyncio.run(coro)
 
     with concurrent.futures.ThreadPoolExecutor() as executor:
-        future = executor.submit(
+        future: concurrent.futures.Future[T] = executor.submit(
+            lambda: asyncio.run(coro)
+        )
         return future.result()
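safe_run_async now rejects awaitables that are not coroutine objects before doing any scheduling, and the thread-pool fallback wraps the coroutine in its own asyncio.run call. A small sketch of the observable behavior; the error message text is taken from the hunk above:

import asyncio

from judgeval.utils.async_utils import safe_run_async


async def add(a: int, b: int) -> int:
    await asyncio.sleep(0)
    return a + b


class NotACoroutine:
    """An awaitable that is not a coroutine object."""

    def __await__(self):
        yield


print(safe_run_async(add(2, 3)))  # 5, whether or not an event loop is already running

try:
    safe_run_async(NotACoroutine())
except TypeError as exc:
    print(exc)  # "The provided awaitable must be a coroutine."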
judgeval/utils/testing.py
CHANGED
@@ -26,10 +26,6 @@ def assert_test_results(scoring_results: List[ScoringResult]) -> None:
                 # If the result was not successful, check each scorer_data
                 for scorer_data in result.scorers_data:
                     if not scorer_data.success:
-                        if scorer_data.name == "Tool Order":
-                            # Remove threshold, evaluation model for Tool Order scorer
-                            scorer_data.threshold = None
-                            scorer_data.evaluation_model = None
                         test_case.append(scorer_data)
             failed_cases.append(test_case)
 