judgeval 0.0.11__py3-none-any.whl → 0.22.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of judgeval might be problematic.
- judgeval/__init__.py +177 -12
- judgeval/api/__init__.py +519 -0
- judgeval/api/api_types.py +407 -0
- judgeval/cli.py +79 -0
- judgeval/constants.py +76 -47
- judgeval/data/__init__.py +3 -3
- judgeval/data/evaluation_run.py +125 -0
- judgeval/data/example.py +15 -56
- judgeval/data/judgment_types.py +450 -0
- judgeval/data/result.py +29 -73
- judgeval/data/scorer_data.py +29 -62
- judgeval/data/scripts/fix_default_factory.py +23 -0
- judgeval/data/scripts/openapi_transform.py +123 -0
- judgeval/data/trace.py +121 -0
- judgeval/dataset/__init__.py +264 -0
- judgeval/env.py +52 -0
- judgeval/evaluation/__init__.py +344 -0
- judgeval/exceptions.py +27 -0
- judgeval/integrations/langgraph/__init__.py +13 -0
- judgeval/integrations/openlit/__init__.py +50 -0
- judgeval/judges/__init__.py +2 -3
- judgeval/judges/base_judge.py +2 -3
- judgeval/judges/litellm_judge.py +100 -20
- judgeval/judges/together_judge.py +101 -20
- judgeval/judges/utils.py +20 -24
- judgeval/logger.py +62 -0
- judgeval/prompt/__init__.py +330 -0
- judgeval/scorers/__init__.py +18 -25
- judgeval/scorers/agent_scorer.py +17 -0
- judgeval/scorers/api_scorer.py +45 -41
- judgeval/scorers/base_scorer.py +83 -38
- judgeval/scorers/example_scorer.py +17 -0
- judgeval/scorers/exceptions.py +1 -0
- judgeval/scorers/judgeval_scorers/__init__.py +0 -148
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +19 -17
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +13 -19
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +12 -19
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +13 -19
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +15 -0
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +327 -0
- judgeval/scorers/score.py +77 -306
- judgeval/scorers/utils.py +4 -199
- judgeval/tracer/__init__.py +1122 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +40 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +59 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +63 -0
- judgeval/tracer/llm/__init__.py +7 -0
- judgeval/tracer/llm/config.py +78 -0
- judgeval/tracer/llm/constants.py +9 -0
- judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
- judgeval/tracer/llm/llm_anthropic/config.py +6 -0
- judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
- judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
- judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
- judgeval/tracer/llm/llm_google/__init__.py +3 -0
- judgeval/tracer/llm/llm_google/config.py +6 -0
- judgeval/tracer/llm/llm_google/generate_content.py +127 -0
- judgeval/tracer/llm/llm_google/wrapper.py +30 -0
- judgeval/tracer/llm/llm_openai/__init__.py +3 -0
- judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
- judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
- judgeval/tracer/llm/llm_openai/config.py +6 -0
- judgeval/tracer/llm/llm_openai/responses.py +506 -0
- judgeval/tracer/llm/llm_openai/utils.py +42 -0
- judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
- judgeval/tracer/llm/llm_together/__init__.py +3 -0
- judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
- judgeval/tracer/llm/llm_together/config.py +6 -0
- judgeval/tracer/llm/llm_together/wrapper.py +52 -0
- judgeval/tracer/llm/providers.py +19 -0
- judgeval/tracer/managers.py +167 -0
- judgeval/tracer/processors/__init__.py +220 -0
- judgeval/tracer/utils.py +19 -0
- judgeval/trainer/__init__.py +14 -0
- judgeval/trainer/base_trainer.py +122 -0
- judgeval/trainer/config.py +128 -0
- judgeval/trainer/console.py +144 -0
- judgeval/trainer/fireworks_trainer.py +396 -0
- judgeval/trainer/trainable_model.py +243 -0
- judgeval/trainer/trainer.py +70 -0
- judgeval/utils/async_utils.py +39 -0
- judgeval/utils/decorators/__init__.py +0 -0
- judgeval/utils/decorators/dont_throw.py +37 -0
- judgeval/utils/decorators/use_once.py +13 -0
- judgeval/utils/file_utils.py +97 -0
- judgeval/utils/guards.py +36 -0
- judgeval/utils/meta.py +27 -0
- judgeval/utils/project.py +15 -0
- judgeval/utils/serialize.py +253 -0
- judgeval/utils/testing.py +70 -0
- judgeval/utils/url.py +10 -0
- judgeval/utils/version_check.py +28 -0
- judgeval/utils/wrappers/README.md +3 -0
- judgeval/utils/wrappers/__init__.py +15 -0
- judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
- judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
- judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
- judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
- judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
- judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
- judgeval/utils/wrappers/py.typed +0 -0
- judgeval/utils/wrappers/utils.py +35 -0
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- judgeval-0.22.2.dist-info/METADATA +265 -0
- judgeval-0.22.2.dist-info/RECORD +112 -0
- judgeval-0.22.2.dist-info/entry_points.txt +2 -0
- judgeval/clients.py +0 -39
- judgeval/common/__init__.py +0 -8
- judgeval/common/exceptions.py +0 -28
- judgeval/common/logger.py +0 -189
- judgeval/common/tracer.py +0 -798
- judgeval/common/utils.py +0 -763
- judgeval/data/api_example.py +0 -111
- judgeval/data/datasets/__init__.py +0 -5
- judgeval/data/datasets/dataset.py +0 -286
- judgeval/data/datasets/eval_dataset_client.py +0 -193
- judgeval/data/datasets/ground_truth.py +0 -54
- judgeval/data/datasets/utils.py +0 -74
- judgeval/evaluation_run.py +0 -132
- judgeval/judges/mixture_of_judges.py +0 -248
- judgeval/judgment_client.py +0 -354
- judgeval/run_evaluation.py +0 -439
- judgeval/scorers/judgeval_scorer.py +0 -140
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -22
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -32
- judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
- judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -54
- judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -24
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -277
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -325
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -263
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -550
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -157
- judgeval/scorers/prompt_scorer.py +0 -439
- judgeval-0.0.11.dist-info/METADATA +0 -36
- judgeval-0.0.11.dist-info/RECORD +0 -84
- {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
- {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
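The listing above implies a broad package restructuring: the `judgeval/common/*` modules are removed in favor of top-level modules (`judgeval/logger.py`, `judgeval/tracer/`), and `judgeval/data/datasets/` is replaced by `judgeval/dataset/`. As a rough, hedged sketch based only on the file paths above, import sites would move along these lines; `judgeval_logger` is confirmed by the score.py diff below, while `Tracer` and `Dataset` are assumed symbol names used purely for illustration.

```python
# Illustrative import-path moves implied by the file list (0.0.11 -> 0.22.2).
# Old locations (files removed in 0.22.2):
#   from judgeval.common.logger import debug, error, warning, info
#   from judgeval.common.tracer import Tracer      # symbol name assumed
#   from judgeval.data.datasets import Dataset     # symbol name assumed
# New locations (files added in 0.22.2):
from judgeval.logger import judgeval_logger  # judgeval/logger.py (used in score.py below)
from judgeval.tracer import Tracer           # judgeval/tracer/__init__.py (symbol assumed)
from judgeval.dataset import Dataset         # judgeval/dataset/__init__.py (symbol assumed)
```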
judgeval/scorers/score.py CHANGED

@@ -1,301 +1,104 @@
 """
-Infrastructure for executing evaluations of `Example`s using one or more `
+Infrastructure for executing evaluations of `Example`s using one or more `ExampleScorer`s.
 """

-
 import asyncio
-import time
+import time
 from tqdm.asyncio import tqdm_asyncio
 from typing import List, Union, Optional, Callable
-from rich.progress import Progress, SpinnerColumn, TextColumn

 from judgeval.data import (
-    Example,
+    Example,
     ScoringResult,
     generate_scoring_result,
-    create_process_example,
     create_scorer_data,
 )
-from judgeval.scorers import
-from judgeval.scorers.utils import clone_scorers
-from judgeval.
-from judgeval.common.logger import example_logging_context, debug, error, warning, info
+from judgeval.scorers.example_scorer import ExampleScorer
+from judgeval.scorers.utils import clone_scorers
+from judgeval.logger import judgeval_logger
 from judgeval.judges import JudgevalJudge
+from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
+

 async def safe_a_score_example(
-    scorer:
+    scorer: ExampleScorer,
     example: Example,
-    ignore_errors: bool,
-    skip_on_missing_params: bool,
 ):
     """
     Scoring task function when not using a progress indicator!
-    "Safely" scores an `Example` using a `
+    "Safely" scores an `Example` using a `ExampleScorer` by gracefully handling any exceptions that may occur.

     Args:
-        scorer (
+        scorer (ExampleScorer): The `ExampleScorer` to use for scoring the example.
         example (Example): The `Example` to be scored.
-
-        ignore_errors (bool): Whether to ignore errors during the evaluation.
-            If set to false, any error will be raised and stop the evaluation.
-            If set to true, the error will be stored in the `error` attribute of the `JudgevalScorer` and the `success` attribute will be set to False.
-
-        skip_on_missing_params (bool): Whether to skip the test case if required parameters are missing.
     """
-    debug(f"Starting safe_a_score_example for example {example.example_id}")
     try:
-        await scorer.a_score_example(example
-
-
-
-
-
-
-
+        score = await scorer.a_score_example(example)
+        if score is None:
+            raise Exception("a_score_example need to return a score")
+        elif score < 0:
+            judgeval_logger.warning("score cannot be less than 0 , setting to 0")
+            score = 0
+        elif score > 1:
+            judgeval_logger.warning("score cannot be greater than 1 , setting to 1")
+            score = 1
         else:
-
-
-            scorer.success = False
-            with example_logging_context(example.timestamp, example.example_id):
-                warning(f"Ignoring errors for example {example.example_id}: {str(e)} due to missing parameters")
-        else:  # Raise the error and stop the evaluation
-            with example_logging_context(example.timestamp, example.example_id):
-                error(f"Stopping example {example.example_id}: {str(e)} due to missing parameters")
-            raise
-    except TypeError:  # in case a_score_example does not accept _show_indicator
-        try:
-            await scorer.a_score_example(example)
-        except MissingTestCaseParamsError as e:
-            if skip_on_missing_params:
-                scorer.skipped = True
-                with example_logging_context(example.timestamp, example.example_id):
-                    warning(f"Skipping example {example.example_id} due to missing parameters")
-                return
-            else:
-                if ignore_errors:
-                    scorer.error = str(e)
-                    scorer.success = False
-                    with example_logging_context(example.timestamp, example.example_id):
-                        warning(f"Ignoring errors for example {example.example_id}: {str(e)} due to missing parameters")
-                else:
-                    with example_logging_context(example.timestamp, example.example_id):
-                        error(f"Stopping example {example.example_id}: {str(e)} due to missing parameters")
-                    raise
+            scorer.score = score
+            scorer.success = scorer.success_check()
     except Exception as e:
-
-
-
-
-
-        else:
-            with example_logging_context(example.timestamp, example.example_id):
-                error(f"Stopping example {example.example_id}: {str(e)}")
-            raise
-
-
-async def score_task(
-    task_id: int,
-    progress: Progress,
-    scorer: JudgevalScorer,
-    example: Example,
-    ignore_errors: bool = True,
-    skip_on_missing_params: bool = True,
-):
-    """
-    Task function for asynchronously measuring a given example using a JudgevalScorer.
-
-    Args:
-        task_id (int): The ID of the task being measured.
-        progress (Progress): An instance of the Progress class to track task progress.
-        scorer (JudgevalScorer): An instance of the JudgevalScorer class used to score the example.
-        example (Example): The example to be scored.
-        ignore_errors (bool, optional): Whether to ignore errors during scoring. Defaults to True.
-        skip_on_missing_params (bool, optional): Whether to skip scoring if there are missing parameters. Defaults to True.
-
-    Raises:
-        MissingTestCaseParamsError: If required test case parameters are missing and skip_on_missing_params is False.
-        Exception: If an unexpected error occurs and ignore_errors is False.
-
-    Returns:
-        None
-    """
-    while not progress.finished:
-        start_time = time.perf_counter()
-
-        try:
-            await scorer.a_score_example(example, _show_indicator=False)
-            finish_text = "Completed"
-        except MissingTestCaseParamsError as e:
-            if skip_on_missing_params:
-                scorer.skipped = True
-                with example_logging_context(example.timestamp, example.example_id):
-                    debug(f"Skipping example {example.example_id} due to missing parameters")
-                return
-            else:
-                if ignore_errors:
-                    scorer.error = str(e)
-                    scorer.success = False  # Override success
-                    finish_text = "Failed"
-                else:
-                    with example_logging_context(example.timestamp, example.example_id):
-                        error(f"Stopping example {example.example_id}: {str(e)} due to missing parameters")
-                    raise
-        except TypeError:
-            try:
-                await scorer.a_score_example(example)
-                finish_text = "Completed"
-            except MissingTestCaseParamsError as e:
-                if skip_on_missing_params:
-                    scorer.skipped = True
-                    with example_logging_context(example.timestamp, example.example_id):
-                        debug(f"Skipping example {example.example_id} due to missing parameters")
-                    return
-                else:
-                    if ignore_errors:
-                        scorer.error = str(e)
-                        scorer.success = False  # Override success
-                        finish_text = "Failed"
-                    else:
-                        with example_logging_context(example.timestamp, example.example_id):
-                            error(f"Stopping example {example.example_id}: {str(e)} due to missing parameters")
-                        raise
-        except Exception as e:
-            if ignore_errors:
-                scorer.error = str(e)
-                scorer.success = False  # Override success
-                finish_text = "Failed"
-                with example_logging_context(example.timestamp, example.example_id):
-                    warning(f"Ignoring errors for example {example.example_id}: {str(e)}")
-            else:
-                with example_logging_context(example.timestamp, example.example_id):
-                    error(f"Stopping example {example.example_id}: {str(e)}")
-                raise
-
-        end_time = time.perf_counter()
-        time_taken = format(end_time - start_time, ".2f")
-        progress.update(task_id, advance=100)  # Mark task as complete
-        progress.update(
-            task_id,
-            description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]{finish_text}! ({time_taken}s)",
-        )
-        break
-
-
-async def score_with_indicator(
-    scorers: List[JudgevalScorer],
-    example: Example,
-    ignore_errors: bool,
-    skip_on_missing_params: bool,
-    show_indicator: bool,
-):
-    """
-    Scores an example using a list of JudgevalScorers, optionally displaying a progress indicator.
-
-    Args:
-        scorers (List[JudgevalScorer]): A list of JudgevalScorer objects to evaluate the example.
-        example (Example): The example to be scored.
-        ignore_errors (bool): If True, errors during scoring will be ignored.
-        skip_on_missing_params (bool): If True, scoring will be skipped if required parameters are missing.
-        show_indicator (bool): If True, a progress indicator will be displayed during scoring.
-
-    Returns:
-        None
-
-    Raises:
-        Any exceptions raised by the scoring functions, unless `ignore_errors` is True.
-    """
-    if show_indicator:
-        with Progress(
-            SpinnerColumn(style="rgb(106,0,255)"),
-            TextColumn("[progress.description]{task.description}"),
-            transient=True,
-        ) as progress:
-            tasks = []
-            for scorer in scorers:
-                task_id = progress.add_task(
-                    description=scorer_console_msg(
-                        scorer, async_mode=True
-                    ),
-                    total=100,
-                )  # Add task to progress bar
-                tasks.append(
-                    score_task(
-                        task_id,
-                        progress,
-                        scorer,
-                        example,
-                        ignore_errors,
-                        skip_on_missing_params,
-                    )  # Create and execute task to score the example with a single scorer
-                )
-            await asyncio.gather(*tasks)
-    else:
-        tasks = [
-            safe_a_score_example(
-                scorer, example, ignore_errors, skip_on_missing_params
-            )
-            for scorer in scorers
-        ]
-
-        await asyncio.gather(*tasks)
+        judgeval_logger.error(f"Error during scoring: {str(e)}")
+        scorer.error = str(e)
+        scorer.success = False
+        scorer.score = 0
+        return


 async def a_execute_scoring(
     examples: List[Example],
-    scorers: List[
-    model: Optional[Union[str, List[str], JudgevalJudge]] =
-    ignore_errors: bool =
-    skip_on_missing_params: bool = True,
-    show_indicator: bool = True,
+    scorers: List[ExampleScorer],
+    model: Optional[Union[str, List[str], JudgevalJudge]] = JUDGMENT_DEFAULT_GPT_MODEL,
+    ignore_errors: bool = False,
     throttle_value: int = 0,
     max_concurrent: int = 100,
-
-    _use_bar_indicator: bool = True,
+    show_progress: bool = True,
 ) -> List[ScoringResult]:
     """
-    Executes evaluations of `Example`s asynchronously using one or more `
-    Each `Example` will be evaluated by all of the `
+    Executes evaluations of `Example`s asynchronously using one or more `ExampleScorer`s.
+    Each `Example` will be evaluated by all of the `ExampleScorer`s in the `scorers` list.

     Args:
         examples (List[Example]): A list of `Example` objects to be evaluated.
-        scorers (List[
+        scorers (List[ExampleScorer]): A list of `ExampleScorer` objects to evaluate the examples.
         model (Union[str, List[str], JudgevalJudge]): The model to use for evaluation.
         ignore_errors (bool): Whether to ignore errors during evaluation.
-        skip_on_missing_params (bool): Whether to skip evaluation if parameters are missing.
-        show_indicator (bool): Whether to show a progress indicator.
         throttle_value (int): The amount of time to wait between starting each task.
         max_concurrent (int): The maximum number of concurrent tasks.
-
-        _use_bar_indicator (bool): Whether to use a progress bar indicator.
+        show_progress (bool): Whether to show the progress bar indicator.

     Returns:
         List[ScoringResult]: A list of `ScoringResult` objects containing the evaluation results.
     """
+
     semaphore = asyncio.Semaphore(max_concurrent)

     async def execute_with_semaphore(func: Callable, *args, **kwargs):
-
-
+        async with semaphore:
+            try:
                 return await func(*args, **kwargs)
-
-
-
-
-
-                raise
-
-    if verbose_mode is not None:
-        for scorer in scorers:
-            scorer.verbose_mode = verbose_mode
+            except Exception as e:
+                judgeval_logger.error(f"Error executing function: {e}")
+                if kwargs.get("ignore_errors", False):
+                    return None
+                raise

-    # Add model to scorers
     for scorer in scorers:
-        scorer.
+        if not scorer.model and isinstance(model, str):
+            scorer._add_model(model)

-    scoring_results: List[ScoringResult] = [None for _ in examples]
+    scoring_results: List[Optional[ScoringResult]] = [None for _ in examples]
     tasks = []

-    if
+    if show_progress:
         with tqdm_asyncio(
             desc=f"Evaluating {len(examples)} example(s) in parallel",
             unit="Example",
@@ -303,24 +106,12 @@ async def a_execute_scoring(
             bar_format="{desc}: |{bar}|{percentage:3.0f}% ({n_fmt}/{total_fmt}) [Time Taken: {elapsed}, {rate_fmt}{postfix}]",
         ) as pbar:
             for i, ex in enumerate(examples):
-                with example_logging_context(ex.timestamp, ex.example_id):
-                    debug(f"Starting scoring for example {ex.example_id}")
-                    debug(f"Input: {ex.input}")
-                    debug(f"Using {len(scorers)} scorers")
-                    for scorer in scorers:
-                        debug(f"Using scorer: {type(scorer).__name__}")
-                        if hasattr(scorer, 'threshold'):
-                            debug(f"Scorer threshold: {scorer.threshold}")
-                        if hasattr(scorer, 'model'):
-                            debug(f"Scorer model: {type(scorer.model).__name__}")
                 if isinstance(ex, Example):
                     if len(scorers) == 0:
                         pbar.update(1)
                         continue
-
-                    cloned_scorers
-                        scorers
-                    )
+
+                    cloned_scorers = clone_scorers(scorers)  # type: ignore
                     task = execute_with_semaphore(
                         func=a_eval_examples_helper,
                         scorers=cloned_scorers,
@@ -328,9 +119,6 @@ async def a_execute_scoring(
                         scoring_results=scoring_results,
                         score_index=i,
                         ignore_errors=ignore_errors,
-                        skip_on_missing_params=skip_on_missing_params,
-                        show_indicator=show_indicator,
-                        _use_bar_indicator=_use_bar_indicator,
                         pbar=pbar,
                     )
                     tasks.append(asyncio.create_task(task))
@@ -343,9 +131,7 @@ async def a_execute_scoring(
                 if len(scorers) == 0:
                     continue

-                cloned_scorers
-                    scorers
-                )
+                cloned_scorers = clone_scorers(scorers)  # type: ignore
                 task = execute_with_semaphore(
                     func=a_eval_examples_helper,
                     scorers=cloned_scorers,
@@ -353,75 +139,60 @@
                     scoring_results=scoring_results,
                     score_index=i,
                     ignore_errors=ignore_errors,
-
-                    _use_bar_indicator=_use_bar_indicator,
-                    show_indicator=show_indicator,
+                    pbar=None,
                 )
-                tasks.append(asyncio.create_task(
+                tasks.append(asyncio.create_task(task))

                 await asyncio.sleep(throttle_value)
     await asyncio.gather(*tasks)
-    return scoring_results
+    return [result for result in scoring_results if result is not None]


 async def a_eval_examples_helper(
-    scorers: List[
+    scorers: List[ExampleScorer],
     example: Example,
-    scoring_results: List[ScoringResult],
+    scoring_results: List[Optional[ScoringResult]],
     score_index: int,
     ignore_errors: bool,
-    skip_on_missing_params: bool,
-    show_indicator: bool,
-    _use_bar_indicator: bool,
     pbar: Optional[tqdm_asyncio] = None,
-
+) -> None:
     """
     Evaluate a single example asynchronously using a list of scorers.
-
+
     Args:
-        scorers (List[
+        scorers (List[ExampleScorer]): List of ExampleScorer objects to evaluate the example.
         example (Example): The example to be evaluated.
         scoring_results (List[ScoringResult]): List to store the scoring results.
         score_index (int): Index at which the result should be stored in scoring_results.
         ignore_errors (bool): Flag to indicate whether to ignore errors during scoring.
-        skip_on_missing_params (bool): Flag to indicate whether to skip scoring if parameters are missing.
-        show_indicator (bool): Flag to indicate whether to show a progress indicator.
-        _use_bar_indicator (bool): Flag to indicate whether to use a bar indicator for progress.
         pbar (Optional[tqdm_asyncio]): Optional progress bar for tracking progress.
     Returns:
         None
     """
-    show_metrics_indicator = show_indicator and not _use_bar_indicator
-
-    for scorer in scorers:
-        scorer.skipped = False
-        scorer.error = None  # Reset scorer error

-    # scoring the Example
-    process_example = create_process_example(example)  # Creates process example to track progress
     scoring_start_time = time.perf_counter()
-    await score_with_indicator(
-        scorers=scorers,
-        example=example,
-        skip_on_missing_params=skip_on_missing_params,
-        ignore_errors=ignore_errors,
-        show_indicator=show_metrics_indicator,
-    )  # execute the scoring functions of each scorer on the example

-
-
+    tasks = [safe_a_score_example(scorer, example) for scorer in scorers]
+
+    await asyncio.gather(*tasks)
+
+    success = True
+    scorer_data_list = []
     for scorer in scorers:
-
-        if getattr(scorer, 'skipped', False):
+        if getattr(scorer, "skipped", False):
             continue
-        scorer_data = create_scorer_data(scorer)
-
-
-
-
-
-
-
+        scorer_data = create_scorer_data(scorer)
+        for s in scorer_data:
+            success = success and s.success
+        scorer_data_list.extend(scorer_data)
+
+    scoring_end_time = time.perf_counter()
+    run_duration = scoring_end_time - scoring_start_time
+
+    scoring_result = generate_scoring_result(
+        example, scorer_data_list, run_duration, success
+    )
+    scoring_results[score_index] = scoring_result

     if pbar is not None:
         pbar.update(1)