ragbits-evaluate 0.5.0__py3-none-any.whl → 1.4.0.dev202602030301__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ragbits/evaluate/agent_simulation/__init__.py +87 -0
- ragbits/evaluate/agent_simulation/context.py +118 -0
- ragbits/evaluate/agent_simulation/conversation.py +333 -0
- ragbits/evaluate/agent_simulation/deepeval_evaluator.py +92 -0
- ragbits/evaluate/agent_simulation/logger.py +165 -0
- ragbits/evaluate/agent_simulation/metrics/__init__.py +19 -0
- ragbits/evaluate/agent_simulation/metrics/builtin.py +221 -0
- ragbits/evaluate/agent_simulation/metrics/collectors.py +142 -0
- ragbits/evaluate/agent_simulation/models.py +37 -0
- ragbits/evaluate/agent_simulation/results.py +200 -0
- ragbits/evaluate/agent_simulation/scenarios.py +129 -0
- ragbits/evaluate/agent_simulation/simulation.py +243 -0
- ragbits/evaluate/cli.py +150 -0
- ragbits/evaluate/config.py +11 -0
- ragbits/evaluate/dataloaders/__init__.py +3 -0
- ragbits/evaluate/dataloaders/base.py +95 -0
- ragbits/evaluate/dataloaders/document_search.py +61 -0
- ragbits/evaluate/dataloaders/exceptions.py +25 -0
- ragbits/evaluate/dataloaders/gaia.py +78 -0
- ragbits/evaluate/dataloaders/hotpot_qa.py +95 -0
- ragbits/evaluate/dataloaders/human_eval.py +70 -0
- ragbits/evaluate/dataloaders/question_answer.py +56 -0
- ragbits/evaluate/dataset_generator/pipeline.py +4 -4
- ragbits/evaluate/dataset_generator/prompts/qa.py +2 -4
- ragbits/evaluate/dataset_generator/tasks/corpus_generation.py +2 -4
- ragbits/evaluate/dataset_generator/tasks/text_generation/base.py +3 -5
- ragbits/evaluate/dataset_generator/tasks/text_generation/qa.py +3 -3
- ragbits/evaluate/evaluator.py +178 -50
- ragbits/evaluate/factories/__init__.py +42 -0
- ragbits/evaluate/metrics/__init__.py +2 -23
- ragbits/evaluate/metrics/base.py +40 -17
- ragbits/evaluate/metrics/document_search.py +40 -23
- ragbits/evaluate/metrics/gaia.py +84 -0
- ragbits/evaluate/metrics/hotpot_qa.py +51 -0
- ragbits/evaluate/metrics/human_eval.py +105 -0
- ragbits/evaluate/metrics/question_answer.py +222 -0
- ragbits/evaluate/optimizer.py +138 -86
- ragbits/evaluate/pipelines/__init__.py +37 -0
- ragbits/evaluate/pipelines/base.py +34 -10
- ragbits/evaluate/pipelines/document_search.py +72 -67
- ragbits/evaluate/pipelines/gaia.py +249 -0
- ragbits/evaluate/pipelines/hotpot_qa.py +342 -0
- ragbits/evaluate/pipelines/human_eval.py +323 -0
- ragbits/evaluate/pipelines/question_answer.py +96 -0
- ragbits/evaluate/utils.py +86 -59
- {ragbits_evaluate-0.5.0.dist-info → ragbits_evaluate-1.4.0.dev202602030301.dist-info}/METADATA +33 -9
- ragbits_evaluate-1.4.0.dev202602030301.dist-info/RECORD +59 -0
- {ragbits_evaluate-0.5.0.dist-info → ragbits_evaluate-1.4.0.dev202602030301.dist-info}/WHEEL +1 -1
- ragbits/evaluate/callbacks/base.py +0 -22
- ragbits/evaluate/callbacks/neptune.py +0 -26
- ragbits/evaluate/loaders/__init__.py +0 -21
- ragbits/evaluate/loaders/base.py +0 -24
- ragbits/evaluate/loaders/hf.py +0 -25
- ragbits_evaluate-0.5.0.dist-info/RECORD +0 -33
- /ragbits/evaluate/{callbacks/__init__.py → py.typed} +0 -0
ragbits/evaluate/pipelines/human_eval.py
ADDED
@@ -0,0 +1,323 @@
+import asyncio
+import contextlib
+import io
+import json
+import logging
+import multiprocessing
+import textwrap
+import time
+from collections.abc import Callable, Coroutine, Iterable
+from dataclasses import dataclass
+from multiprocessing.connection import Connection
+from pathlib import Path
+from typing import Any
+
+from typing_extensions import Self
+
+from ragbits.agents import Agent
+from ragbits.core.llms.base import LLM, LLMClientOptionsT
+from ragbits.evaluate.pipelines.base import EvaluationData, EvaluationPipeline, EvaluationResult
+
+
+class HumanEvalData(EvaluationData):
+    """
+    Represents a single HumanEval task.
+    """
+
+    task_id: str
+    prompt: str
+    entry_point: str
+    test: str
+    canonical_solution: str | None = None
+
+
+@dataclass
+class HumanEvalResult(EvaluationResult):
+    """
+    Represents the result of evaluating a single HumanEval task.
+    """
+
+    task_id: str
+    entry_point: str
+    samples: list[str]
+    passed_mask: list[bool]
+    exec_durations_sec: list[float]
+    compile_ok_mask: list[bool]
+    errors: list[str | None]
+
+
+def _execute_in_subprocess(
+    source: str, entry_point: str, test_code: str, timeout_sec: int = 10, memory_limit_mb: int | None = 512
+) -> tuple[bool, float, str | None]:
+    """Run candidate against HumanEval test in a subprocess with timeout."""
+
+    def _runner(pipe: Connection) -> None:
+        captured_out = io.StringIO()
+        start = time.perf_counter()
+
+        try:
+            with contextlib.redirect_stdout(captured_out), contextlib.redirect_stderr(captured_out):
+                # Apply soft resource limits -> NOT A SANDBOX
+                with contextlib.suppress(Exception):
+                    import os  # type: ignore
+                    import resource  # type: ignore
+                    import tempfile  # type: ignore
+
+                    cpu_secs = max(1, timeout_sec)
+                    resource.setrlimit(resource.RLIMIT_CPU, (cpu_secs, cpu_secs))
+
+                    if memory_limit_mb is not None:
+                        mem_bytes = int(memory_limit_mb) * 1024 * 1024
+                        resource.setrlimit(resource.RLIMIT_AS, (mem_bytes, mem_bytes))
+
+                    # Minimal extra security
+                    for rlim, val in (
+                        (getattr(resource, "RLIMIT_NOFILE", None), 256),
+                        (getattr(resource, "RLIMIT_NPROC", None), 64),
+                        (getattr(resource, "RLIMIT_FSIZE", None), 10 * 1024 * 1024),
+                    ):
+                        if rlim is not None:
+                            with contextlib.suppress(Exception):
+                                resource.setrlimit(rlim, (val, val))
+
+                    # Temporary working directory for solution
+                    tmp = tempfile.TemporaryDirectory()
+                    with contextlib.suppress(Exception):
+                        os.chdir(tmp.name)
+
+                globals_dict: dict[str, Any] = {"__name__": "__main__"}
+                exec(compile(source, filename="candidate.py", mode="exec"), globals_dict)
+
+                if entry_point not in globals_dict:
+                    raise NameError(f"Entry point '{entry_point}' not defined")
+
+                harness = textwrap.dedent(f"candidate = {entry_point}\n").lstrip()
+                test_code_clean = textwrap.dedent(test_code).lstrip()
+                compiled_test = compile(
+                    harness + "\n" + test_code_clean + "\ncheck(candidate)", filename="test.py", mode="exec"
+                )
+                exec(compiled_test, globals_dict)
+
+            duration = time.perf_counter() - start
+            pipe.send((True, duration, None))
+
+        except Exception as e:
+            duration = time.perf_counter() - start
+            pipe.send((False, duration, f"{e.__class__.__name__}: {e}"))
+
+    parent_conn, child_conn = multiprocessing.Pipe()
+    proc = multiprocessing.Process(target=_runner, args=(child_conn,))
+    proc.start()
+    proc.join(timeout=timeout_sec)
+
+    if proc.is_alive():
+        proc.terminate()
+        proc.join()
+        return False, float(timeout_sec), "TimeoutError: execution exceeded time limit"
+
+    passed, duration, err = parent_conn.recv()
+    return bool(passed), float(duration), (str(err) if err is not None else None)
+
+
+class HumanEvalPipeline(
+    EvaluationPipeline[Agent[LLMClientOptionsT, None, str] | LLM[LLMClientOptionsT], HumanEvalData, HumanEvalResult]
+):
+    """HumanEval evaluation pipeline for code generation models/agents."""
+
+    def __init__(
+        self,
+        evaluation_target: Agent[LLMClientOptionsT, None, str] | LLM[LLMClientOptionsT],
+        *,
+        n_samples: int = 1,
+        timeout_sec: int = 10,
+        memory_limit_mb: int | None = 512,
+        per_example_log_file: Path | None = None,
+        extended_logs: bool = False,
+        code_sanitize_fn: Callable[[str], str] | None = None,
+    ) -> None:
+        super().__init__(evaluation_target=evaluation_target)
+        self.n_samples = n_samples
+        self.timeout_sec = timeout_sec
+        self.memory_limit_mb = memory_limit_mb
+        self.per_example_log_file = per_example_log_file
+        self.extended_logs = extended_logs
+        self.code_sanitize_fn = code_sanitize_fn
+        self._init_log_file()
+
+    @classmethod
+    def from_config(cls, config: dict) -> Self:
+        """Create pipeline from config.
+        Attempts Agent first, falls back to raw LLM construction.
+        """
+        if "evaluation_target" not in config:
+            try:
+                config["evaluation_target"] = Agent.from_config(config)
+            except Exception:
+                config["evaluation_target"] = LLM.from_config(config)
+        return super().from_config(config)
+
+    def _process_generation(
+        self, raw: BaseException | tuple[str, dict | None] | str, debug_traces: list[dict | None] | None
+    ) -> tuple[str, dict | None]:
+        """Process a single generation result."""
+        if isinstance(raw, BaseException):
+            err_msg = f"GenerationError: {raw.__class__.__name__}: {raw}"
+            if self.extended_logs and debug_traces is not None:
+                debug_traces.append({"error": err_msg})
+            raise raw
+
+        if self.extended_logs and isinstance(raw, tuple):
+            content, dbg = raw
+            code = self._sanitize(content)
+            if debug_traces is not None:
+                debug_traces.append(dbg)
+            return code, dbg
+
+        if isinstance(raw, str):
+            code = self._sanitize(raw)
+            return code, None
+
+        raise TypeError(f"Unexpected type for raw: {type(raw)}")
+
+    def _evaluate_code_sample(self, code: str, row: HumanEvalData) -> tuple[bool, bool, float, str | None]:
+        """Evaluate a single code sample."""
+        # Compile check
+        try:
+            compile(code, filename="candidate.py", mode="exec")
+            compile_ok = True
+        except Exception as e:
+            return False, False, 0.0, f"SyntaxError: {e}"
+
+        ok, dur, err = _execute_in_subprocess(
+            code,
+            row.entry_point,
+            row.test,
+            timeout_sec=self.timeout_sec,
+            memory_limit_mb=self.memory_limit_mb,
+        )
+        return compile_ok, ok, dur, err
+
+    async def __call__(self, data: Iterable[HumanEvalData]) -> Iterable[HumanEvalResult]:
+        """Generate code completions per task and evaluate them.
+        Returns list of `HumanEvalResult`, one per input task.
+        """
+        results: list[HumanEvalResult] = []
+
+        for row in data:
+            prompt_input = row.prompt
+            samples: list[str] = []
+            compile_ok: list[bool] = []
+            pass_mask: list[bool] = []
+            durations: list[float] = []
+            errors: list[str | None] = []
+
+            # Produce n samples
+            gen_tasks: list[Coroutine[Any, Any, tuple[str, dict | None] | str]] = []
+            for _ in range(self.n_samples):
+                if self.extended_logs:
+                    gen_tasks.append(self._generate_with_debug(prompt_input))
+                else:
+                    gen_tasks.append(self._generate_code(prompt_input))
+            generations = await asyncio.gather(*gen_tasks, return_exceptions=True)
+
+            debug_traces: list[dict | None] | None = [] if self.extended_logs else None
+
+            for raw in generations:
+                try:
+                    code, _ = self._process_generation(raw, debug_traces)
+                    samples.append(code)
+                except BaseException as e:
+                    samples.append("")
+                    compile_ok.append(False)
+                    pass_mask.append(False)
+                    durations.append(0.0)
+                    err_msg = f"GenerationError: {e.__class__.__name__}: {e}"
+                    errors.append(err_msg)
+                    continue
+
+                compile_result, passed, duration, error = self._evaluate_code_sample(code, row)
+                compile_ok.append(compile_result)
+                pass_mask.append(passed)
+                durations.append(duration)
+                errors.append(error)
+
+            result = HumanEvalResult(
+                task_id=row.task_id,
+                entry_point=row.entry_point,
+                samples=samples,
+                passed_mask=pass_mask,
+                exec_durations_sec=durations,
+                compile_ok_mask=compile_ok,
+                errors=errors,
+            )
+            results.append(result)
+            ext_log_str = (
+                json.dumps(debug_traces, ensure_ascii=False, default=str)
+                if (self.extended_logs and debug_traces is not None)
+                else None
+            )
+            self._log_example(row, result, ext_log_str)
+        return results
+
+    def _sanitize(self, text: str) -> str:
+        """Optionally sanitize code from text using the provided function.
+        If no parser is provided, returns the original text.
+        """
+        if self.code_sanitize_fn is None:
+            return text
+        try:
+            return self.code_sanitize_fn(text)
+        except Exception as exc:
+            logging.getLogger(__name__).debug("Code sanitize error: %s", exc)
+            return text
+
+    def _init_log_file(self) -> None:
+        """Ensure the per-example log file exists if logging is enabled."""
+        if self.per_example_log_file is None:
+            return
+        self.per_example_log_file.parent.mkdir(parents=True, exist_ok=True)
+        with open(self.per_example_log_file, "w", encoding="utf-8") as _:
+            pass
+
+    def _log_example(self, row: HumanEvalData, result: HumanEvalResult, extended_log: str | None = None) -> None:
+        """Append a single NDJSON record for debugging if enabled."""
+        if self.per_example_log_file is None:
+            return
+        record: dict[str, object] = {
+            "task_id": row.task_id,
+            "entry_point": row.entry_point,
+            "n_samples": len(result.samples),
+            "samples": result.samples,
+            "compile_ok_mask": result.compile_ok_mask,
+            "passed_mask": result.passed_mask,
+            "exec_durations_sec": result.exec_durations_sec,
+            "errors": result.errors,
+        }
+        record["extended_debug_logging"] = extended_log or "[]"
+        with open(self.per_example_log_file, "a", encoding="utf-8") as f:
+            f.write(json.dumps(record, ensure_ascii=False) + "\n")
+
+    async def _generate_code(self, prompt: str) -> str:
+        """Generate final answer code from Agent or raw LLM."""
+        target = self.evaluation_target
+        if isinstance(target, Agent):
+            res = await target.run(prompt)
+            return str(res.content)
+
+        resp = await target.generate(prompt)
+        return str(resp)
+
+    async def _generate_with_debug(self, prompt: str) -> tuple[str, dict | None]:
+        """Generate code and capture tool/history/usage for logging (as raw content)."""
+        target = self.evaluation_target
+        if isinstance(target, Agent):
+            res = await target.run(prompt)
+            dbg = {
+                "history": res.history,
+                "tool_calls": res.tool_calls,
+                "usage": res.usage,
+                "metadata": res.metadata,
+            }
+            return str(res.content), dbg
+        resp = await target.generate(prompt)
+        return str(resp), None
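
For orientation, here is a minimal usage sketch of the `HumanEvalPipeline` added above. It is not part of the diff: the `LiteLLM` import path, constructor, and model name are assumed placeholders from ragbits-core, and the task is hand-written rather than loaded through the new HumanEval dataloader.

```python
# Hypothetical sketch: running the new HumanEvalPipeline against a single task.
# The LiteLLM wrapper and model name are assumptions; any Agent or LLM instance works.
import asyncio

from ragbits.core.llms.litellm import LiteLLM  # assumed ragbits-core import path
from ragbits.evaluate.pipelines.human_eval import HumanEvalData, HumanEvalPipeline


async def main() -> None:
    pipeline = HumanEvalPipeline(
        evaluation_target=LiteLLM(model_name="gpt-4o-mini"),  # placeholder model
        n_samples=2,     # two completions per task (pass@k style)
        timeout_sec=10,  # per-sample subprocess time limit
    )
    task = HumanEvalData(
        task_id="Demo/0",
        prompt='def add(a: int, b: int) -> int:\n    """Return the sum of a and b."""\n',
        entry_point="add",
        test="def check(candidate):\n    assert candidate(1, 2) == 3\n",
    )
    for result in await pipeline([task]):
        print(result.task_id, result.passed_mask, result.errors)


if __name__ == "__main__":  # the pipeline spawns subprocesses, so keep a main guard
    asyncio.run(main())
```

In practice a `code_sanitize_fn` that strips markdown fences from model output is usually needed before the compile check, since raw completions are compiled as-is.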
ragbits/evaluate/pipelines/question_answer.py
ADDED
@@ -0,0 +1,96 @@
+import asyncio
+from collections.abc import Iterable
+from dataclasses import dataclass
+from typing import Any, Generic
+
+from typing_extensions import Self
+
+from ragbits.agents._main import AgentResult
+from ragbits.agents.types import (
+    QuestionAnswerAgent,
+    QuestionAnswerPromptInput,
+    QuestionAnswerPromptOutputT,
+)
+from ragbits.core.llms.base import LLMClientOptionsT
+from ragbits.evaluate.pipelines.base import EvaluationData, EvaluationPipeline, EvaluationResult
+
+
+class QuestionAnswerData(EvaluationData):
+    """
+    Represents the evaluation data for question answer.
+    """
+
+    question: str
+    reference_answer: str
+    reference_context: Any | None = None
+
+
+@dataclass
+class QuestionAnswerResult(EvaluationResult, Generic[QuestionAnswerPromptOutputT]):
+    """
+    Represents the result of a single evaluation.
+    """
+
+    question: str
+    predicted_result: AgentResult[QuestionAnswerPromptOutputT]
+    reference_answer: str
+    reference_context: Any | None = None
+
+
+class QuestionAnswerPipeline(
+    EvaluationPipeline[
+        QuestionAnswerAgent[LLMClientOptionsT, QuestionAnswerPromptInput, QuestionAnswerPromptOutputT],
+        QuestionAnswerData,
+        QuestionAnswerResult,
+    ]
+):
+    """
+    Question answer evaluation pipeline.
+    """
+
+    @classmethod
+    def from_config(cls, config: dict) -> Self:
+        """
+        Create an instance of `QuestionAnswerPipeline` from a configuration dictionary.
+
+        Args:
+            config: A dictionary containing configuration settings for the pipeline.
+
+        Returns:
+            An instance of the pipeline class initialized with the provided configuration.
+        """
+        config["evaluation_target"] = QuestionAnswerAgent.from_config(config)
+        return super().from_config(config)
+
+    async def __call__(
+        self, data: Iterable[QuestionAnswerData]
+    ) -> Iterable[QuestionAnswerResult[QuestionAnswerPromptOutputT]]:
+        """
+        Run the question answer evaluation pipeline.
+
+        Args:
+            data: The evaluation data batch.
+
+        Returns:
+            The evaluation result batch.
+        """
+        results = await asyncio.gather(
+            *[
+                self.evaluation_target.run(
+                    QuestionAnswerPromptInput(
+                        question=row.question,
+                        context=row.reference_context,
+                    )
+                )
+                for row in data
+            ]
+        )
+        return [
+            QuestionAnswerResult(
+                question=row.question,
+                predicted_result=result,
+                reference_answer=row.reference_answer,
+                reference_context=row.reference_context,
+            )
+            for row, result in zip(data, results, strict=False)
+        ]
ragbits/evaluate/utils.py
CHANGED
@@ -1,5 +1,7 @@
 import json
 import sys
+import traceback
+from dataclasses import asdict
 from datetime import datetime
 from pathlib import Path
 from typing import Any
@@ -7,34 +9,20 @@ from typing import Any
 from datasets import Dataset
 from hydra.core.hydra_config import HydraConfig
 from neptune import Run
+from neptune.types import File
 from neptune.utils import stringify_unsupported
-from
+from neptune_optuna import NeptuneCallback
+from omegaconf import DictConfig

+from ragbits.evaluate.evaluator import EvaluatorResult

-def _save(file_path: Path, **data: Any) -> None:  # noqa: ANN401
-    """
-    Save the data to a file. Add the current timestamp and Python version to the data.
-
-    Args:
-        file_path: The path to the file.
-        data: The data to be saved.
-    """
-    current_time = datetime.now()
-
-    data["_timestamp"] = current_time.isoformat()
-    data["_python_version"] = sys.version
-    data["_interpreter_path"] = sys.executable

-
-        json.dump(data, file, indent=4)
-
-
-def log_to_file(results: dict[str, Any], output_dir: Path | None = None) -> Path:
+def log_evaluation_to_file(result: EvaluatorResult, output_dir: Path | None = None) -> Path:
     """
-    Log the evaluation
+    Log the evaluation result locally.

     Args:
-
+        result: The evaluation result.
         output_dir: The output directory.

     Returns:
@@ -43,13 +31,59 @@ def log_to_file(results: dict[str, Any], output_dir: Path | None = None) -> Path
     output_dir = output_dir or Path(HydraConfig.get().runtime.output_dir)
     metrics_file = output_dir / "metrics.json"
     results_file = output_dir / "results.json"
-
-
-
+    errors_file = output_dir / "errors.json"
+
+    _save_json(metrics_file, metrics=result.metrics, time_perf=asdict(result.time_perf))
+    _save_json(results_file, results=[asdict(entry) for entry in result.results])
+    _save_json(
+        errors_file,
+        errors=[
+            {
+                "type": exc.__class__.__name__,
+                "message": str(exc),
+                "stacktrace": "".join(traceback.format_exception(type(exc), exc, exc.__traceback__)),
+            }
+            for exc in result.errors
+        ],
+    )

     return output_dir


+def log_evaluation_to_neptune(result: EvaluatorResult, config: DictConfig, tags: str | list[str] | None = None) -> None:
+    """
+    Log the evaluation result to Neptune.
+
+    Args:
+        result: The evaluation result.
+        config: The evaluation configuration.
+        tags: The experiment tags.
+    """
+    run = Run(tags=tags)
+    run["config"] = stringify_unsupported(config)
+    run["evaluation/metrics"] = stringify_unsupported(result.metrics)
+    run["evaluation/time_perf"] = stringify_unsupported(asdict(result.time_perf))
+    run["evaluation/results"].upload(
+        File.from_content(json.dumps([asdict(entry) for entry in result.results], indent=4), extension="json")
+    )
+    run["evaluation/errors"].upload(
+        File.from_content(
+            json.dumps(
+                [
+                    {
+                        "type": exc.__class__.__name__,
+                        "message": str(exc),
+                        "stacktrace": "".join(traceback.format_exception(type(exc), exc, exc.__traceback__)),
+                    }
+                    for exc in result.errors
+                ],
+                indent=4,
+            ),
+            extension="json",
+        )
+    )
+
+
 def log_dataset_to_file(dataset: Dataset, output_dir: Path | None = None) -> Path:
     """
     Log the evaluation results locally.
@@ -68,7 +102,7 @@ def log_dataset_to_file(dataset: Dataset, output_dir: Path | None = None) -> Path


 def log_optimization_to_file(
-    results: list[tuple[
+    results: list[tuple[dict, float, dict[str, float]]], output_dir: Path | None = None
 ) -> Path:
     """
     Log the evaluation results locally.
@@ -81,53 +115,46 @@ def log_optimization_to_file(
         The output directory.
     """
     output_dir = output_dir or Path(HydraConfig.get().runtime.output_dir)
+
     scores = {}
-    for
-        trial_name = f"
-        OmegaConf.save(cfg, output_dir / f"{trial_name}.yaml")
+    for i, (config, score, all_metrics) in enumerate(results):
+        trial_name = f"trial-{i}"
         scores[trial_name] = {"score": score, "all_metrics": all_metrics}
+        trial_config_file = output_dir / f"{trial_name}.json"
+        _save_json(trial_config_file, config=config)
+
     scores_file = output_dir / "scores.json"
-
+    _save_json(scores_file, scores=scores)
+
     return output_dir


-def
+def _save_json(file_path: Path, **data: Any) -> None:  # noqa: ANN401
     """
-
+    Save the data to a file. Add the current timestamp and Python version to the data.

     Args:
-
-
-    Returns:
-        The Neptune run.
+        file_path: The path to the file.
+        data: The data to be saved.
     """
-
-
-
-
-
-
-
-
-
-
-    return run
-    return None
+    current_time = datetime.now()
+
+    data["_timestamp"] = current_time.isoformat()
+    data["_python_version"] = sys.version
+    data["_interpreter_path"] = sys.executable
+
+    file_path.parent.mkdir(parents=True, exist_ok=True)
+
+    with open(file_path, "w", encoding="utf-8") as file:
+        json.dump(data, file, indent=4)


-def
+def setup_optuna_neptune_callback(tags: str | list[str] | None = None) -> NeptuneCallback:
     """
-    Log the
+    Log the optimization process to Neptune.

     Args:
-
-        results: The evaluation results.
-        output_dir: The output directory.
+        tags: Experiment tags.
     """
-
-
-    run["evaluation/metrics"] = stringify_unsupported(results["metrics"])
-    run["evaluation/time_perf"] = stringify_unsupported(results["time_perf"])
-    run["evaluation/results"] = stringify_unsupported(results["results"])
-    run["evaluation/metrics.json"].upload((output_dir / "metrics.json").as_posix())
-    run["evaluation/results.json"].upload((output_dir / "results.json").as_posix())
+    run = Run(tags=tags)
+    return NeptuneCallback(run)
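
The reworked logging helpers are driven by an `EvaluatorResult`, but the one function whose inputs are fully visible in this hunk is `log_optimization_to_file`; below is a small sketch with made-up trial data (the directory name is arbitrary).

```python
# Sketch of the new log_optimization_to_file signature; the trial tuples
# (config dict, score, per-metric scores) are invented for illustration.
from pathlib import Path

from ragbits.evaluate.utils import log_optimization_to_file

trials = [
    ({"retriever": {"k": 3}}, 0.71, {"recall": 0.71, "precision": 0.64}),
    ({"retriever": {"k": 5}}, 0.78, {"recall": 0.78, "precision": 0.61}),
]

# Writes trial-0.json, trial-1.json and scores.json into the given directory;
# each file is stamped with a timestamp and Python version by _save_json.
output_dir = log_optimization_to_file(trials, output_dir=Path("./optimization-logs"))
print(output_dir)
```

Passing `output_dir` explicitly avoids the Hydra-managed default, which only resolves inside a Hydra run.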
{ragbits_evaluate-0.5.0.dist-info → ragbits_evaluate-1.4.0.dev202602030301.dist-info}/METADATA
RENAMED
@@ -1,13 +1,13 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: ragbits-evaluate
-Version:
+Version: 1.4.0.dev202602030301
 Summary: Evaluation module for Ragbits components
 Project-URL: Homepage, https://github.com/deepsense-ai/ragbits
 Project-URL: Bug Reports, https://github.com/deepsense-ai/ragbits/issues
 Project-URL: Documentation, https://ragbits.deepsense.ai/
 Project-URL: Source, https://github.com/deepsense-ai/ragbits
 Author-email: "deepsense.ai" <ragbits@deepsense.ai>
-License: MIT
+License-Expression: MIT
 Keywords: Evaluation,GenAI,Generative AI,LLMs,Large Language Models,RAG,Retrieval Augmented Generation
 Classifier: Development Status :: 4 - Beta
 Classifier: Environment :: Console
@@ -22,13 +22,37 @@ Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.10
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
+Requires-Dist: datasets<4.0.0,>=3.0.1
+Requires-Dist: deepeval<3.0.0,>=2.0.0
+Requires-Dist: distilabel<2.0.0,>=1.5.0
+Requires-Dist: hydra-core<2.0.0,>=1.3.2
+Requires-Dist: neptune[optuna]<2.0.0,>=1.12.0
+Requires-Dist: optuna<5.0.0,>=4.0.0
+Requires-Dist: ragbits-core==1.4.0.dev202602030301
 Provides-Extra: relari
-Requires-Dist: continuous-eval
+Requires-Dist: continuous-eval<1.0.0,>=0.3.12; extra == 'relari'
 Description-Content-Type: text/markdown

 # Ragbits Evaluate
+
+Ragbits Evaluate is a package that contains tools for evaluating the performance of AI pipelines defined with Ragbits components. It also helps with automatically finding the best hyperparameter configurations for them.
+
+## Installation
+
+To install the Ragbits Evaluate package, run:
+
+```sh
+pip install ragbits-evaluate
+```
+
+<!--
+TODO: Add a minimalistic example inspired by the Quickstart chapter on Ragbits Evaluate once it is ready.
+-->
+
+## Documentation
+<!--
+TODO:
+* Add link to the Quickstart chapter on Ragbits Evaluate once it is ready.
+* Add link to API Reference once classes from the Evaluate package are added to the API Reference.
+-->
+* [How-To Guides - Evaluate](https://ragbits.deepsense.ai/how-to/evaluate/optimize/)