rakam-eval-sdk 0.2.3__py3-none-any.whl → 0.2.4rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rakam_eval_sdk/cli.py +182 -41
- rakam_eval_sdk/client.py +104 -52
- rakam_eval_sdk/schema.py +22 -1
- {rakam_eval_sdk-0.2.3.dist-info → rakam_eval_sdk-0.2.4rc1.dist-info}/METADATA +1 -1
- rakam_eval_sdk-0.2.4rc1.dist-info/RECORD +10 -0
- rakam_eval_sdk-0.2.3.dist-info/RECORD +0 -10
- {rakam_eval_sdk-0.2.3.dist-info → rakam_eval_sdk-0.2.4rc1.dist-info}/WHEEL +0 -0
- {rakam_eval_sdk-0.2.3.dist-info → rakam_eval_sdk-0.2.4rc1.dist-info}/entry_points.txt +0 -0
rakam_eval_sdk/cli.py
CHANGED
@@ -6,16 +6,18 @@ import uuid
 from datetime import datetime
 from pathlib import Path
 from pprint import pprint
-from typing import Any, Dict, Optional,
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import typer
 from dotenv import load_dotenv
 from rich.console import Console
 from rich.panel import Panel
 from rich.pretty import Pretty
+from typer import secho
 
 from rakam_eval_sdk.client import DeepEvalClient
 from rakam_eval_sdk.decorators import eval_run
+from rakam_eval_sdk.schema import MetricDiff, TestCaseComparison
 from rakam_eval_sdk.utils.decorator_utils import (
     find_decorated_functions,
     load_module_from_path,
@@ -35,14 +37,14 @@ metrics_app = typer.Typer(help="Metrics utilities")
 app.add_typer(metrics_app, name="metrics")
 
 
-def extract_metric_names(config: Any) -> list[tuple[str, str]]:
+def extract_metric_names(config: Any) -> list[tuple[str, Optional[str]]]:
     """
     Returns [(type, name)] from EvalConfig / SchemaEvalConfig
     """
     if not hasattr(config, "metrics"):
         return []
 
-    results: list[tuple[str, str]] = []
+    results: list[tuple[str, Optional[str]]] = []
 
     for metric in config.metrics or []:
         metric_type = getattr(metric, "type", None)
@@ -75,7 +77,7 @@ def metrics(
     files = directory.rglob("*.py") if recursive else directory.glob("*.py")
     TARGET_DECORATOR = eval_run.__name__
 
-    all_metrics: set[tuple[str, str]] = set()
+    all_metrics: set[tuple[str, Optional[str]]] = set()
     found_any = False
 
     for file in sorted(files):
@@ -157,25 +159,28 @@ def list_evals(
 def list_runs(
     limit: int = typer.Option(20, help="Max number of runs"),
     offset: int = typer.Option(0, help="Pagination offset"),
-):
+) -> None:
     """
     List evaluation runs (newest first).
     """
     client = DeepEvalClient()
 
-    runs = client.list_evaluation_testcases(
+    response = client.list_evaluation_testcases(
         limit=limit,
         offset=offset,
         raise_exception=True,
     )
+    assert response is not None
+    items = response.get("items", [])
+    total = response.get("total", 0)
 
-    if not runs:
+    if not items:
         typer.echo("No evaluation runs found.")
         return
 
-    typer.echo(f"[id]
-
-    for run in runs:
+    typer.echo(f"[id] {'tag':<20}{'label':<20}created_at")
+
+    for run in items:
         run_id = run.get("id")
         label = run.get("label") or "-"
         uid = run.get("tag") or "-"
@@ -189,8 +194,12 @@ def list_runs(
         except ValueError:
             pass
 
-        typer.echo(
-
+        typer.echo(f"[{run_id}] {uid:<20} {label:<20} {created_at}")
+
+    shown = offset + len(items)
+    if shown < total:
+        typer.echo()
+        typer.echo(f"Showing {shown} of {total} runs. Use --limit to see more.")
 
 
 @app.command()
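A note on the new pagination footer in list_runs: the command now reads items and total from the response and prints the hint only when the current page is not the last one. The arithmetic, with made-up numbers (a server holding 45 runs, queried with limit=20 and offset=20):

items = ["run"] * 20  # the 20 runs returned for this page (illustrative)
total = 45
offset = 20

shown = offset + len(items)  # 40
if shown < total:
    print(f"Showing {shown} of {total} runs. Use --limit to see more.")
# -> Showing 40 of 45 runs. Use --limit to see more.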
@@ -210,15 +219,15 @@ def show(
         "--raw",
         help="Print raw JSON instead of formatted output",
     ),
-):
+) -> None:
     """
     Show a single evaluation testcase by ID or tag.
     """
     if not id and not tag:
-        raise typer.BadParameter("You must provide either --id or --
+        raise typer.BadParameter("You must provide either --id or --tag")
 
     if id and tag:
-        raise typer.BadParameter("Provide only one of --id or --
+        raise typer.BadParameter("Provide only one of --id or --tag")
 
     client = DeepEvalClient()
 
@@ -226,6 +235,7 @@ def show(
         result = client.get_evaluation_testcase_by_id(id)
         identifier = f"id={id}"
     else:
+        assert tag is not None
         result = client.get_evaluation_testcase_by_tag(tag)
         identifier = f"tag={tag}"
 
@@ -397,7 +407,7 @@ def run(
 def _print_and_save(
     resp: dict,
     pretty: bool,
-    out: Path,
+    out: Optional[Path],
     overwrite: bool,
 ) -> None:
     if pretty:
@@ -410,8 +420,7 @@ def _print_and_save(
         return
 
     if out.exists() and not overwrite:
-        typer.echo(
-            f"❌ File already exists: {out} (use --overwrite to replace)")
+        typer.echo(f"❌ File already exists: {out} (use --overwrite to replace)")
         raise typer.Exit(code=1)
 
     out.parent.mkdir(parents=True, exist_ok=True)
@@ -422,12 +431,109 @@ def _print_and_save(
     typer.echo(f"💾 Result saved to {out}")
 
 
+def pct_change(a: Optional[float], b: Optional[float]) -> Optional[str]:
+    if a is None or b is None or a == 0:
+        return None
+    return f"{((b - a) / a) * 100:+.2f}%"
+
+
+def metric_direction(delta: Optional[float]) -> str:
+    if delta is None:
+        return "unchanged"
+    if delta > 0:
+        return "improved"
+    if delta < 0:
+        return "regressed"
+    return "unchanged"
+
+
+def print_metric_diff(diff: MetricDiff) -> None:
+    secho(f"\nMetric: {diff.metric}", bold=True)
+
+    if diff.status == "added":
+        secho(f"+ score: {diff.score_b}", fg="green")
+        secho(f"+ threshold: {diff.threshold_b}", fg="green")
+        secho(f"+ success: {diff.success_b}", fg="green")
+        return
+
+    if diff.status == "removed":
+        secho(f"- score: {diff.score_a}", fg="red")
+        secho(f"- threshold: {diff.threshold_a}", fg="red")
+        secho(f"- success: {diff.success_a}", fg="red")
+        return
+
+    # unchanged / changed
+    if diff.score_a != diff.score_b:
+        direction = metric_direction(diff.delta)
+        color = "green" if direction == "improved" else "red"
+        pct = pct_change(diff.score_a, diff.score_b)
+
+        secho(f"- score: {diff.score_a}", fg="red")
+        secho(
+            f"+ score: {diff.score_b}" + (f" ({pct})" if pct else ""),
+            fg=color,
+        )
+    else:
+        secho(f" score: {diff.score_a}", dim=True)
+
+    if diff.threshold_a != diff.threshold_b:
+        secho(f"- threshold: {diff.threshold_a}", fg="red")
+        secho(f"+ threshold: {diff.threshold_b}", fg="green")
+    else:
+        secho(f" threshold: {diff.threshold_a}", dim=True)
+
+    if diff.success_a != diff.success_b:
+        secho(f"- success: {diff.success_a}", fg="red")
+        secho(f"+ success: {diff.success_b}", fg="green")
+    else:
+        secho(f" success: {diff.success_a}", dim=True)
+
+
+def summarize(metrics: Any) -> Dict[str, int]:
+    summary = {
+        "improved": 0,
+        "regressed": 0,
+        "unchanged": 0,
+        "added": 0,
+        "removed": 0,
+    }
+
+    for m in metrics:
+        if m.status in ("added", "removed"):
+            summary[m.status] += 1
+        else:
+            direction = metric_direction(m.delta)
+            summary[direction] += 1
+
+    return summary
+
+
+def pretty_print_comparison(resp: Any, summary_only: bool = False) -> None:
+    if not summary_only:
+        for metric in resp.metrics:
+            print_metric_diff(metric)
+        return
+
+    print_summary(resp.metrics)
+
+
+def print_summary(metrics: Any) -> None:
+    summary = summarize(metrics)
+
+    secho("\nSummary:", bold=True)
+    secho(f" ↑ Improved: {summary['improved']}", fg="green")
+    secho(f" ↓ Regressed: {summary['regressed']}", fg="red")
+    secho(f" ± Unchanged: {summary['unchanged']}", dim=True)
+    secho(f" + Added: {summary['added']}", fg="green")
+    secho(f" - Removed: {summary['removed']}", fg="red")
+
+
 @app.command()
 def compare(
     tag: list[str] = typer.Option(
         [],
         "--tag",
-        help="
+        help="Label identifying a reference testcase",
     ),
     run: list[int] = typer.Option(
         [],
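The diff helpers added above are pure functions, which makes their conventions easy to pin down: pct_change returns a signed percentage string and refuses to divide by a zero or missing baseline, and metric_direction treats a positive delta as an improvement, i.e. higher scores are assumed better. A quick check with invented scores:

from rakam_eval_sdk.cli import metric_direction, pct_change

print(pct_change(0.80, 0.92))   # +15.00%
print(pct_change(0.0, 0.5))     # None (zero baseline, avoids division by zero)
print(pct_change(None, 0.5))    # None (missing score)
print(metric_direction(0.12))   # improved
print(metric_direction(-0.07))  # regressed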
@@ -437,14 +543,19 @@ def compare(
     pretty: bool = typer.Option(
         True,
         "--pretty/--raw",
-        help="Pretty
+        help="Pretty diff output (default) or raw JSON",
+    ),
+    summary: bool = typer.Option(
+        False,
+        "--summary",
+        help="Show summary only (no per-metric diff)",
     ),
     raise_exception: bool = typer.Option(
         False,
         "--raise",
         help="Raise HTTP exceptions instead of swallowing them",
     ),
-    out: Path = typer.Option(
+    out: Optional[Path] = typer.Option(
         None,
         "-o",
         "--out",
@@ -457,32 +568,40 @@ def compare(
     ),
 ) -> None:
     """
-    Compare two evaluation testcases
+    Compare two evaluation testcases (runs and/or labels).
+
+    Output:
+    - Unified diff-style view by default
+    - Raw JSON with --raw
+    - Summary of improvements / regressions
     """
 
-    targets = []
+    targets: List[Tuple[str, Union[str, int]]] = []
 
     for r in run:
         targets.append(("run", r))
 
-    for
-        targets.append(("label",
+    for t in tag:
+        targets.append(("label", t))
 
     if len(targets) != 2:
-        typer.echo(
-            "❌ Provide exactly two targets using --run and/or --
+        typer.secho(
+            "❌ Provide exactly two targets using --run and/or --tag",
+            fg="red",
+            bold=True,
         )
         raise typer.Exit(code=1)
 
-    client = DeepEvalClient()
-
     (type_a, value_a), (type_b, value_b) = targets
 
-    typer.echo(
-        f"🔍 Comparing {type_a} '{value_a}' ↔ {type_b} '{value_b}'"
+    typer.secho(
+        f"🔍 Comparing {type_a} '{value_a}' ↔ {type_b} '{value_b}'",
+        bold=True,
     )
 
-    kwargs = {"raise_exception": raise_exception}
+    client = DeepEvalClient()
+
+    kwargs: Dict[str, Any] = {"raise_exception": raise_exception}
 
     if type_a == "run":
         kwargs["testcase_a_id"] = value_a
@@ -497,14 +616,34 @@ def compare(
     try:
         resp = client.compare_testcases(**kwargs)
     except Exception as e:
-        typer.
+        typer.secho(f"❌ Request failed: {e}", fg="red")
         raise typer.Exit(code=1)
 
     if not resp:
-        typer.
+        typer.secho("⚠️ No response received", fg="yellow")
         raise typer.Exit(code=1)
 
-
+    if out:
+        if out.exists() and not overwrite:
+            typer.secho(
+                f"❌ File already exists: {out} (use --overwrite)",
+                fg="red",
+            )
+            raise typer.Exit(code=1)
+
+        out.write_text(json.dumps(resp, indent=2))
+        typer.secho(f"💾 Saved raw output to {out}", fg="green")
+
+    if not pretty:
+        typer.echo(json.dumps(resp, indent=2))
+        return
+
+    comparison = TestCaseComparison(**resp)
+
+    pretty_print_comparison(
+        comparison,
+        summary_only=summary,
+    )
 
 
 @app.command(hidden=True)
@@ -527,7 +666,7 @@ def compare_label_latest(
         "--raise",
         help="Raise HTTP exceptions instead of swallowing them",
     ),
-    out: Path = typer.Option(
+    out: Optional[Path] = typer.Option(
         None,
         "-o",
         "--out",
@@ -579,7 +718,7 @@ def compare_last(
         "--raise",
         help="Raise HTTP exceptions instead of swallowing them",
     ),
-    out: Path = typer.Option(
+    out: Optional[Path] = typer.Option(
         None,
         "-o",
         "--out",
@@ -631,7 +770,7 @@ def tag_command(
         "--delete",
         help="Delete a tag",
     ),
-):
+) -> None:
     """
     Assign a tag to a run or delete a tag.
     """
@@ -648,8 +787,9 @@ def tag_command(
 
     client = DeepEvalClient()
 
-    # --- delete mode ---
     if delete:
+        assert run_id is not None
+
         result = client.update_evaluation_testcase_tag(
             testcase_id=run_id,
             tag=delete,
@@ -658,13 +798,14 @@ def tag_command(
         typer.echo("🗑️ Tag deleted successfully")
         typer.echo(f"Tag: {delete}")
         return
-
-
+    assert run_id is not None
+    assert tag is not None
     result = client.update_evaluation_testcase_tag(
         testcase_id=run_id,
         tag=tag,
         raise_exception=True,
     )
+    assert result is not None
 
     typer.echo("✅ Tag assigned successfully")
     typer.echo(f"Run ID: {run_id}")
rakam_eval_sdk/client.py
CHANGED
@@ -1,6 +1,6 @@
 import os
 import random
-from typing import Any, Dict, List, Literal, Optional, Union, cast, overload
+from typing import Any, Dict, List, Literal, Optional, Tuple, Union, cast, overload
 
 import requests
 
@@ -40,8 +40,7 @@ class DeepEvalClient:
         )
         self.base_url = raw_url.rstrip("/")
         self.api_token = (
-            api_token or settings_token or os.getenv(
-                "EVALFRAMEWORK_API_KEY", "")
+            api_token or settings_token or os.getenv("EVALFRAMEWORK_API_KEY", "")
         )
         self.timeout = timeout
 
@@ -50,10 +49,10 @@ class DeepEvalClient:
         method: HTTPMethod,
         endpoint: str,
         *,
-        json:
-        params:
+        json: Optional[Dict] = None,
+        params: Optional[Dict] = None,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         url = f"{self.base_url}{endpoint}"
 
         headers = {
@@ -90,17 +89,63 @@ class DeepEvalClient:
             "raw": resp.text,
         }
 
-    def _get(
-
+    def _get(
+        self,
+        endpoint: str,
+        params: Dict,
+        raise_exception: bool = False,
+        *args: Tuple,
+        **kw: Dict,
+    ) -> Optional[Dict]:
+        return self._request(
+            "GET", endpoint, params=params, raise_exception=raise_exception, *args, **kw
+        )
 
-    def _post(
-
+    def _post(
+        self,
+        endpoint: str,
+        payload: Dict,
+        raise_exception: bool = False,
+        *args: Tuple,
+        **kw: Dict,
+    ) -> Optional[Dict]:
+        return self._request(
+            "POST", endpoint, json=payload, raise_exception=raise_exception, *args, **kw
+        )
 
-    def _patch(
-
+    def _patch(
+        self,
+        endpoint: str,
+        payload: Dict,
+        raise_exception: bool = False,
+        *args: Tuple,
+        **kw: Dict,
+    ) -> Optional[Dict]:
+        return self._request(
+            "PATCH",
+            endpoint,
+            json=payload,
+            raise_exception=raise_exception,
+            *args,
+            **kw,
+        )
 
-    def _delete(
-
+    def _delete(
+        self,
+        endpoint: str,
+        payload: Dict,
+        raise_exception: bool = False,
+        *args: Tuple,
+        **kw: Dict,
+    ) -> Optional[Dict]:
+        return self._request(
+            "DELETE",
+            endpoint,
+            json=payload,
+            raise_exception=raise_exception,
+            *args,
+            **kw,
+        )
 
     def update_evaluation_testcase_tag(
         self,
@@ -133,10 +178,11 @@ class DeepEvalClient:
         limit: int = 10,
         offset: int = 0,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """
         List evaluation testcases for the current API token only.
         Sorted by created_at DESC (newest first).
+        Returns items + pagination metadata.
         """
         return self._get(
             endpoint="/eval-framework/deepeval/evaluation-testcases/token",
@@ -197,27 +243,25 @@ class DeepEvalClient:
     def compare_testcases(
         self,
         *,
-        testcase_a_id: int
-        testcase_a_tag: str
-        testcase_b_id: int
-        testcase_b_tag: str
+        testcase_a_id: Optional[int] = None,
+        testcase_a_tag: Optional[str] = None,
+        testcase_b_id: Optional[int] = None,
+        testcase_b_tag: Optional[str] = None,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """
         Compare two evaluation testcases using IDs or tags.
         Exactly one identifier (id or tag) must be provided per testcase.
         """
 
-        def validate(id_, tag, name: str):
+        def validate(id_: Optional[int], tag: Optional[str], name: str) -> None:
             if bool(id_) == bool(tag):
-                raise ValueError(
-                    f"Provide exactly one of {name}_id or {name}_tag"
-                )
+                raise ValueError(f"Provide exactly one of {name}_id or {name}_tag")
 
         validate(testcase_a_id, testcase_a_tag, "testcase_a")
         validate(testcase_b_id, testcase_b_tag, "testcase_b")
 
-        params:
+        params: Dict[str, Union[int, str]] = {}
 
         if testcase_a_id is not None:
             params["testcase_a_id"] = testcase_a_id
@@ -241,7 +285,7 @@ class DeepEvalClient:
         label_a: str,
         label_b: str,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """
         Compare the latest evaluation testcases for two labels.
         """
@@ -259,7 +303,7 @@ class DeepEvalClient:
         *,
         label: str,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """
         Compare the last two evaluation testcases for a given label.
         """
@@ -277,7 +321,7 @@ class DeepEvalClient:
         config: EvalConfig,
         *,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]: ...
 
     @overload
     def text_eval(
@@ -286,20 +330,20 @@ class DeepEvalClient:
         data: List[TextInputItem],
         metrics: List[MetricConfig],
         component: str = "unknown",
-        label: str
+        label: Optional[str] = None,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]: ...
 
     def text_eval(
         self,
-        config: EvalConfig
+        config: Optional[EvalConfig] = None,
         *,
-        data: List[TextInputItem]
-        metrics: List[MetricConfig]
+        data: Optional[List[TextInputItem]] = None,
+        metrics: Optional[List[MetricConfig]] = None,
         component: str = "unknown",
-        label: str
+        label: Optional[str] = None,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         if config is None:
             config = EvalConfig(
                 data=data,
@@ -309,7 +353,9 @@ class DeepEvalClient:
             )
 
         return self._post(
-            endpoint="/deepeval/text-eval",
+            endpoint="/deepeval/text-eval",
+            payload=config.model_dump(),
+            raise_exception=raise_exception,
         )
 
     def text_eval_background(
@@ -319,12 +365,16 @@ class DeepEvalClient:
         raise_exception: bool = False,
         component: str = "unknown",
         label: Union[str, None] = None,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """Run background text evaluation (async job)."""
         payload = EvalConfig.model_construct(
             data=data, metrics=metrics, component=component, version=label
         ).model_dump()
-        return self._post(
+        return self._post(
+            endpoint="/deepeval/text-eval/background",
+            payload=payload,
+            raise_exception=raise_exception,
+        )
 
     @overload
     def schema_eval(
@@ -333,9 +383,9 @@ class DeepEvalClient:
         data: List[SchemaInputItem],
         metrics: List[SchemaMetricConfig],
         component: str = "unknown",
-        label: str
+        label: Optional[str] = None,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]: ...
 
     @overload
     def schema_eval(
@@ -343,18 +393,18 @@ class DeepEvalClient:
         config: SchemaEvalConfig,
         *,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]: ...
 
     def schema_eval(
         self,
-        config: SchemaEvalConfig
+        config: Optional[SchemaEvalConfig] = None,
         *,
-        data: List[SchemaInputItem]
-        metrics: List[SchemaMetricConfig]
+        data: Optional[List[SchemaInputItem]] = None,
+        metrics: Optional[List[SchemaMetricConfig]] = None,
         component: str = "unknown",
-        label: str
+        label: Optional[str] = None,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         if config is None:
             if data is None or metrics is None:
                 raise ValueError(
@@ -381,13 +431,15 @@ class DeepEvalClient:
         raise_exception: bool = False,
         component: str = "unknown",
         label: Union[str, None] = None,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """Run background schema evaluation (async job)."""
         payload = SchemaEvalConfig.model_construct(
             data=data, metrics=metrics, component=component, version=label
         ).model_dump()
         return self._post(
-            endpoint="/deepeval/schema-eval/background",
+            endpoint="/deepeval/schema-eval/background",
+            payload=payload,
+            raise_exception=raise_exception,
         )
 
     def maybe_text_eval(
@@ -398,7 +450,7 @@ class DeepEvalClient:
         raise_exception: bool = False,
         component: str = "unknown",
         label: Union[str, None] = None,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """Randomly run text_eval based on a probability between 0 and 1."""
         self._validate_chance(chance)
         return (
@@ -421,7 +473,7 @@ class DeepEvalClient:
         raise_exception: bool = False,
         component: str = "unknown",
         label: Union[str, None] = None,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """Randomly run text_eval_background based on a probability between 0 and 1."""
         self._validate_chance(chance)
         return (
@@ -440,7 +492,7 @@ class DeepEvalClient:
         raise_exception: bool = False,
         component: str = "unknown",
         label: Union[str, None] = None,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """Randomly run schema_eval based on a probability between 0 and 1."""
         self._validate_chance(chance)
         return (
@@ -463,7 +515,7 @@ class DeepEvalClient:
         raise_exception: bool = False,
         component: str = "unknown",
         label: Union[str, None] = None,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """Randomly run text_eval_background based on a probability between 0 and 1."""
         self._validate_chance(chance)
         return (
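All four maybe_* wrappers share one pattern: validate chance, then run the underlying eval with that probability, so chance=0.1 evaluates roughly one call in ten. The diff only shows their signatures, so this is a simplified sketch of the idiom, not the SDK's exact code:

import random

def maybe_run(chance: float):
    if not 0 <= chance <= 1:  # stands in for _validate_chance
        raise ValueError("chance must be between 0 and 1")
    if random.random() < chance:
        return "ran evaluation"  # stands in for text_eval(...)
    return None  # sampled out: no request is made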
rakam_eval_sdk/schema.py
CHANGED
@@ -1,6 +1,6 @@
 # Common base class for all metric configs
 import sys
-from typing import
+from typing import Any, Dict, List, Literal, Optional, Union
 
 # Base class (you can keep this abstract)
 from pydantic import BaseModel, Field
@@ -129,3 +129,24 @@ class SchemaEvalConfig(BaseModel):
     label: Union[str, None] = None
     data: List[SchemaInputItem]
     metrics: List[SchemaMetricConfig] = Field(default_factory=list)
+
+
+class MetricDiff(BaseModel):
+    metric: str
+    score_a: Optional[float]
+    score_b: Optional[float]
+    delta: Optional[float]
+
+    success_a: Optional[bool]
+    success_b: Optional[bool]
+
+    threshold_a: Optional[float]
+    threshold_b: Optional[float]
+
+    status: str  # "unchanged" | "changed" | "added" | "removed"
+
+
+class TestCaseComparison(BaseModel):
+    testcase_a_id: int
+    testcase_b_id: int
+    metrics: List[MetricDiff]
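The two new models give the compare endpoint's payload a typed shape. Since status is documented as a closed set of four strings, a Literal type could enforce it, but the diff keeps a plain str. A minimal round-trip with invented values:

from rakam_eval_sdk.schema import MetricDiff, TestCaseComparison

diff = MetricDiff(
    metric="faithfulness",
    score_a=0.71, score_b=0.64, delta=-0.07,
    success_a=True, success_b=False,
    threshold_a=0.7, threshold_b=0.7,
    status="changed",
)
comparison = TestCaseComparison(testcase_a_id=101, testcase_b_id=102, metrics=[diff])
print(comparison.model_dump())  # pydantic v2 serialization, as used elsewhere in the SDK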
rakam_eval_sdk-0.2.4rc1.dist-info/RECORD
ADDED
@@ -0,0 +1,10 @@
+rakam_eval_sdk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+rakam_eval_sdk/cli.py,sha256=HjymI0UTKRgJb-ewDuINuTtH605MkcB9nD0qNlbdFyc,21905
+rakam_eval_sdk/client.py,sha256=8KwpNt7WHgMfcXAQtrAlnUihai9TiADRieiuWeIxg6E,15575
+rakam_eval_sdk/decorators.py,sha256=_9VFQmoYWd6cqnNryZJWEwYHQRxY7vIOam4z45zBk3c,1794
+rakam_eval_sdk/schema.py,sha256=DT-uQsE3XB7AAii68QACwIslLqXRQFb1c6efkqLQysI,3960
+rakam_eval_sdk/utils/decorator_utils.py,sha256=g0TjXtG9o4hwhUAFP8GJsXAkjhZhzeseTAg-YBFjj2g,1763
+rakam_eval_sdk-0.2.4rc1.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
+rakam_eval_sdk-0.2.4rc1.dist-info/entry_points.txt,sha256=tNhwmM_UGELb3h0zOfgCrtTheUkP-k8jGv0rTOfRSps,56
+rakam_eval_sdk-0.2.4rc1.dist-info/METADATA,sha256=Sy2Ghp5wcrmS7AzBkhp8C8LVb0dTS7tSNtgEAJCfbU8,6022
+rakam_eval_sdk-0.2.4rc1.dist-info/RECORD,,
rakam_eval_sdk-0.2.3.dist-info/RECORD
DELETED
@@ -1,10 +0,0 @@
-rakam_eval_sdk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-rakam_eval_sdk/cli.py,sha256=gV4FsAde1WB5rRPLW2mRqTO4X0SZhvepcWH10KE5IIY,17642
-rakam_eval_sdk/client.py,sha256=sEXogMqtXgKGt_1Xuwg-S2Q2zRC6Y0ZVG_Wy1djbi28,14585
-rakam_eval_sdk/decorators.py,sha256=_9VFQmoYWd6cqnNryZJWEwYHQRxY7vIOam4z45zBk3c,1794
-rakam_eval_sdk/schema.py,sha256=woXu5GT1wuP_d7G11idopwpGmk0nV1SrxGShgWk-RA8,3530
-rakam_eval_sdk/utils/decorator_utils.py,sha256=g0TjXtG9o4hwhUAFP8GJsXAkjhZhzeseTAg-YBFjj2g,1763
-rakam_eval_sdk-0.2.3.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
-rakam_eval_sdk-0.2.3.dist-info/entry_points.txt,sha256=tNhwmM_UGELb3h0zOfgCrtTheUkP-k8jGv0rTOfRSps,56
-rakam_eval_sdk-0.2.3.dist-info/METADATA,sha256=ybu9LsKuQw0gfhJGPW-wTO6LC4bJw7sDlVuUQJQRqZ8,6019
-rakam_eval_sdk-0.2.3.dist-info/RECORD,,
{rakam_eval_sdk-0.2.3.dist-info → rakam_eval_sdk-0.2.4rc1.dist-info}/WHEEL
File without changes

{rakam_eval_sdk-0.2.3.dist-info → rakam_eval_sdk-0.2.4rc1.dist-info}/entry_points.txt
File without changes