rakam-eval-sdk 0.2.3__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: rakam-eval-sdk
- Version: 0.2.3
+ Version: 0.2.4
  Summary: Evaluation Framework SDK
  Author: Mohamed Bachar Touil
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "uv_build"

  [project]
  name = "rakam-eval-sdk"
- version = "0.2.3"
+ version = "0.2.4"
  description = "Evaluation Framework SDK"
  readme = "README.md"
  requires-python = ">=3.8"
@@ -6,16 +6,18 @@ import uuid
  from datetime import datetime
  from pathlib import Path
  from pprint import pprint
- from typing import Any, Dict, Optional, Sequence
+ from typing import Any, Dict, Optional

  import typer
  from dotenv import load_dotenv
  from rich.console import Console
  from rich.panel import Panel
  from rich.pretty import Pretty
+ from typer import secho

  from rakam_eval_sdk.client import DeepEvalClient
  from rakam_eval_sdk.decorators import eval_run
+ from rakam_eval_sdk.schema import MetricDiff, TestCaseComparison
  from rakam_eval_sdk.utils.decorator_utils import (
      find_decorated_functions,
      load_module_from_path,
@@ -163,19 +165,22 @@ def list_runs(
      """
      client = DeepEvalClient()

-     runs = client.list_evaluation_testcases(
+     response = client.list_evaluation_testcases(
          limit=limit,
          offset=offset,
          raise_exception=True,
      )

-     if not runs:
+     items = response.get("items", [])
+     total = response.get("total", 0)
+
+     if not items:
          typer.echo("No evaluation runs found.")
          return

-     typer.echo(f"[id] " f"{'tag':<20}" f"{'label':<20}" f"created_at")
-     # pretty CLI output
-     for run in runs:
+     typer.echo(f"[id] {'tag':<20}{'label':<20}created_at")
+
+     for run in items:
          run_id = run.get("id")
          label = run.get("label") or "-"
          uid = run.get("tag") or "-"
@@ -189,8 +194,12 @@ def list_runs(
          except ValueError:
              pass

-         typer.echo(
-             f"[{run_id}] " f"{uid:<20} " f"{label:<20} " f"{created_at}")
+         typer.echo(f"[{run_id}] {uid:<20} {label:<20} {created_at}")
+
+     shown = offset + len(items)
+     if shown < total:
+         typer.echo()
+         typer.echo(f"Showing {shown} of {total} runs. Use --limit to see more.")


  @app.command()
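
The new list_runs output above relies on a paginated response shape ({"items": [...], "total": N}). A minimal sketch of consuming that shape directly from DeepEvalClient, assuming the same item keys (id, tag, label, created_at) the CLI reads:

# Sketch only: mirrors how the reworked list_runs consumes the paginated response.
from rakam_eval_sdk.client import DeepEvalClient

client = DeepEvalClient()
response = client.list_evaluation_testcases(limit=10, offset=0, raise_exception=True)

items = response.get("items", [])
total = response.get("total", 0)

for run in items:
    # Field names follow the CLI code above; anything else would be an assumption.
    print(run.get("id"), run.get("tag") or "-", run.get("label") or "-", run.get("created_at"))

print(f"Showing {len(items)} of {total} runs")
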
@@ -410,8 +419,7 @@ def _print_and_save(
          return

      if out.exists() and not overwrite:
-         typer.echo(
-             f"❌ File already exists: {out} (use --overwrite to replace)")
+         typer.echo(f"❌ File already exists: {out} (use --overwrite to replace)")
          raise typer.Exit(code=1)

      out.parent.mkdir(parents=True, exist_ok=True)
@@ -422,12 +430,109 @@ def _print_and_save(
      typer.echo(f"💾 Result saved to {out}")


+ def pct_change(a: float | None, b: float | None) -> str | None:
+     if a is None or b is None or a == 0:
+         return None
+     return f"{((b - a) / a) * 100:+.2f}%"
+
+
+ def metric_direction(delta: float | None) -> str:
+     if delta is None:
+         return "unchanged"
+     if delta > 0:
+         return "improved"
+     if delta < 0:
+         return "regressed"
+     return "unchanged"
+
+
+ def print_metric_diff(diff: MetricDiff):
+     secho(f"\nMetric: {diff.metric}", bold=True)
+
+     if diff.status == "added":
+         secho(f"+ score: {diff.score_b}", fg="green")
+         secho(f"+ threshold: {diff.threshold_b}", fg="green")
+         secho(f"+ success: {diff.success_b}", fg="green")
+         return
+
+     if diff.status == "removed":
+         secho(f"- score: {diff.score_a}", fg="red")
+         secho(f"- threshold: {diff.threshold_a}", fg="red")
+         secho(f"- success: {diff.success_a}", fg="red")
+         return
+
+     # unchanged / changed
+     if diff.score_a != diff.score_b:
+         direction = metric_direction(diff.delta)
+         color = "green" if direction == "improved" else "red"
+         pct = pct_change(diff.score_a, diff.score_b)
+
+         secho(f"- score: {diff.score_a}", fg="red")
+         secho(
+             f"+ score: {diff.score_b}" + (f" ({pct})" if pct else ""),
+             fg=color,
+         )
+     else:
+         secho(f" score: {diff.score_a}", dim=True)
+
+     if diff.threshold_a != diff.threshold_b:
+         secho(f"- threshold: {diff.threshold_a}", fg="red")
+         secho(f"+ threshold: {diff.threshold_b}", fg="green")
+     else:
+         secho(f" threshold: {diff.threshold_a}", dim=True)
+
+     if diff.success_a != diff.success_b:
+         secho(f"- success: {diff.success_a}", fg="red")
+         secho(f"+ success: {diff.success_b}", fg="green")
+     else:
+         secho(f" success: {diff.success_a}", dim=True)
+
+
+ def summarize(metrics: Any) -> Dict[str, int]:
+     summary = {
+         "improved": 0,
+         "regressed": 0,
+         "unchanged": 0,
+         "added": 0,
+         "removed": 0,
+     }
+
+     for m in metrics:
+         if m.status in ("added", "removed"):
+             summary[m.status] += 1
+         else:
+             direction = metric_direction(m.delta)
+             summary[direction] += 1
+
+     return summary
+
+
+ def pretty_print_comparison(resp: Any, summary_only: bool = False):
+     if not summary_only:
+         for metric in resp.metrics:
+             print_metric_diff(metric)
+         return
+
+     print_summary(resp.metrics)
+
+
+ def print_summary(metrics: Any):
+     summary = summarize(metrics)
+
+     secho("\nSummary:", bold=True)
+     secho(f" ↑ Improved: {summary['improved']}", fg="green")
+     secho(f" ↓ Regressed: {summary['regressed']}", fg="red")
+     secho(f" ± Unchanged: {summary['unchanged']}", dim=True)
+     secho(f" + Added: {summary['added']}", fg="green")
+     secho(f" - Removed: {summary['removed']}", fg="red")
+
+
  @app.command()
  def compare(
      tag: list[str] = typer.Option(
          [],
          "--tag",
-         help="Tag identifying a reference testcase",
+         help="Label identifying a reference testcase",
      ),
      run: list[int] = typer.Option(
          [],
@@ -437,7 +542,12 @@ def compare(
      pretty: bool = typer.Option(
          True,
          "--pretty/--raw",
-         help="Pretty-print the response",
+         help="Pretty diff output (default) or raw JSON",
+     ),
+     summary: bool = typer.Option(
+         False,
+         "--summary",
+         help="Show summary only (no per-metric diff)",
      ),
      raise_exception: bool = typer.Option(
          False,
@@ -457,31 +567,39 @@ def compare(
      ),
  ) -> None:
      """
-     Compare two evaluation testcases using runs and/or labels.
+     Compare two evaluation testcases (runs and/or labels).
+
+     Output:
+     - Unified diff-style view by default
+     - Raw JSON with --raw
+     - Summary of improvements / regressions
      """

-     targets = []
+     targets: list[tuple[str, str | int]] = []

      for r in run:
          targets.append(("run", r))

-     for l in tag:
-         targets.append(("label", l))
+     for t in tag:
+         targets.append(("label", t))

      if len(targets) != 2:
-         typer.echo(
-             "❌ Provide exactly two targets using --run and/or --label"
+         typer.secho(
+             "❌ Provide exactly two targets using --run and/or --tag",
+             fg="red",
+             bold=True,
          )
          raise typer.Exit(code=1)

-     client = DeepEvalClient()
-
      (type_a, value_a), (type_b, value_b) = targets

-     typer.echo(
-         f"🔍 Comparing {type_a} '{value_a}' ↔ {type_b} '{value_b}'"
+     typer.secho(
+         f"🔍 Comparing {type_a} '{value_a}' ↔ {type_b} '{value_b}'",
+         bold=True,
      )

+     client = DeepEvalClient()
+
      kwargs = {"raise_exception": raise_exception}

      if type_a == "run":
@@ -497,14 +615,34 @@ def compare(
      try:
          resp = client.compare_testcases(**kwargs)
      except Exception as e:
-         typer.echo(f"❌ Request failed: {e}")
+         typer.secho(f"❌ Request failed: {e}", fg="red")
          raise typer.Exit(code=1)

      if not resp:
-         typer.echo("⚠️ No response received")
+         typer.secho("⚠️ No response received", fg="yellow")
          raise typer.Exit(code=1)

-     _print_and_save(resp, pretty, out, overwrite)
+     if out:
+         if out.exists() and not overwrite:
+             typer.secho(
+                 f"❌ File already exists: {out} (use --overwrite)",
+                 fg="red",
+             )
+             raise typer.Exit(code=1)
+
+         out.write_text(json.dumps(resp, indent=2))
+         typer.secho(f"💾 Saved raw output to {out}", fg="green")
+
+     if not pretty:
+         typer.echo(json.dumps(resp, indent=2))
+         return
+
+     comparison = TestCaseComparison(**resp)
+
+     pretty_print_comparison(
+         comparison,
+         summary_only=summary,
+     )


  @app.command(hidden=True)
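
The reworked compare command now parses the raw response into the new TestCaseComparison model before printing. A minimal sketch of the equivalent programmatic flow, assuming the backend payload matches that schema; the run id and tag values are hypothetical:

# Sketch only: compare two testcases and summarize the metric deltas by hand.
from rakam_eval_sdk.client import DeepEvalClient
from rakam_eval_sdk.schema import TestCaseComparison

client = DeepEvalClient()
resp = client.compare_testcases(
    testcase_a_id=12,           # hypothetical run id
    testcase_b_tag="baseline",  # hypothetical tag
    raise_exception=True,
)

comparison = TestCaseComparison(**resp)
for m in comparison.metrics:
    if m.status in ("added", "removed"):
        print(f"{m.metric}: {m.status}")
    elif m.delta is None or m.delta == 0:
        print(f"{m.metric}: unchanged")
    else:
        print(f"{m.metric}: {'improved' if m.delta > 0 else 'regressed'} ({m.delta:+.3f})")
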
@@ -40,8 +40,7 @@ class DeepEvalClient:
          )
          self.base_url = raw_url.rstrip("/")
          self.api_token = (
-             api_token or settings_token or os.getenv(
-                 "EVALFRAMEWORK_API_KEY", "")
+             api_token or settings_token or os.getenv("EVALFRAMEWORK_API_KEY", "")
          )
          self.timeout = timeout

@@ -50,10 +49,10 @@ class DeepEvalClient:
          method: HTTPMethod,
          endpoint: str,
          *,
-         json: dict | None = None,
-         params: dict | None = None,
+         json: Dict | None = None,
+         params: Dict | None = None,
          raise_exception: bool = False,
-     ) -> Optional[dict]:
+     ) -> Optional[Dict]:
          url = f"{self.base_url}{endpoint}"

          headers = {
@@ -90,16 +89,16 @@ class DeepEvalClient:
              "raw": resp.text,
          }

-     def _get(self, endpoint: str, params: dict, *args, **kw):
+     def _get(self, endpoint: str, params: Dict, *args, **kw):
          return self._request("GET", endpoint, params=params, *args, **kw)

-     def _post(self, endpoint: str, payload: dict, *args, **kw):
+     def _post(self, endpoint: str, payload: Dict, *args, **kw):
          return self._request("POST", endpoint, json=payload, *args, **kw)

-     def _patch(self, endpoint: str, payload: dict, *args, **kw):
+     def _patch(self, endpoint: str, payload: Dict, *args, **kw):
          return self._request("PATCH", endpoint, json=payload, *args, **kw)

-     def _delete(self, endpoint: str, payload: dict, *args, **kw):
+     def _delete(self, endpoint: str, payload: Dict, *args, **kw):
          return self._request("DELETE", endpoint, json=payload, *args, **kw)

      def update_evaluation_testcase_tag(
@@ -133,10 +132,11 @@ class DeepEvalClient:
          limit: int = 10,
          offset: int = 0,
          raise_exception: bool = False,
-     ) -> Optional[List[Dict]]:
+     ) -> Optional[Dict]:
          """
          List evaluation testcases for the current API token only.
          Sorted by created_at DESC (newest first).
+         Returns items + pagination metadata.
          """
          return self._get(
              endpoint="/eval-framework/deepeval/evaluation-testcases/token",
@@ -202,7 +202,7 @@ class DeepEvalClient:
          testcase_b_id: int | None = None,
          testcase_b_tag: str | None = None,
          raise_exception: bool = False,
-     ) -> Optional[dict]:
+     ) -> Optional[Dict]:
          """
          Compare two evaluation testcases using IDs or tags.
          Exactly one identifier (id or tag) must be provided per testcase.
@@ -210,9 +210,7 @@ class DeepEvalClient:

          def validate(id_, tag, name: str):
              if bool(id_) == bool(tag):
-                 raise ValueError(
-                     f"Provide exactly one of {name}_id or {name}_tag"
-                 )
+                 raise ValueError(f"Provide exactly one of {name}_id or {name}_tag")

          validate(testcase_a_id, testcase_a_tag, "testcase_a")
          validate(testcase_b_id, testcase_b_tag, "testcase_b")
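
The validate helper keeps the exactly-one-identifier rule: for each testcase, pass an id or a tag, never both or neither. Hypothetical call shapes, for illustration only:

# OK: one identifier per testcase.
client.compare_testcases(testcase_a_id=12, testcase_b_tag="baseline")
# ValueError: both id and tag supplied for testcase_a.
client.compare_testcases(testcase_a_id=12, testcase_a_tag="v1", testcase_b_id=15)
# ValueError: neither id nor tag supplied for testcase_a.
client.compare_testcases(testcase_b_id=15)
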
@@ -241,7 +239,7 @@ class DeepEvalClient:
          label_a: str,
          label_b: str,
          raise_exception: bool = False,
-     ) -> Optional[dict]:
+     ) -> Optional[Dict]:
          """
          Compare the latest evaluation testcases for two labels.
          """
@@ -259,7 +257,7 @@ class DeepEvalClient:
          *,
          label: str,
          raise_exception: bool = False,
-     ) -> Optional[dict]:
+     ) -> Optional[Dict]:
          """
          Compare the last two evaluation testcases for a given label.
          """
@@ -277,7 +275,7 @@ class DeepEvalClient:
          config: EvalConfig,
          *,
          raise_exception: bool = False,
-     ) -> Optional[dict]: ...
+     ) -> Optional[Dict]: ...

      @overload
      def text_eval(
@@ -288,7 +286,7 @@ class DeepEvalClient:
          component: str = "unknown",
          label: str | None = None,
          raise_exception: bool = False,
-     ) -> Optional[dict]: ...
+     ) -> Optional[Dict]: ...

      def text_eval(
          self,
@@ -299,7 +297,7 @@ class DeepEvalClient:
          component: str = "unknown",
          label: str | None = None,
          raise_exception: bool = False,
-     ) -> Optional[dict]:
+     ) -> Optional[Dict]:
          if config is None:
              config = EvalConfig(
                  data=data,
@@ -309,7 +307,9 @@ class DeepEvalClient:
              )

          return self._post(
-             endpoint="/deepeval/text-eval", payload=config.model_dump(), raise_exception=raise_exception
+             endpoint="/deepeval/text-eval",
+             payload=config.model_dump(),
+             raise_exception=raise_exception,
          )

      def text_eval_background(
@@ -319,12 +319,16 @@ class DeepEvalClient:
          raise_exception: bool = False,
          component: str = "unknown",
          label: Union[str, None] = None,
-     ) -> Optional[dict]:
+     ) -> Optional[Dict]:
          """Run background text evaluation (async job)."""
          payload = EvalConfig.model_construct(
              data=data, metrics=metrics, component=component, version=label
          ).model_dump()
-         return self._post(endpoint="/deepeval/text-eval/background", payload=payload, raise_exception=raise_exception)
+         return self._post(
+             endpoint="/deepeval/text-eval/background",
+             payload=payload,
+             raise_exception=raise_exception,
+         )

      @overload
      def schema_eval(
@@ -335,7 +339,7 @@ class DeepEvalClient:
          component: str = "unknown",
          label: str | None = None,
          raise_exception: bool = False,
-     ) -> Optional[dict]: ...
+     ) -> Optional[Dict]: ...

      @overload
      def schema_eval(
@@ -343,7 +347,7 @@ class DeepEvalClient:
          config: SchemaEvalConfig,
          *,
          raise_exception: bool = False,
-     ) -> Optional[dict]: ...
+     ) -> Optional[Dict]: ...

      def schema_eval(
          self,
@@ -354,7 +358,7 @@ class DeepEvalClient:
          component: str = "unknown",
          label: str | None = None,
          raise_exception: bool = False,
-     ) -> Optional[dict]:
+     ) -> Optional[Dict]:
          if config is None:
              if data is None or metrics is None:
                  raise ValueError(
@@ -381,13 +385,15 @@ class DeepEvalClient:
          raise_exception: bool = False,
          component: str = "unknown",
          label: Union[str, None] = None,
-     ) -> Optional[dict]:
+     ) -> Optional[Dict]:
          """Run background schema evaluation (async job)."""
          payload = SchemaEvalConfig.model_construct(
              data=data, metrics=metrics, component=component, version=label
          ).model_dump()
          return self._post(
-             endpoint="/deepeval/schema-eval/background", payload=payload, raise_exception=raise_exception
+             endpoint="/deepeval/schema-eval/background",
+             payload=payload,
+             raise_exception=raise_exception,
          )

      def maybe_text_eval(
@@ -398,7 +404,7 @@ class DeepEvalClient:
          raise_exception: bool = False,
          component: str = "unknown",
          label: Union[str, None] = None,
-     ) -> Optional[dict]:
+     ) -> Optional[Dict]:
          """Randomly run text_eval based on a probability between 0 and 1."""
          self._validate_chance(chance)
          return (
@@ -421,7 +427,7 @@ class DeepEvalClient:
          raise_exception: bool = False,
          component: str = "unknown",
          label: Union[str, None] = None,
-     ) -> Optional[dict]:
+     ) -> Optional[Dict]:
          """Randomly run text_eval_background based on a probability between 0 and 1."""
          self._validate_chance(chance)
          return (
@@ -440,7 +446,7 @@ class DeepEvalClient:
          raise_exception: bool = False,
          component: str = "unknown",
          label: Union[str, None] = None,
-     ) -> Optional[dict]:
+     ) -> Optional[Dict]:
          """Randomly run schema_eval based on a probability between 0 and 1."""
          self._validate_chance(chance)
          return (
@@ -463,7 +469,7 @@ class DeepEvalClient:
          raise_exception: bool = False,
          component: str = "unknown",
          label: Union[str, None] = None,
-     ) -> Optional[dict]:
+     ) -> Optional[Dict]:
          """Randomly run text_eval_background based on a probability between 0 and 1."""
          self._validate_chance(chance)
          return (
@@ -129,3 +129,24 @@ class SchemaEvalConfig(BaseModel):
      label: Union[str, None] = None
      data: List[SchemaInputItem]
      metrics: List[SchemaMetricConfig] = Field(default_factory=list)
+
+
+ class MetricDiff(BaseModel):
+     metric: str
+     score_a: Optional[float]
+     score_b: Optional[float]
+     delta: Optional[float]
+
+     success_a: Optional[bool]
+     success_b: Optional[bool]
+
+     threshold_a: Optional[float]
+     threshold_b: Optional[float]
+
+     status: str  # "unchanged" | "changed" | "added" | "removed"
+
+
+ class TestCaseComparison(BaseModel):
+     testcase_a_id: int
+     testcase_b_id: int
+     metrics: List[MetricDiff]
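
The new schema models added above back the compare output. A minimal sketch constructing them by hand; all field values are invented for illustration and follow the status vocabulary noted in the MetricDiff comment:

# Sketch only: build a TestCaseComparison manually to see the shape the CLI consumes.
from rakam_eval_sdk.schema import MetricDiff, TestCaseComparison

comparison = TestCaseComparison(
    testcase_a_id=12,
    testcase_b_id=15,
    metrics=[
        MetricDiff(
            metric="answer_relevancy",
            score_a=0.71, score_b=0.78, delta=0.07,
            success_a=True, success_b=True,
            threshold_a=0.7, threshold_b=0.7,
            status="changed",
        ),
        MetricDiff(
            metric="faithfulness",
            score_a=None, score_b=0.9, delta=None,
            success_a=None, success_b=True,
            threshold_a=None, threshold_b=0.8,
            status="added",
        ),
    ],
)
print(comparison.model_dump())
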