rakam-eval-sdk 0.2.4rc7-py3-none-any.whl → 0.2.4rc8-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in that registry.
rakam_eval_sdk/cli.py CHANGED
@@ -5,7 +5,7 @@ import sys
5
5
  import uuid
6
6
  from datetime import datetime
7
7
  from pathlib import Path
8
- from typing import Any, Dict, List, Optional, Set, Tuple, Union
8
+ from typing import Any, Dict, List, Optional, Set, Tuple
9
9
 
10
10
  import typer
11
11
  from dotenv import load_dotenv
@@ -20,7 +20,12 @@ from rakam_eval_sdk.utils.decorator_utils import (
20
20
  find_decorated_functions,
21
21
  load_module_from_path,
22
22
  )
23
- from rakam_eval_sdk.utils.print import _print_and_save, pretty_print_comparison
23
+ from rakam_eval_sdk.utils.print import (
24
+ _print_and_save,
25
+ git_diff,
26
+ pretty_print_comparison,
27
+ serialize_for_diff,
28
+ )
24
29
 
25
30
  load_dotenv()
26
31
  app = typer.Typer(help="CLI tools for evaluation utilities")
@@ -30,10 +35,10 @@ console = Console()
30
35
  PROJECT_ROOT = os.path.abspath(".")
31
36
  if PROJECT_ROOT not in sys.path:
32
37
  sys.path.insert(0, PROJECT_ROOT)
33
- list_app = typer.Typer(help="List resources")
34
- app.add_typer(list_app, name="list")
38
+ list_app = typer.Typer(help="List evaluations or runs")
35
39
  metrics_app = typer.Typer(help="Metrics utilities")
36
- app.add_typer(metrics_app, name="metrics")
40
+
41
+ # Sub-apps are registered at the end to control command order
37
42
 
38
43
 
39
44
  def extract_metric_names(config: Any) -> List[Tuple[str, Optional[str]]]:
@@ -137,7 +142,7 @@ def list_evals(
137
142
  ),
138
143
  ) -> None:
139
144
  """
140
- Find functions decorated with @track.
145
+ List evaluations (functions decorated with @eval_run).
141
146
  """
142
147
  TARGET_DECORATOR = eval_run.__name__
143
148
  files = directory.rglob("*.py") if recursive else directory.glob("*.py")
@@ -151,16 +156,16 @@ def list_evals(
151
156
  typer.echo(f"{file}:{fn}")
152
157
 
153
158
  if not found:
154
- typer.echo(f"No @{TARGET_DECORATOR} functions found.")
159
+ typer.echo("No evaluations found.")
155
160
 
156
161
 
157
162
  @list_app.command("runs")
158
163
  def list_runs(
159
- limit: int = typer.Option(20, help="Max number of runs"),
164
+ limit: int = typer.Option(20, "-l", "--limit", help="Max number of runs"),
160
165
  offset: int = typer.Option(0, help="Pagination offset"),
161
166
  ) -> None:
162
167
  """
163
- List evaluation runs (newest first).
168
+ List runs (newest first).
164
169
  """
165
170
  client = DeepEvalClient()
166
171
 
@@ -174,7 +179,7 @@ def list_runs(
174
179
  total = response.get("total", 0)
175
180
 
176
181
  if not items:
177
- typer.echo("No evaluation runs found.")
182
+ typer.echo("No runs found.")
178
183
  return
179
184
 
180
185
  typer.echo(f"[id] {'tag':<20}{'label':<20}created_at")
@@ -203,13 +208,15 @@ def list_runs(
203
208
 
204
209
  @app.command()
205
210
  def show(
206
- id: Optional[int] = typer.Option(
211
+ run_id: Optional[int] = typer.Option(
207
212
  None,
213
+ "-i",
208
214
  "--id",
209
215
  help="Run ID",
210
216
  ),
211
217
  tag: Optional[str] = typer.Option(
212
218
  None,
219
+ "-t",
213
220
  "--tag",
214
221
  help="Run tag",
215
222
  ),
@@ -220,19 +227,20 @@ def show(
220
227
  ),
221
228
  ) -> None:
222
229
  """
223
- Show a single evaluation testcase by ID or tag.
230
+ Show a run by ID or tag.
224
231
  """
225
- if not id and not tag:
232
+
233
+ if not run_id and not tag:
226
234
  raise typer.BadParameter("You must provide either --id or --tag")
227
235
 
228
- if id and tag:
236
+ if run_id and tag:
229
237
  raise typer.BadParameter("Provide only one of --id or --tag")
230
238
 
231
239
  client = DeepEvalClient()
232
240
 
233
- if id:
234
- result = client.get_evaluation_testcase_by_id(id)
235
- identifier = f"id={id}"
241
+ if run_id:
242
+ result = client.get_evaluation_testcase_by_id(run_id)
243
+ identifier = f"run_id={run_id}"
236
244
  else:
237
245
  assert tag is not None
238
246
  result = client.get_evaluation_testcase_by_tag(tag)
@@ -265,7 +273,7 @@ def show(
265
273
  console.print(
266
274
  Panel.fit(
267
275
  Pretty(result),
268
- title="Evaluation TestCase",
276
+ title="Run",
269
277
  subtitle=identifier,
270
278
  )
271
279
  )
@@ -311,7 +319,7 @@ def run(
311
319
  save_runs: bool = typer.Option(
312
320
  False,
313
321
  "--save-runs",
314
- help="Save each evaluation run result to a JSON file",
322
+ help="Save each run result to a JSON file",
315
323
  ),
316
324
  output_dir: Path = typer.Option(
317
325
  Path("./eval_runs"),
@@ -320,7 +328,7 @@ def run(
320
328
  ),
321
329
  ) -> None:
322
330
  """
323
- Find and execute all functions decorated with @eval_run.
331
+ Execute evaluations (functions decorated with @eval_run).
324
332
  """
325
333
  files = directory.rglob("*.py") if recursive else directory.glob("*.py")
326
334
  TARGET_DECORATOR = eval_run.__name__
@@ -400,124 +408,141 @@ def run(
400
408
  typer.echo(f" ❌ Execution failed: {e}")
401
409
 
402
410
  if not executed_any and not dry_run:
403
- typer.echo("\nNo @eval_run functions executed.")
411
+ typer.echo("\nNo evaluations executed.")
412
+
413
+
414
+ def fetch_run(
415
+ client: DeepEvalClient,
416
+ *,
417
+ run_id: Optional[int],
418
+ tag: Optional[str],
419
+ ) -> Tuple[dict, str]:
420
+ """
421
+ Fetch a single run by id or tag.
422
+ Returns (payload, identifier)
423
+ """
424
+ if run_id is not None:
425
+ result = client.get_evaluation_testcase_by_id(run_id)
426
+ identifier = f"run_id={run_id}"
427
+ else:
428
+ assert tag is not None
429
+ result = client.get_evaluation_testcase_by_tag(tag)
430
+ identifier = f"tag={tag}"
431
+
432
+ if not result:
433
+ raise RuntimeError(f"No data returned for {identifier}")
434
+
435
+ return result, identifier
404
436
 
405
437
 
406
438
  @app.command()
407
439
  def compare(
408
440
  tag: List[str] = typer.Option(
409
441
  [],
442
+ "-t",
410
443
  "--tag",
411
- help="Label identifying a reference testcase",
444
+ help="Run tag",
412
445
  ),
413
- run: List[int] = typer.Option(
446
+ run_id: List[int] = typer.Option(
414
447
  [],
415
- "--run",
416
- help="Run ID identifying an evaluation testcase",
417
- ),
418
- pretty: bool = typer.Option(
419
- True,
420
- "--pretty/--raw",
421
- help="Pretty diff output (default) or raw JSON",
448
+ "-i",
449
+ "--id",
450
+ help="Run ID",
422
451
  ),
423
452
  summary: bool = typer.Option(
424
453
  False,
425
454
  "--summary",
426
- help="Show summary only (no per-metric diff)",
455
+ help="Show summary diff only",
427
456
  ),
428
- raise_exception: bool = typer.Option(
457
+ side_by_side: bool = typer.Option(
429
458
  False,
430
- "--raise",
431
- help="Raise HTTP exceptions instead of swallowing them",
432
- ),
433
- out: Optional[Path] = typer.Option(
434
- None,
435
- "-o",
436
- "--out",
437
- help="Optional file path to save the result as JSON",
438
- ),
439
- overwrite: bool = typer.Option(
440
- False,
441
- "--overwrite",
442
- help="Overwrite output file if it already exists",
459
+ "--side-by-side",
460
+ help="Show side-by-side diff (git)",
443
461
  ),
444
462
  ) -> None:
445
463
  """
446
- Compare two evaluation testcases (runs and/or labels).
464
+ Compare two evaluation runs.
447
465
 
448
- Output:
449
- - Unified diff-style view by default
450
- - Raw JSON with --raw
451
- - Summary of improvements / regressions
466
+ Default: unified git diff
452
467
  """
453
468
 
454
- targets: List[Tuple[str, Union[str, int]]] = []
469
+ if summary and side_by_side:
470
+ typer.secho(
471
+ "❌ --summary and --side-by-side cannot be used together",
472
+ fg="red",
473
+ bold=True,
474
+ )
475
+ raise typer.Exit(code=1)
455
476
 
456
- for r in run:
457
- targets.append(("run", r))
477
+ targets: List[Tuple[Optional[int], Optional[str]]] = []
458
478
 
479
+ for r in run_id:
480
+ targets.append(("run", r))
459
481
  for t in tag:
460
- targets.append(("label", t))
482
+ targets.append(("tag", t))
461
483
 
462
484
  if len(targets) != 2:
463
485
  typer.secho(
464
- "❌ Provide exactly two targets using --run and/or --tag",
486
+ "❌ Provide exactly two targets using --id and/or --tag",
465
487
  fg="red",
466
488
  bold=True,
467
489
  )
468
490
  raise typer.Exit(code=1)
469
491
 
470
- (type_a, value_a), (type_b, value_b) = targets
471
-
472
- typer.secho(
473
- f"🔍 Comparing {type_a} '{value_a}' ↔ {type_b} '{value_b}'",
474
- bold=True,
475
- )
476
-
477
492
  client = DeepEvalClient()
493
+ # Summary mode (reduced payload)
494
+ (type_a, value_a), (type_b, value_b) = targets
495
+ if summary:
496
+ kwargs: Dict[str, Any] = {"raise_exception": True}
497
+ if type_a == "run":
498
+ kwargs["testcase_a_id"] = value_a
499
+ else:
500
+ kwargs["testcase_a_tag"] = value_a
501
+
502
+ if type_b == "run":
503
+ kwargs["testcase_b_id"] = value_b
504
+ else:
505
+ kwargs["testcase_b_tag"] = value_b
506
+ try:
507
+ resp = client.compare_testcases(**kwargs)
508
+ except Exception as e:
509
+ typer.secho(f"❌ Request failed: {e}", fg="red")
510
+ raise typer.Exit(code=1)
478
511
 
479
- kwargs: Dict[str, Any] = {"raise_exception": raise_exception}
480
-
481
- if type_a == "run":
482
- kwargs["testcase_a_id"] = value_a
483
- else:
484
- kwargs["testcase_a_tag"] = value_a
485
-
486
- if type_b == "run":
487
- kwargs["testcase_b_id"] = value_b
488
- else:
489
- kwargs["testcase_b_tag"] = value_b
512
+ if not resp:
513
+ typer.secho("⚠️ No response received", fg="yellow")
514
+ raise typer.Exit(code=1)
515
+ comparison = TestCaseComparison(**resp)
516
+ pretty_print_comparison(
517
+ comparison,
518
+ summary_only=summary,
519
+ )
520
+ return
490
521
 
491
522
  try:
492
- resp = client.compare_testcases(**kwargs)
523
+ run_a, id_a = fetch_run(
524
+ client,
525
+ run_id=value_a if type_a == "run" else None,
526
+ tag=value_a if type_a == "tag" else None,
527
+ )
528
+ run_b, id_b = fetch_run(
529
+ client,
530
+ run_id=value_b if type_b == "run" else None,
531
+ tag=value_b if type_b == "tag" else None,
532
+ )
493
533
  except Exception as e:
494
- typer.secho(f"❌ Request failed: {e}", fg="red")
495
- raise typer.Exit(code=1)
496
-
497
- if not resp:
498
- typer.secho("⚠️ No response received", fg="yellow")
534
+ typer.secho(f"❌ Fetch failed: {e}", fg="red")
499
535
  raise typer.Exit(code=1)
500
536
 
501
- if out:
502
- if out.exists() and not overwrite:
503
- typer.secho(
504
- f"❌ File already exists: {out} (use --overwrite)",
505
- fg="red",
506
- )
507
- raise typer.Exit(code=1)
508
-
509
- out.write_text(json.dumps(resp, indent=2))
510
- typer.secho(f"💾 Saved raw output to {out}", fg="green")
537
+ a_text = serialize_for_diff(run_a)
538
+ b_text = serialize_for_diff(run_b)
511
539
 
512
- if not pretty:
513
- typer.echo(json.dumps(resp, indent=2))
514
- return
515
-
516
- comparison = TestCaseComparison(**resp)
517
-
518
- pretty_print_comparison(
519
- comparison,
520
- summary_only=summary,
540
+ git_diff(
541
+ a_text,
542
+ b_text,
543
+ label_a=f"{id_a}.full.json",
544
+ label_b=f"{id_b}.full.json",
545
+ side_by_side=side_by_side,
521
546
  )
522
547
 
523
548
 
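Note on the hunk above: the reworked compare command now fetches both runs itself through the new fetch_run helper and diffs their serialized payloads, calling the comparison API only for --summary. Below is a minimal sketch of the fetch step, assuming a reachable backend and the same environment-based credentials the CLI loads via load_dotenv(); the run ID 42 and the tag "baseline" are hypothetical.

from rakam_eval_sdk.cli import fetch_run
from rakam_eval_sdk.client import DeepEvalClient

client = DeepEvalClient()

# Each call returns (payload, identifier) and raises RuntimeError
# if the backend returns nothing for the requested run.
run_a, id_a = fetch_run(client, run_id=42, tag=None)          # by ID  -> identifier "run_id=42"
run_b, id_b = fetch_run(client, run_id=None, tag="baseline")  # by tag -> identifier "tag=baseline"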
@@ -554,7 +579,7 @@ def compare_label_latest(
554
579
  ),
555
580
  ) -> None:
556
581
  """
557
- Compare the latest evaluation runs for two labels.
582
+ Compare the latest runs for two labels.
558
583
  """
559
584
  client = DeepEvalClient()
560
585
 
@@ -632,11 +657,13 @@ def compare_last(
632
657
  def tag_command(
633
658
  run_id: Optional[int] = typer.Option(
634
659
  None,
635
- "--run",
636
- help="Evaluation run ID",
660
+ "-i",
661
+ "--id",
662
+ help="Run ID",
637
663
  ),
638
664
  tag: Optional[str] = typer.Option(
639
665
  None,
666
+ "-t",
640
667
  "--tag",
641
668
  help="Tag to assign to the run",
642
669
  ),
@@ -653,11 +680,11 @@ def tag_command(
653
680
  # --- validation ---
654
681
  if delete:
655
682
  if run_id or tag:
656
- typer.echo("❌ --delete cannot be used with --run or --tag")
683
+ typer.echo("❌ --delete cannot be used with --id or --tag")
657
684
  raise typer.Exit(code=1)
658
685
  else:
659
686
  if not run_id or not tag:
660
- typer.echo("❌ Use --run and --tag together, or --delete")
687
+ typer.echo("❌ Use --id and --tag together, or --delete")
661
688
  raise typer.Exit(code=1)
662
689
 
663
690
  client = DeepEvalClient()
@@ -687,6 +714,11 @@ def tag_command(
687
714
  typer.echo(f"Tag: {result.get('tag')}")
688
715
 
689
716
 
717
+ # Register sub-apps in user journey order (after regular commands)
718
+ app.add_typer(list_app, name="list")
719
+ app.add_typer(metrics_app, name="metrics")
720
+
721
+
690
722
  def main() -> None:
691
723
  app()
692
724
 
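This release also renames several CLI flags (--run becomes --id, with -i, -t and -l short forms on show, compare, tag and list runs). A minimal sketch of exercising the new surface in-process with Typer's test runner; the ID and tag values are hypothetical, and the commands still need the backend environment the CLI normally uses.

from typer.testing import CliRunner

from rakam_eval_sdk.cli import app

runner = CliRunner()

# List the five most recent runs, show one by ID, then compare it to a tagged run.
print(runner.invoke(app, ["list", "runs", "--limit", "5"]).output)
print(runner.invoke(app, ["show", "--id", "42"]).output)
print(runner.invoke(app, ["compare", "--id", "42", "--tag", "baseline", "--summary"]).output)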
@@ -0,0 +1,58 @@
1
+ from pathlib import Path
2
+ from types import SimpleNamespace
3
+
4
+ import pytest
5
+ import typer
6
+
7
+ from rakam_eval_sdk.cli import metrics
8
+
9
+
10
+ def test_metrics_no_metrics_found(
11
+ tmp_path: Path, monkeypatch: pytest.MonkeyPatch
12
+ ) -> None:
13
+ file = tmp_path / "eval.py"
14
+ file.write_text("def foo(): pass")
15
+
16
+ monkeypatch.setattr(
17
+ "rakam_eval_sdk.cli.find_decorated_functions",
18
+ lambda *_: [],
19
+ )
20
+
21
+ with pytest.raises(typer.Exit) as exc:
22
+ metrics(directory=tmp_path, recursive=False)
23
+
24
+ assert exc.value.exit_code == 0
25
+
26
+
27
+ def test_metrics_finds_metrics(
28
+ tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys
29
+ ) -> None:
30
+ file = tmp_path / "eval.py"
31
+ file.write_text("")
32
+
33
+ monkeypatch.setattr(
34
+ "rakam_eval_sdk.cli.find_decorated_functions",
35
+ lambda *_: ["run_eval"],
36
+ )
37
+
38
+ fake_module = SimpleNamespace(
39
+ run_eval=lambda: SimpleNamespace(
40
+ metrics=[
41
+ SimpleNamespace(type="accuracy", name=None),
42
+ SimpleNamespace(type="f1", name="f1_v2"),
43
+ ]
44
+ )
45
+ )
46
+
47
+ monkeypatch.setattr(
48
+ "rakam_eval_sdk.cli.load_module_from_path",
49
+ lambda _: fake_module,
50
+ )
51
+
52
+ metrics(directory=tmp_path, recursive=False)
53
+
54
+ out = capsys.readouterr().out
55
+
56
+ assert "accuracy" in out
57
+ assert "f1" in out
58
+ assert "unique metrics found" in out
rakam_eval_sdk/tests/test_decorator.py CHANGED
@@ -1,9 +1,12 @@
1
1
  import os
2
+ from pathlib import Path
3
+ from typing import List
2
4
 
3
5
  import pytest
4
6
 
5
7
  from rakam_eval_sdk.client import DeepEvalClient
6
8
  from rakam_eval_sdk.decorators import eval_run
9
+ from rakam_eval_sdk.utils.decorator_utils import find_decorated_functions
7
10
 
8
11
 
9
12
  @pytest.fixture
@@ -80,3 +83,30 @@ def test_eval_run_with_parentheses(capsys: pytest.CaptureFixture[str]) -> None:
80
83
 
81
84
  assert result == 12
82
85
  assert "[eval_run]" in capsys.readouterr().out
86
+
87
+
88
+ def test_find_decorated_functions(tmp_path: Path) -> None:
89
+ code = """
90
+ from rakam_eval_sdk.decorators import eval_run
91
+
92
+ @eval_run
93
+ def foo():
94
+ pass
95
+
96
+ @eval_run()
97
+ def bar():
98
+ pass
99
+
100
+ async def baz():
101
+ pass
102
+
103
+ @other
104
+ def nope():
105
+ pass
106
+ """
107
+ file = tmp_path / "test_mod.py"
108
+ file.write_text(code)
109
+
110
+ result: List[str] = find_decorated_functions(file, "eval_run")
111
+
112
+ assert set(result) == {"foo", "bar"}
@@ -0,0 +1,51 @@
1
+ from types import SimpleNamespace
2
+
3
+ from rakam_eval_sdk.cli import extract_metric_names
4
+
5
+
6
+ def test_extract_metric_names_no_metrics_attr() -> None:
7
+ config = SimpleNamespace()
8
+
9
+ result = extract_metric_names(config)
10
+
11
+ assert result == []
12
+
13
+
14
+ def test_extract_metric_names_empty_metrics() -> None:
15
+ config = SimpleNamespace(metrics=[])
16
+
17
+ result = extract_metric_names(config)
18
+
19
+ assert result == []
20
+
21
+
22
+ def test_extract_metric_names_with_types_only() -> None:
23
+ config = SimpleNamespace(
24
+ metrics=[
25
+ SimpleNamespace(type="accuracy"),
26
+ SimpleNamespace(type="latency"),
27
+ ]
28
+ )
29
+
30
+ result = extract_metric_names(config)
31
+
32
+ assert result == [
33
+ ("accuracy", None),
34
+ ("latency", None),
35
+ ]
36
+
37
+
38
+ def test_extract_metric_names_with_type_and_name() -> None:
39
+ config = SimpleNamespace(
40
+ metrics=[
41
+ SimpleNamespace(type="accuracy", name="acc_v1"),
42
+ SimpleNamespace(type="f1", name=None),
43
+ ]
44
+ )
45
+
46
+ result = extract_metric_names(config)
47
+
48
+ assert result == [
49
+ ("accuracy", "acc_v1"),
50
+ ("f1", None),
51
+ ]
@@ -0,0 +1,30 @@
1
+ from pathlib import Path
2
+ from types import ModuleType
3
+
4
+ import pytest
5
+
6
+ from rakam_eval_sdk.utils.decorator_utils import load_module_from_path
7
+
8
+
9
+ def test_load_module_from_path_success(tmp_path: Path) -> None:
10
+ file = tmp_path / "mod.py"
11
+ file.write_text(
12
+ """
13
+ x = 42
14
+ def hello():
15
+ return "world"
16
+ """
17
+ )
18
+
19
+ module: ModuleType = load_module_from_path(file)
20
+
21
+ assert module.x == 42
22
+ assert module.hello() == "world"
23
+
24
+
25
+ def test_load_module_from_path_invalid(tmp_path: Path) -> None:
26
+ file: Path = tmp_path / "broken.py"
27
+ file.write_text("def foo(") # invalid syntax
28
+
29
+ with pytest.raises(SyntaxError):
30
+ load_module_from_path(file)
rakam_eval_sdk/tests/test_print.py CHANGED
@@ -1,7 +1,8 @@
1
+ from typing import Optional, List, Dict
1
2
  import json
2
3
  from pathlib import Path
3
4
  from types import SimpleNamespace
4
- from typing import Any, Dict, List, Optional
5
+ from typing import Any
5
6
 
6
7
  import pytest
7
8
  import typer
@@ -34,6 +35,7 @@ def test_pct_change(a: Optional[int], b: Optional[int], expected: Optional[str])
34
35
  # metric_direction
35
36
  # -------------------------
36
37
 
38
+
37
39
  @pytest.mark.parametrize(
38
40
  "delta,expected",
39
41
  [
@@ -89,47 +91,54 @@ def test_print_and_save_overwrites(tmp_path: Path) -> None:
89
91
  assert json.loads(out.read_text()) == {"x": 2}
90
92
 
91
93
 
92
- # -------------------------
93
- # summarize
94
- # -------------------------
95
-
96
94
  def make_metric(
97
95
  *,
96
+ metric: str,
98
97
  status: str = "changed",
99
98
  delta: Optional[float] = None,
100
99
  ) -> SimpleNamespace:
101
- return SimpleNamespace(status=status, delta=delta)
100
+ return SimpleNamespace(
101
+ metric=metric,
102
+ status=status,
103
+ delta=delta,
104
+ )
102
105
 
103
106
 
104
- def test_summarize_counts_all_states() -> None:
107
+ def test_summarize_groups_all_states() -> None:
105
108
  metrics: List[SimpleNamespace] = [
106
- make_metric(status="added"),
107
- make_metric(status="removed"),
108
- make_metric(delta=1.0),
109
- make_metric(delta=-1.0),
110
- make_metric(delta=0.0),
111
- make_metric(delta=None),
109
+ make_metric(metric="added_metric", status="added"),
110
+ make_metric(metric="removed_metric", status="removed"),
111
+ make_metric(metric="improved_metric", delta=1.0),
112
+ make_metric(metric="regressed_metric", delta=-1.0),
113
+ make_metric(metric="unchanged_zero", delta=0.0),
114
+ make_metric(metric="unchanged_none", delta=None),
112
115
  ]
113
116
 
114
- summary: Dict[str, int] = summarize(metrics)
117
+ summary: Dict[str, List[str]] = summarize(metrics=metrics)
115
118
 
116
119
  assert summary == {
117
- "improved": 1,
118
- "regressed": 1,
119
- "unchanged": 2,
120
- "added": 1,
121
- "removed": 1,
120
+ "improved": ["improved_metric"],
121
+ "regressed": ["regressed_metric"],
122
+ "unchanged": ["unchanged_zero", "unchanged_none"],
123
+ "added": ["added_metric"],
124
+ "removed": ["removed_metric"],
122
125
  }
123
126
 
124
127
 
125
- # -------------------------
126
- # pretty_print_comparison
127
- # -------------------------
128
+ def test_summarize_empty_metrics() -> None:
129
+ assert summarize(metrics=[]) == {
130
+ "improved": [],
131
+ "regressed": [],
132
+ "unchanged": [],
133
+ "added": [],
134
+ "removed": [],
135
+ }
136
+
128
137
 
129
138
  def test_pretty_print_comparison_summary_only(monkeypatch: pytest.MonkeyPatch) -> None:
130
139
  metrics: List[SimpleNamespace] = [
131
- make_metric(status="added"),
132
- make_metric(delta=1.0),
140
+ make_metric(metric="added_metric", status="added"),
141
+ make_metric(metric="improved_metric", delta=1.0),
133
142
  ]
134
143
  resp: SimpleNamespace = SimpleNamespace(metrics=metrics)
135
144
 
@@ -139,6 +148,7 @@ def test_pretty_print_comparison_summary_only(monkeypatch: pytest.MonkeyPatch) -
139
148
  called["metrics"] = arg
140
149
 
141
150
  import rakam_eval_sdk.utils.print as mod
151
+
142
152
  monkeypatch.setattr(mod, "print_summary", fake_print_summary)
143
153
 
144
154
  pretty_print_comparison(resp, summary_only=True)
@@ -148,8 +158,8 @@ def test_pretty_print_comparison_summary_only(monkeypatch: pytest.MonkeyPatch) -
148
158
 
149
159
  def test_pretty_print_comparison_full(monkeypatch: pytest.MonkeyPatch) -> None:
150
160
  metrics: List[SimpleNamespace] = [
151
- make_metric(status="added"),
152
- make_metric(delta=1.0),
161
+ make_metric(metric="added_metric", status="added"),
162
+ make_metric(metric="improved_metric", delta=1.0),
153
163
  ]
154
164
  resp: SimpleNamespace = SimpleNamespace(metrics=metrics)
155
165
 
@@ -159,6 +169,7 @@ def test_pretty_print_comparison_full(monkeypatch: pytest.MonkeyPatch) -> None:
159
169
  calls.append(metric)
160
170
 
161
171
  import rakam_eval_sdk.utils.print as mod
172
+
162
173
  monkeypatch.setattr(mod, "print_metric_diff", fake_print_metric_diff)
163
174
 
164
175
  pretty_print_comparison(resp, summary_only=False)
rakam_eval_sdk/utils/print.py CHANGED
@@ -1,7 +1,7 @@
1
1
  import json
2
2
  from pathlib import Path
3
3
  from pprint import pprint
4
- from typing import Any, Dict, Optional
4
+ from typing import Any, Dict, List, Optional
5
5
 
6
6
  import typer
7
7
  from typer import secho
@@ -25,8 +25,7 @@ def _print_and_save(
25
25
  return
26
26
 
27
27
  if out.exists() and not overwrite:
28
- typer.echo(
29
- f"❌ File already exists: {out} (use --overwrite to replace)")
28
+ typer.echo(f"❌ File already exists: {out} (use --overwrite to replace)")
30
29
  raise typer.Exit(code=1)
31
30
 
32
31
  out.parent.mkdir(parents=True, exist_ok=True)
@@ -95,34 +94,68 @@ def print_metric_diff(diff: MetricDiff) -> None:
95
94
  secho(f" success: {diff.success_a}", dim=True)
96
95
 
97
96
 
98
- def summarize(metrics: Any) -> Dict[str, int]:
99
- summary = {
100
- "improved": 0,
101
- "regressed": 0,
102
- "unchanged": 0,
103
- "added": 0,
104
- "removed": 0,
97
+ def summarize(metrics: List[MetricDiff]) -> Dict[str, List[str]]:
98
+ """
99
+ Returns metric names grouped by category.
100
+ """
101
+ summary: Dict[str, List[str]] = {
102
+ "improved": [],
103
+ "regressed": [],
104
+ "unchanged": [],
105
+ "added": [],
106
+ "removed": [],
105
107
  }
106
108
 
107
109
  for m in metrics:
108
- if m.status in ("added", "removed"):
109
- summary[m.status] += 1
110
- else:
111
- direction = metric_direction(m.delta)
112
- summary[direction] += 1
110
+ if m.status == "added":
111
+ summary["added"].append(m.metric)
112
+ continue
113
+
114
+ if m.status == "removed":
115
+ summary["removed"].append(m.metric)
116
+ continue
117
+
118
+ direction = metric_direction(m.delta)
119
+ summary[direction].append(m.metric)
113
120
 
114
121
  return summary
115
122
 
116
123
 
117
- def print_summary(metrics: Any) -> None:
124
+ def _fmt(metrics: List[str]) -> str:
125
+ return ", ".join(metrics) if metrics else "-"
126
+
127
+
128
+ def print_summary(metrics: List[MetricDiff]) -> None:
118
129
  summary = summarize(metrics)
119
130
 
131
+ rows = [
132
+ ("↑ Improved", "improved", "green"),
133
+ ("↓ Regressed", "regressed", "red"),
134
+ ("± Unchanged", "unchanged", None),
135
+ ("+ Added.", "added", "green"),
136
+ ("- Removed.", "removed", "red"),
137
+ ]
138
+
120
139
  secho("\nSummary:", bold=True)
121
- secho(f" ↑ Improved: {summary['improved']}", fg="green")
122
- secho(f" Regressed: {summary['regressed']}", fg="red")
123
- secho(f" ± Unchanged: {summary['unchanged']}", dim=True)
124
- secho(f" + Added: {summary['added']}", fg="green")
125
- secho(f" - Removed: {summary['removed']}", fg="red")
140
+ secho(
141
+ " | Status | # | Metrics |",
142
+ dim=True,
143
+ )
144
+ secho(
145
+ " |--------------|---|------------------------|",
146
+ dim=True,
147
+ )
148
+
149
+ for label, key, color in rows:
150
+ count = len(summary[key])
151
+ metrics_str = _fmt(summary[key])
152
+
153
+ line = f" | {label:<12} | {count:<1} | {metrics_str:<22} |"
154
+
155
+ if color:
156
+ secho(line, fg=color)
157
+ else:
158
+ secho(line, dim=True)
126
159
 
127
160
 
128
161
  def pretty_print_comparison(resp: Any, summary_only: bool = False) -> None:
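As the hunk above shows, summarize() now returns metric names grouped by category rather than bare counts, which is what the new tabular print_summary() renders. A minimal sketch of the new return shape, using SimpleNamespace stand-ins for MetricDiff the same way the package's own tests do; the metric names are illustrative.

from types import SimpleNamespace

from rakam_eval_sdk.utils.print import summarize

metrics = [
    SimpleNamespace(metric="accuracy", status="changed", delta=1.0),
    SimpleNamespace(metric="latency", status="changed", delta=-1.0),
    SimpleNamespace(metric="f1", status="added", delta=None),
]

summary = summarize(metrics=metrics)
# {"improved": ["accuracy"], "regressed": ["latency"], "unchanged": [],
#  "added": ["f1"], "removed": []}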
@@ -132,3 +165,105 @@ def pretty_print_comparison(resp: Any, summary_only: bool = False) -> None:
132
165
  return
133
166
 
134
167
  print_summary(resp.metrics)
168
+
169
+
170
+ def serialize_for_diff(obj: dict) -> str:
171
+ """
172
+ Stable, git-friendly JSON representation
173
+ """
174
+ return (
175
+ json.dumps(
176
+ obj,
177
+ indent=4,
178
+ sort_keys=True,
179
+ ensure_ascii=False,
180
+ )
181
+ + "\n"
182
+ )
183
+
184
+
185
+ def git_diff(
186
+ a_text: str,
187
+ b_text: str,
188
+ *,
189
+ label_a: str,
190
+ label_b: str,
191
+ side_by_side: bool = False,
192
+ ) -> None:
193
+ """
194
+ Show diff between two text blobs.
195
+
196
+ - side_by_side: uses git difftool with vimdiff (interactive)
197
+ - fallback to git diff -U3 or difflib if git is not available
198
+ """
199
+ import shutil
200
+ import subprocess
201
+ from pathlib import Path
202
+ from tempfile import TemporaryDirectory
203
+
204
+ import typer
205
+
206
+ git = shutil.which("git")
207
+ vimdiff = shutil.which("vimdiff")
208
+
209
+ with TemporaryDirectory() as tmp:
210
+ a = Path(tmp) / label_a
211
+ b = Path(tmp) / label_b
212
+
213
+ a.write_text(a_text)
214
+ b.write_text(b_text)
215
+
216
+ # --- Side-by-side with vimdiff ---
217
+ if side_by_side:
218
+ if not git:
219
+ typer.secho(
220
+ "❌ Git is required for side-by-side diffs", fg="red", bold=True
221
+ )
222
+ return
223
+ if not vimdiff:
224
+ typer.secho(
225
+ "❌ Vimdiff is not installed. Please install vim or vimdiff to use side-by-side mode.",
226
+ fg="red",
227
+ bold=True,
228
+ )
229
+ return
230
+
231
+ cmd = [
232
+ git,
233
+ "difftool",
234
+ "--no-index",
235
+ "--tool=vimdiff",
236
+ "--no-prompt", # skip Y/n prompt
237
+ str(a),
238
+ str(b),
239
+ ]
240
+
241
+ subprocess.run(cmd, check=False)
242
+ return
243
+
244
+ # (default)
245
+ if git:
246
+ cmd = [
247
+ git,
248
+ "diff",
249
+ "--no-index",
250
+ "--color=always",
251
+ "-U3",
252
+ str(a),
253
+ str(b),
254
+ ]
255
+ subprocess.run(cmd, check=False)
256
+ return
257
+
258
+ # Fallback
259
+ import difflib
260
+
261
+ diff = difflib.unified_diff(
262
+ a_text.splitlines(),
263
+ b_text.splitlines(),
264
+ fromfile=label_a,
265
+ tofile=label_b,
266
+ lineterm="",
267
+ )
268
+ for line in diff:
269
+ typer.echo(line)
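The new serialize_for_diff() and git_diff() helpers added above are generic: the first produces a stable, key-sorted JSON rendering, the second writes both blobs to temporary files and shells out to git diff --no-index when git is on PATH (or git difftool --tool=vimdiff for --side-by-side), falling back to difflib otherwise. A minimal sketch with two hypothetical payloads.

from rakam_eval_sdk.utils.print import git_diff, serialize_for_diff

before = {"metrics": {"accuracy": 0.81, "latency": 120}}  # hypothetical payloads
after = {"metrics": {"accuracy": 0.84, "latency": 110}}

git_diff(
    serialize_for_diff(before),
    serialize_for_diff(after),
    label_a="before.json",
    label_b="after.json",
)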
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: rakam-eval-sdk
3
- Version: 0.2.4rc7
3
+ Version: 0.2.4rc8
4
4
  Summary: Evaluation Framework SDK
5
5
  Author: Mohamed Bachar Touil
6
6
  License: MIT
@@ -0,0 +1,17 @@
1
+ rakam_eval_sdk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ rakam_eval_sdk/cli.py,sha256=mxLb2tJHdoSW33_4N6AoxZrMZVHoA2nDtLe8LBuUu-c,19068
3
+ rakam_eval_sdk/client.py,sha256=8KwpNt7WHgMfcXAQtrAlnUihai9TiADRieiuWeIxg6E,15575
4
+ rakam_eval_sdk/decorators.py,sha256=IOe4r4P8JoWM9thPtb3bJ1EXvGxkjydHbKS0W1fiYSk,1908
5
+ rakam_eval_sdk/schema.py,sha256=ouxiV1ydQXiVn5DjIAYDT4yqsXA0h95bq6c-oSkqxL8,3912
6
+ rakam_eval_sdk/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ rakam_eval_sdk/tests/test_cli.py,sha256=SkysYGjZyZuWkurgu9VmZQxpuDSRvssraYG3zqpnfBo,1366
8
+ rakam_eval_sdk/tests/test_decorator.py,sha256=TMkzWLv-WFhvrEhAmcmF1S-LJ_7ufIawNFVCIEcLQvw,2498
9
+ rakam_eval_sdk/tests/test_metrics.py,sha256=QC7yHdvB-GXIcTVOsHY8ednGBI18OI6FxWvGLUW_VHY,1118
10
+ rakam_eval_sdk/tests/test_modules_loader.py,sha256=s3jUVCUP1d-U4siDElVGjkjYVT6wL2kNbDKI3T_bca0,668
11
+ rakam_eval_sdk/tests/test_print.py,sha256=dBYYUnpv-PLU6mEPTITjCyhijOkKmbKozH6f0xU6JeI,4655
12
+ rakam_eval_sdk/utils/decorator_utils.py,sha256=g0TjXtG9o4hwhUAFP8GJsXAkjhZhzeseTAg-YBFjj2g,1763
13
+ rakam_eval_sdk/utils/print.py,sha256=8MbGryh6TxgaQENWcjnfDc1bbRA0FFvnNcu8jp-KEuQ,6857
14
+ rakam_eval_sdk-0.2.4rc8.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
15
+ rakam_eval_sdk-0.2.4rc8.dist-info/entry_points.txt,sha256=tNhwmM_UGELb3h0zOfgCrtTheUkP-k8jGv0rTOfRSps,56
16
+ rakam_eval_sdk-0.2.4rc8.dist-info/METADATA,sha256=Icpm85_loND_wTYbO1BajqCo9wacFP1GQZf8YpyqiSM,6022
17
+ rakam_eval_sdk-0.2.4rc8.dist-info/RECORD,,
@@ -1,14 +0,0 @@
1
- rakam_eval_sdk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- rakam_eval_sdk/cli.py,sha256=ZFsEXVBlQNThu8dK_JVGpirmY5ppuzE8lubw-XqE1_k,18385
3
- rakam_eval_sdk/client.py,sha256=8KwpNt7WHgMfcXAQtrAlnUihai9TiADRieiuWeIxg6E,15575
4
- rakam_eval_sdk/decorators.py,sha256=IOe4r4P8JoWM9thPtb3bJ1EXvGxkjydHbKS0W1fiYSk,1908
5
- rakam_eval_sdk/schema.py,sha256=ouxiV1ydQXiVn5DjIAYDT4yqsXA0h95bq6c-oSkqxL8,3912
6
- rakam_eval_sdk/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
- rakam_eval_sdk/tests/test_decorator.py,sha256=ppqdhHamyC-rs_RsEjCegsnkV4kLhX-qpG5UicA6D6w,1956
8
- rakam_eval_sdk/tests/test_print.py,sha256=uraR7qrZxnB3BCCUB6l9k1L62-VU7f0hJeArL3ellrk,4148
9
- rakam_eval_sdk/utils/decorator_utils.py,sha256=g0TjXtG9o4hwhUAFP8GJsXAkjhZhzeseTAg-YBFjj2g,1763
10
- rakam_eval_sdk/utils/print.py,sha256=PaeJrgPtLvVBr7uCXA-Q9UNJqrMyky_UMCd7JGFCQ2w,3743
11
- rakam_eval_sdk-0.2.4rc7.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
12
- rakam_eval_sdk-0.2.4rc7.dist-info/entry_points.txt,sha256=tNhwmM_UGELb3h0zOfgCrtTheUkP-k8jGv0rTOfRSps,56
13
- rakam_eval_sdk-0.2.4rc7.dist-info/METADATA,sha256=JkNU5IxCvwkTYOQA8f2yZcMpYYEl0QHOOybylw6--L0,6022
14
- rakam_eval_sdk-0.2.4rc7.dist-info/RECORD,,