rakam-eval-sdk 0.2.4rc6__tar.gz → 0.2.4rc8__tar.gz

This diff shows the changes between two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: rakam-eval-sdk
- Version: 0.2.4rc6
+ Version: 0.2.4rc8
  Summary: Evaluation Framework SDK
  Author: Mohamed Bachar Touil
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "uv_build"

  [project]
  name = "rakam-eval-sdk"
- version = "0.2.4rc6"
+ version = "0.2.4rc8"
  description = "Evaluation Framework SDK"
  readme = "README.md"
  requires-python = ">=3.8"
@@ -25,6 +25,7 @@ dev = [
      "build>=1.2.2.post1",
      "pytest>=8.3.5",
      "twine>=6.1.0",
+     "pytest-cov>=5.0.0",
  ]

  [tool.isort]
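
The only dependency change in this release is pytest-cov being added to the dev extras; the new test modules further down are presumably run with it. A minimal sketch of invoking the suite with coverage (the coverage target and the tests/ path are assumptions, not taken from the package):

    # sketch: run the new test suite with coverage via pytest-cov
    # (the package target and test path below are assumed, not from the diff)
    import pytest

    raise SystemExit(
        pytest.main(
            [
                "--cov=rakam_eval_sdk",       # measure coverage for the SDK package
                "--cov-report=term-missing",  # list uncovered lines in the terminal
                "tests/",                     # assumed location of the new test files
            ]
        )
    )
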
@@ -5,23 +5,27 @@ import sys
  import uuid
  from datetime import datetime
  from pathlib import Path
- from pprint import pprint
- from typing import Any, Dict, List, Optional, Tuple, Union, Set
+ from typing import Any, Dict, List, Optional, Set, Tuple

  import typer
  from dotenv import load_dotenv
  from rich.console import Console
  from rich.panel import Panel
  from rich.pretty import Pretty
- from typer import secho

  from rakam_eval_sdk.client import DeepEvalClient
  from rakam_eval_sdk.decorators import eval_run
- from rakam_eval_sdk.schema import MetricDiff, TestCaseComparison
+ from rakam_eval_sdk.schema import TestCaseComparison
  from rakam_eval_sdk.utils.decorator_utils import (
      find_decorated_functions,
      load_module_from_path,
  )
+ from rakam_eval_sdk.utils.print import (
+     _print_and_save,
+     git_diff,
+     pretty_print_comparison,
+     serialize_for_diff,
+ )

  load_dotenv()
  app = typer.Typer(help="CLI tools for evaluation utilities")
@@ -31,10 +35,10 @@ console = Console()
  PROJECT_ROOT = os.path.abspath(".")
  if PROJECT_ROOT not in sys.path:
      sys.path.insert(0, PROJECT_ROOT)
- list_app = typer.Typer(help="List resources")
- app.add_typer(list_app, name="list")
+ list_app = typer.Typer(help="List evaluations or runs")
  metrics_app = typer.Typer(help="Metrics utilities")
- app.add_typer(metrics_app, name="metrics")
+
+ # Sub-apps are registered at the end to control command order


  def extract_metric_names(config: Any) -> List[Tuple[str, Optional[str]]]:
@@ -138,7 +142,7 @@ def list_evals(
      ),
  ) -> None:
      """
-     Find functions decorated with @track.
+     List evaluations (functions decorated with @eval_run).
      """
      TARGET_DECORATOR = eval_run.__name__
      files = directory.rglob("*.py") if recursive else directory.glob("*.py")
@@ -152,16 +156,16 @@ def list_evals(
              typer.echo(f"{file}:{fn}")

      if not found:
-         typer.echo(f"No @{TARGET_DECORATOR} functions found.")
+         typer.echo("No evaluations found.")


  @list_app.command("runs")
  def list_runs(
-     limit: int = typer.Option(20, help="Max number of runs"),
+     limit: int = typer.Option(20, "-l", "--limit", help="Max number of runs"),
      offset: int = typer.Option(0, help="Pagination offset"),
  ) -> None:
      """
-     List evaluation runs (newest first).
+     List runs (newest first).
      """
      client = DeepEvalClient()

@@ -175,7 +179,7 @@ def list_runs(
      total = response.get("total", 0)

      if not items:
-         typer.echo("No evaluation runs found.")
+         typer.echo("No runs found.")
          return

      typer.echo(f"[id] {'tag':<20}{'label':<20}created_at")
@@ -204,13 +208,15 @@ def list_runs(

  @app.command()
  def show(
-     id: Optional[int] = typer.Option(
+     run_id: Optional[int] = typer.Option(
          None,
+         "-i",
          "--id",
          help="Run ID",
      ),
      tag: Optional[str] = typer.Option(
          None,
+         "-t",
          "--tag",
          help="Run tag",
      ),
@@ -221,19 +227,20 @@ def show(
      ),
  ) -> None:
      """
-     Show a single evaluation testcase by ID or tag.
+     Show a run by ID or tag.
      """
-     if not id and not tag:
+
+     if not run_id and not tag:
          raise typer.BadParameter("You must provide either --id or --tag")

-     if id and tag:
+     if run_id and tag:
          raise typer.BadParameter("Provide only one of --id or --tag")

      client = DeepEvalClient()

-     if id:
-         result = client.get_evaluation_testcase_by_id(id)
-         identifier = f"id={id}"
+     if run_id:
+         result = client.get_evaluation_testcase_by_id(run_id)
+         identifier = f"run_id={run_id}"
      else:
          assert tag is not None
          result = client.get_evaluation_testcase_by_tag(tag)
@@ -266,7 +273,7 @@ def show(
      console.print(
          Panel.fit(
              Pretty(result),
-             title="Evaluation TestCase",
+             title="Run",
              subtitle=identifier,
          )
      )
@@ -312,7 +319,7 @@ def run(
      save_runs: bool = typer.Option(
          False,
          "--save-runs",
-         help="Save each evaluation run result to a JSON file",
+         help="Save each run result to a JSON file",
      ),
      output_dir: Path = typer.Option(
          Path("./eval_runs"),
@@ -321,7 +328,7 @@ def run(
      ),
  ) -> None:
      """
-     Find and execute all functions decorated with @eval_run.
+     Execute evaluations (functions decorated with @eval_run).
      """
      files = directory.rglob("*.py") if recursive else directory.glob("*.py")
      TARGET_DECORATOR = eval_run.__name__
@@ -401,248 +408,141 @@ def run(
              typer.echo(f" ❌ Execution failed: {e}")

      if not executed_any and not dry_run:
-         typer.echo("\nNo @eval_run functions executed.")
-
-
- def _print_and_save(
-     resp: dict,
-     pretty: bool,
-     out: Optional[Path],
-     overwrite: bool,
- ) -> None:
-     if pretty:
-         typer.echo(typer.style("📊 Result:", bold=True))
-         pprint(resp)
-     else:
-         typer.echo(resp)
-
-     if out is None:
-         return
-
-     if out.exists() and not overwrite:
-         typer.echo(f"❌ File already exists: {out} (use --overwrite to replace)")
-         raise typer.Exit(code=1)
+         typer.echo("\nNo evaluations executed.")

-     out.parent.mkdir(parents=True, exist_ok=True)

-     with out.open("w", encoding="utf-8") as f:
-         json.dump(resp, f, indent=2, ensure_ascii=False)
-
-     typer.echo(f"💾 Result saved to {out}")
-
-
- def pct_change(a: Optional[float], b: Optional[float]) -> Optional[str]:
-     if a is None or b is None or a == 0:
-         return None
-     return f"{((b - a) / a) * 100:+.2f}%"
-
-
- def metric_direction(delta: Optional[float]) -> str:
-     if delta is None:
-         return "unchanged"
-     if delta > 0:
-         return "improved"
-     if delta < 0:
-         return "regressed"
-     return "unchanged"
-
-
- def print_metric_diff(diff: MetricDiff) -> None:
-     secho(f"\nMetric: {diff.metric}", bold=True)
-
-     if diff.status == "added":
-         secho(f"+ score: {diff.score_b}", fg="green")
-         secho(f"+ threshold: {diff.threshold_b}", fg="green")
-         secho(f"+ success: {diff.success_b}", fg="green")
-         return
-
-     if diff.status == "removed":
-         secho(f"- score: {diff.score_a}", fg="red")
-         secho(f"- threshold: {diff.threshold_a}", fg="red")
-         secho(f"- success: {diff.success_a}", fg="red")
-         return
-
-     # unchanged / changed
-     if diff.score_a != diff.score_b:
-         direction = metric_direction(diff.delta)
-         color = "green" if direction == "improved" else "red"
-         pct = pct_change(diff.score_a, diff.score_b)
-
-         secho(f"- score: {diff.score_a}", fg="red")
-         secho(
-             f"+ score: {diff.score_b}" + (f" ({pct})" if pct else ""),
-             fg=color,
-         )
-     else:
-         secho(f" score: {diff.score_a}", dim=True)
-
-     if diff.threshold_a != diff.threshold_b:
-         secho(f"- threshold: {diff.threshold_a}", fg="red")
-         secho(f"+ threshold: {diff.threshold_b}", fg="green")
-     else:
-         secho(f" threshold: {diff.threshold_a}", dim=True)
-
-     if diff.success_a != diff.success_b:
-         secho(f"- success: {diff.success_a}", fg="red")
-         secho(f"+ success: {diff.success_b}", fg="green")
+ def fetch_run(
+     client: DeepEvalClient,
+     *,
+     run_id: Optional[int],
+     tag: Optional[str],
+ ) -> Tuple[dict, str]:
+     """
+     Fetch a single run by id or tag.
+     Returns (payload, identifier)
+     """
+     if run_id is not None:
+         result = client.get_evaluation_testcase_by_id(run_id)
+         identifier = f"run_id={run_id}"
      else:
-         secho(f" success: {diff.success_a}", dim=True)
-
-
- def summarize(metrics: Any) -> Dict[str, int]:
-     summary = {
-         "improved": 0,
-         "regressed": 0,
-         "unchanged": 0,
-         "added": 0,
-         "removed": 0,
-     }
-
-     for m in metrics:
-         if m.status in ("added", "removed"):
-             summary[m.status] += 1
-         else:
-             direction = metric_direction(m.delta)
-             summary[direction] += 1
-
-     return summary
-
-
- def pretty_print_comparison(resp: Any, summary_only: bool = False) -> None:
-     if not summary_only:
-         for metric in resp.metrics:
-             print_metric_diff(metric)
-         return
-
-     print_summary(resp.metrics)
-
+         assert tag is not None
+         result = client.get_evaluation_testcase_by_tag(tag)
+         identifier = f"tag={tag}"

- def print_summary(metrics: Any) -> None:
-     summary = summarize(metrics)
+     if not result:
+         raise RuntimeError(f"No data returned for {identifier}")

-     secho("\nSummary:", bold=True)
-     secho(f" ↑ Improved: {summary['improved']}", fg="green")
-     secho(f" ↓ Regressed: {summary['regressed']}", fg="red")
-     secho(f" ± Unchanged: {summary['unchanged']}", dim=True)
-     secho(f" + Added: {summary['added']}", fg="green")
-     secho(f" - Removed: {summary['removed']}", fg="red")
+     return result, identifier


  @app.command()
  def compare(
      tag: List[str] = typer.Option(
          [],
+         "-t",
          "--tag",
-         help="Label identifying a reference testcase",
+         help="Run tag",
      ),
-     run: List[int] = typer.Option(
+     run_id: List[int] = typer.Option(
          [],
-         "--run",
-         help="Run ID identifying an evaluation testcase",
-     ),
-     pretty: bool = typer.Option(
-         True,
-         "--pretty/--raw",
-         help="Pretty diff output (default) or raw JSON",
+         "-i",
+         "--id",
+         help="Run ID",
      ),
      summary: bool = typer.Option(
          False,
          "--summary",
-         help="Show summary only (no per-metric diff)",
+         help="Show summary diff only",
      ),
-     raise_exception: bool = typer.Option(
-         False,
-         "--raise",
-         help="Raise HTTP exceptions instead of swallowing them",
-     ),
-     out: Optional[Path] = typer.Option(
-         None,
-         "-o",
-         "--out",
-         help="Optional file path to save the result as JSON",
-     ),
-     overwrite: bool = typer.Option(
+     side_by_side: bool = typer.Option(
          False,
-         "--overwrite",
-         help="Overwrite output file if it already exists",
+         "--side-by-side",
+         help="Show side-by-side diff (git)",
      ),
  ) -> None:
      """
-     Compare two evaluation testcases (runs and/or labels).
+     Compare two evaluation runs.

-     Output:
-     - Unified diff-style view by default
-     - Raw JSON with --raw
-     - Summary of improvements / regressions
+     Default: unified git diff
      """

-     targets: List[Tuple[str, Union[str, int]]] = []
+     if summary and side_by_side:
+         typer.secho(
+             "❌ --summary and --side-by-side cannot be used together",
+             fg="red",
+             bold=True,
+         )
+         raise typer.Exit(code=1)

-     for r in run:
-         targets.append(("run", r))
+     targets: List[Tuple[Optional[int], Optional[str]]] = []

+     for r in run_id:
+         targets.append(("run", r))
      for t in tag:
-         targets.append(("label", t))
+         targets.append(("tag", t))

      if len(targets) != 2:
          typer.secho(
-             "❌ Provide exactly two targets using --run and/or --tag",
+             "❌ Provide exactly two targets using --id and/or --tag",
              fg="red",
              bold=True,
          )
          raise typer.Exit(code=1)

-     (type_a, value_a), (type_b, value_b) = targets
-
-     typer.secho(
-         f"🔍 Comparing {type_a} '{value_a}' ↔ {type_b} '{value_b}'",
-         bold=True,
-     )
-
      client = DeepEvalClient()
+     # Summary mode (reduced payload)
+     (type_a, value_a), (type_b, value_b) = targets
+     if summary:
+         kwargs: Dict[str, Any] = {"raise_exception": True}
+         if type_a == "run":
+             kwargs["testcase_a_id"] = value_a
+         else:
+             kwargs["testcase_a_tag"] = value_a

-     kwargs: Dict[str, Any] = {"raise_exception": raise_exception}
-
-     if type_a == "run":
-         kwargs["testcase_a_id"] = value_a
-     else:
-         kwargs["testcase_a_tag"] = value_a
+         if type_b == "run":
+             kwargs["testcase_b_id"] = value_b
+         else:
+             kwargs["testcase_b_tag"] = value_b
+         try:
+             resp = client.compare_testcases(**kwargs)
+         except Exception as e:
+             typer.secho(f"❌ Request failed: {e}", fg="red")
+             raise typer.Exit(code=1)

-     if type_b == "run":
-         kwargs["testcase_b_id"] = value_b
-     else:
-         kwargs["testcase_b_tag"] = value_b
+         if not resp:
+             typer.secho("⚠️ No response received", fg="yellow")
+             raise typer.Exit(code=1)
+         comparison = TestCaseComparison(**resp)
+         pretty_print_comparison(
+             comparison,
+             summary_only=summary,
+         )
+         return

      try:
-         resp = client.compare_testcases(**kwargs)
+         run_a, id_a = fetch_run(
+             client,
+             run_id=value_a if type_a == "run" else None,
+             tag=value_a if type_a == "tag" else None,
+         )
+         run_b, id_b = fetch_run(
+             client,
+             run_id=value_b if type_b == "run" else None,
+             tag=value_b if type_b == "tag" else None,
+         )
      except Exception as e:
-         typer.secho(f"❌ Request failed: {e}", fg="red")
+         typer.secho(f"❌ Fetch failed: {e}", fg="red")
          raise typer.Exit(code=1)

-     if not resp:
-         typer.secho("⚠️ No response received", fg="yellow")
-         raise typer.Exit(code=1)
+     a_text = serialize_for_diff(run_a)
+     b_text = serialize_for_diff(run_b)

-     if out:
-         if out.exists() and not overwrite:
-             typer.secho(
-                 f"❌ File already exists: {out} (use --overwrite)",
-                 fg="red",
-             )
-             raise typer.Exit(code=1)
-
-         out.write_text(json.dumps(resp, indent=2))
-         typer.secho(f"💾 Saved raw output to {out}", fg="green")
-
-     if not pretty:
-         typer.echo(json.dumps(resp, indent=2))
-         return
-
-     comparison = TestCaseComparison(**resp)
-
-     pretty_print_comparison(
-         comparison,
-         summary_only=summary,
+     git_diff(
+         a_text,
+         b_text,
+         label_a=f"{id_a}.full.json",
+         label_b=f"{id_b}.full.json",
+         side_by_side=side_by_side,
      )


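The hunk above replaces the raw-JSON/--out workflow of compare with two fetched runs rendered as a git-style diff (or a summary table with --summary). A short sketch of driving the reworked command through Typer's test runner; the run ID and tag values are invented for illustration, and both calls still go through DeepEvalClient, so a reachable evaluation backend is assumed:

    # sketch: exercising the reworked `compare` command (ID and tag values are invented)
    from typer.testing import CliRunner

    from rakam_eval_sdk.cli import app

    runner = CliRunner()

    # default: unified git diff of the two serialized runs
    result = runner.invoke(app, ["compare", "--id", "12", "--tag", "baseline"])
    print(result.output)

    # summary-only comparison of the same pair
    result = runner.invoke(app, ["compare", "--id", "12", "--tag", "baseline", "--summary"])
    print(result.output)
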
@@ -679,7 +579,7 @@ def compare_label_latest(
      ),
  ) -> None:
      """
-     Compare the latest evaluation runs for two labels.
+     Compare the latest runs for two labels.
      """
      client = DeepEvalClient()

@@ -757,11 +657,13 @@ def compare_last(
  def tag_command(
      run_id: Optional[int] = typer.Option(
          None,
-         "--run",
-         help="Evaluation run ID",
+         "-i",
+         "--id",
+         help="Run ID",
      ),
      tag: Optional[str] = typer.Option(
          None,
+         "-t",
          "--tag",
          help="Tag to assign to the run",
      ),
@@ -778,11 +680,11 @@ def tag_command(
      # --- validation ---
      if delete:
          if run_id or tag:
-             typer.echo("❌ --delete cannot be used with --run or --tag")
+             typer.echo("❌ --delete cannot be used with --id or --tag")
              raise typer.Exit(code=1)
      else:
          if not run_id or not tag:
-             typer.echo("❌ Use --run and --tag together, or --delete")
+             typer.echo("❌ Use --id and --tag together, or --delete")
              raise typer.Exit(code=1)

      client = DeepEvalClient()
@@ -812,6 +714,11 @@ def tag_command(
      typer.echo(f"Tag: {result.get('tag')}")


+ # Register sub-apps in user journey order (after regular commands)
+ app.add_typer(list_app, name="list")
+ app.add_typer(metrics_app, name="metrics")
+
+
  def main() -> None:
      app()

@@ -0,0 +1,58 @@
+ from pathlib import Path
+ from types import SimpleNamespace
+
+ import pytest
+ import typer
+
+ from rakam_eval_sdk.cli import metrics
+
+
+ def test_metrics_no_metrics_found(
+     tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+ ) -> None:
+     file = tmp_path / "eval.py"
+     file.write_text("def foo(): pass")
+
+     monkeypatch.setattr(
+         "rakam_eval_sdk.cli.find_decorated_functions",
+         lambda *_: [],
+     )
+
+     with pytest.raises(typer.Exit) as exc:
+         metrics(directory=tmp_path, recursive=False)
+
+     assert exc.value.exit_code == 0
+
+
+ def test_metrics_finds_metrics(
+     tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys
+ ) -> None:
+     file = tmp_path / "eval.py"
+     file.write_text("")
+
+     monkeypatch.setattr(
+         "rakam_eval_sdk.cli.find_decorated_functions",
+         lambda *_: ["run_eval"],
+     )
+
+     fake_module = SimpleNamespace(
+         run_eval=lambda: SimpleNamespace(
+             metrics=[
+                 SimpleNamespace(type="accuracy", name=None),
+                 SimpleNamespace(type="f1", name="f1_v2"),
+             ]
+         )
+     )
+
+     monkeypatch.setattr(
+         "rakam_eval_sdk.cli.load_module_from_path",
+         lambda _: fake_module,
+     )
+
+     metrics(directory=tmp_path, recursive=False)
+
+     out = capsys.readouterr().out
+
+     assert "accuracy" in out
+     assert "f1" in out
+     assert "unique metrics found" in out
@@ -0,0 +1,112 @@
+ import os
+ from pathlib import Path
+ from typing import List
+
+ import pytest
+
+ from rakam_eval_sdk.client import DeepEvalClient
+ from rakam_eval_sdk.decorators import eval_run
+ from rakam_eval_sdk.utils.decorator_utils import find_decorated_functions
+
+
+ @pytest.fixture
+ def client() -> DeepEvalClient:
+     return DeepEvalClient(base_url="http://testserver", api_token="testtoken")
+
+
+ class FakeCPUTimes:
+     def __init__(self, user: float = 1.0, system: float = 1.0) -> None:
+         self.user: float = user
+         self.system: float = system
+
+
+ class FakeMemInfo:
+     def __init__(self, rss: int) -> None:
+         self.rss: int = rss
+
+
+ class FakeProcess:
+     def __init__(self) -> None:
+         self._cpu_calls: int = 0
+         self._mem_calls: int = 0
+
+     def cpu_times(self) -> FakeCPUTimes:
+         self._cpu_calls += 1
+         # simulate CPU usage increase
+         return FakeCPUTimes(
+             user=1.0 + self._cpu_calls,
+             system=1.0,
+         )
+
+     def memory_info(self) -> FakeMemInfo:
+         self._mem_calls += 1
+         return FakeMemInfo(rss=100_000_000 + (self._mem_calls * 10_000))
+
+
+ @pytest.fixture(autouse=True)
+ def patch_psutil(
+     monkeypatch: pytest.MonkeyPatch,
+ ) -> FakeProcess:
+     fake_process: FakeProcess = FakeProcess()
+
+     monkeypatch.setattr(
+         "rakam_eval_sdk.decorators.psutil.Process",
+         lambda pid: fake_process,
+     )
+     monkeypatch.setattr(os, "getpid", lambda: 123)
+     return fake_process
+
+
+ def test_eval_run_basic(capsys: pytest.CaptureFixture[str]) -> None:
+     @eval_run
+     def add(a: int, b: int) -> int:
+         return a + b
+
+     result: int = add(2, 3)
+
+     assert result == 5
+
+     out: str = capsys.readouterr().out
+     assert "[eval_run]" in out
+     assert "add" in out
+     assert "time=" in out
+     assert "cpu=" in out
+     assert "mem_delta=" in out
+
+
+ def test_eval_run_with_parentheses(capsys: pytest.CaptureFixture[str]) -> None:
+     @eval_run()
+     def mul(a: int, b: int) -> int:
+         return a * b
+
+     result: int = mul(3, 4)
+
+     assert result == 12
+     assert "[eval_run]" in capsys.readouterr().out
+
+
+ def test_find_decorated_functions(tmp_path: Path) -> None:
+     code = """
+ from rakam_eval_sdk.decorators import eval_run
+
+ @eval_run
+ def foo():
+     pass
+
+ @eval_run()
+ def bar():
+     pass
+
+ async def baz():
+     pass
+
+ @other
+ def nope():
+     pass
+ """
+     file = tmp_path / "test_mod.py"
+     file.write_text(code)
+
+     result: List[str] = find_decorated_functions(file, "eval_run")
+
+     assert set(result) == {"foo", "bar"}
@@ -0,0 +1,51 @@
+ from types import SimpleNamespace
+
+ from rakam_eval_sdk.cli import extract_metric_names
+
+
+ def test_extract_metric_names_no_metrics_attr() -> None:
+     config = SimpleNamespace()
+
+     result = extract_metric_names(config)
+
+     assert result == []
+
+
+ def test_extract_metric_names_empty_metrics() -> None:
+     config = SimpleNamespace(metrics=[])
+
+     result = extract_metric_names(config)
+
+     assert result == []
+
+
+ def test_extract_metric_names_with_types_only() -> None:
+     config = SimpleNamespace(
+         metrics=[
+             SimpleNamespace(type="accuracy"),
+             SimpleNamespace(type="latency"),
+         ]
+     )
+
+     result = extract_metric_names(config)
+
+     assert result == [
+         ("accuracy", None),
+         ("latency", None),
+     ]
+
+
+ def test_extract_metric_names_with_type_and_name() -> None:
+     config = SimpleNamespace(
+         metrics=[
+             SimpleNamespace(type="accuracy", name="acc_v1"),
+             SimpleNamespace(type="f1", name=None),
+         ]
+     )
+
+     result = extract_metric_names(config)
+
+     assert result == [
+         ("accuracy", "acc_v1"),
+         ("f1", None),
+     ]
@@ -0,0 +1,30 @@
+ from pathlib import Path
+ from types import ModuleType
+
+ import pytest
+
+ from rakam_eval_sdk.utils.decorator_utils import load_module_from_path
+
+
+ def test_load_module_from_path_success(tmp_path: Path) -> None:
+     file = tmp_path / "mod.py"
+     file.write_text(
+         """
+ x = 42
+ def hello():
+     return "world"
+ """
+     )
+
+     module: ModuleType = load_module_from_path(file)
+
+     assert module.x == 42
+     assert module.hello() == "world"
+
+
+ def test_load_module_from_path_invalid(tmp_path: Path) -> None:
+     file: Path = tmp_path / "broken.py"
+     file.write_text("def foo(") # invalid syntax
+
+     with pytest.raises(SyntaxError):
+         load_module_from_path(file)
@@ -0,0 +1,177 @@
+ from typing import Optional, List, Dict
+ import json
+ from pathlib import Path
+ from types import SimpleNamespace
+ from typing import Any
+
+ import pytest
+ import typer
+
+ from rakam_eval_sdk.utils.print import (
+     _print_and_save,
+     metric_direction,
+     pct_change,
+     pretty_print_comparison,
+     summarize,
+ )
+
+
+ @pytest.mark.parametrize(
+     "a,b,expected",
+     [
+         (10, 20, "+100.00%"),
+         (20, 10, "-50.00%"),
+         (10, 10, "+0.00%"),
+         (None, 10, None),
+         (10, None, None),
+         (0, 10, None),
+     ],
+ )
+ def test_pct_change(a: Optional[int], b: Optional[int], expected: Optional[str]):
+     assert pct_change(a, b) == expected
+
+
+ # -------------------------
+ # metric_direction
+ # -------------------------
+
+
+ @pytest.mark.parametrize(
+     "delta,expected",
+     [
+         (1.0, "improved"),
+         (-1.0, "regressed"),
+         (0.0, "unchanged"),
+         (None, "unchanged"),
+     ],
+ )
+ def test_metric_direction(delta: Optional[float], expected: str):
+     assert metric_direction(delta) == expected
+
+
+ def test_print_and_save_no_output_path(capsys: pytest.CaptureFixture[str]) -> None:
+     resp: Dict[str, int] = {"a": 1}
+
+     _print_and_save(resp, pretty=False, out=None, overwrite=False)
+
+     captured = capsys.readouterr()
+     assert str(resp) in captured.out
+
+
+ def test_print_and_save_creates_file(
+     tmp_path: Path,
+     capsys: pytest.CaptureFixture[str],
+ ) -> None:
+     out: Path = tmp_path / "result.json"
+     resp: Dict[str, str] = {"hello": "world"}
+
+     _print_and_save(resp, pretty=False, out=out, overwrite=False)
+
+     assert out.exists()
+     assert json.loads(out.read_text()) == resp
+
+     captured = capsys.readouterr()
+     assert "Result saved" in captured.out
+
+
+ def test_print_and_save_refuses_overwrite(tmp_path: Path) -> None:
+     out: Path = tmp_path / "result.json"
+     out.write_text("{}")
+
+     with pytest.raises(typer.Exit):
+         _print_and_save({"x": 1}, pretty=False, out=out, overwrite=False)
+
+
+ def test_print_and_save_overwrites(tmp_path: Path) -> None:
+     out: Path = tmp_path / "result.json"
+     out.write_text("{}")
+
+     _print_and_save({"x": 2}, pretty=False, out=out, overwrite=True)
+
+     assert json.loads(out.read_text()) == {"x": 2}
+
+
+ def make_metric(
+     *,
+     metric: str,
+     status: str = "changed",
+     delta: Optional[float] = None,
+ ) -> SimpleNamespace:
+     return SimpleNamespace(
+         metric=metric,
+         status=status,
+         delta=delta,
+     )
+
+
+ def test_summarize_groups_all_states() -> None:
+     metrics: List[SimpleNamespace] = [
+         make_metric(metric="added_metric", status="added"),
+         make_metric(metric="removed_metric", status="removed"),
+         make_metric(metric="improved_metric", delta=1.0),
+         make_metric(metric="regressed_metric", delta=-1.0),
+         make_metric(metric="unchanged_zero", delta=0.0),
+         make_metric(metric="unchanged_none", delta=None),
+     ]
+
+     summary: Dict[str, List[str]] = summarize(metrics=metrics)
+
+     assert summary == {
+         "improved": ["improved_metric"],
+         "regressed": ["regressed_metric"],
+         "unchanged": ["unchanged_zero", "unchanged_none"],
+         "added": ["added_metric"],
+         "removed": ["removed_metric"],
+     }
+
+
+ def test_summarize_empty_metrics() -> None:
+     assert summarize(metrics=[]) == {
+         "improved": [],
+         "regressed": [],
+         "unchanged": [],
+         "added": [],
+         "removed": [],
+     }
+
+
+ def test_pretty_print_comparison_summary_only(monkeypatch: pytest.MonkeyPatch) -> None:
+     metrics: List[SimpleNamespace] = [
+         make_metric(metric="added_metric", status="added"),
+         make_metric(metric="improved_metric", delta=1.0),
+     ]
+     resp: SimpleNamespace = SimpleNamespace(metrics=metrics)
+
+     called: Dict[str, Any] = {}
+
+     def fake_print_summary(arg: Any) -> None:
+         called["metrics"] = arg
+
+     import rakam_eval_sdk.utils.print as mod
+
+     monkeypatch.setattr(mod, "print_summary", fake_print_summary)
+
+     pretty_print_comparison(resp, summary_only=True)
+
+     assert called["metrics"] == metrics
+
+
+ def test_pretty_print_comparison_full(monkeypatch: pytest.MonkeyPatch) -> None:
+     metrics: List[SimpleNamespace] = [
+         make_metric(metric="added_metric", status="added"),
+         make_metric(metric="improved_metric", delta=1.0),
+     ]
+     resp: SimpleNamespace = SimpleNamespace(metrics=metrics)
+
+     calls: List[Any] = []
+
+     def fake_print_metric_diff(metric: Any) -> None:
+         calls.append(metric)
+
+     import rakam_eval_sdk.utils.print as mod
+
+     monkeypatch.setattr(mod, "print_metric_diff", fake_print_metric_diff)
+
+     pretty_print_comparison(resp, summary_only=False)
+
+     assert calls == metrics
@@ -0,0 +1,269 @@
+ import json
+ from pathlib import Path
+ from pprint import pprint
+ from typing import Any, Dict, List, Optional
+
+ import typer
+ from typer import secho
+
+ from rakam_eval_sdk.schema import MetricDiff
+
+
+ def _print_and_save(
+     resp: dict,
+     pretty: bool,
+     out: Optional[Path],
+     overwrite: bool,
+ ) -> None:
+     if pretty:
+         typer.echo(typer.style("📊 Result:", bold=True))
+         pprint(resp)
+     else:
+         typer.echo(resp)
+
+     if out is None:
+         return
+
+     if out.exists() and not overwrite:
+         typer.echo(f"❌ File already exists: {out} (use --overwrite to replace)")
+         raise typer.Exit(code=1)
+
+     out.parent.mkdir(parents=True, exist_ok=True)
+
+     with out.open("w", encoding="utf-8") as f:
+         json.dump(resp, f, indent=2, ensure_ascii=False)
+
+     typer.echo(f"💾 Result saved to {out}")
+
+
+ def pct_change(a: Optional[float], b: Optional[float]) -> Optional[str]:
+     if a is None or b is None or a == 0:
+         return None
+     return f"{((b - a) / a) * 100:+.2f}%"
+
+
+ def metric_direction(delta: Optional[float]) -> str:
+     if delta is None:
+         return "unchanged"
+     if delta > 0:
+         return "improved"
+     if delta < 0:
+         return "regressed"
+     return "unchanged"
+
+
+ def print_metric_diff(diff: MetricDiff) -> None:
+     secho(f"\nMetric: {diff.metric}", bold=True)
+
+     if diff.status == "added":
+         secho(f"+ score: {diff.score_b}", fg="green")
+         secho(f"+ threshold: {diff.threshold_b}", fg="green")
+         secho(f"+ success: {diff.success_b}", fg="green")
+         return
+
+     if diff.status == "removed":
+         secho(f"- score: {diff.score_a}", fg="red")
+         secho(f"- threshold: {diff.threshold_a}", fg="red")
+         secho(f"- success: {diff.success_a}", fg="red")
+         return
+
+     # unchanged / changed
+     if diff.score_a != diff.score_b:
+         direction = metric_direction(diff.delta)
+         color = "green" if direction == "improved" else "red"
+         pct = pct_change(diff.score_a, diff.score_b)
+
+         secho(f"- score: {diff.score_a}", fg="red")
+         secho(
+             f"+ score: {diff.score_b}" + (f" ({pct})" if pct else ""),
+             fg=color,
+         )
+     else:
+         secho(f" score: {diff.score_a}", dim=True)
+
+     if diff.threshold_a != diff.threshold_b:
+         secho(f"- threshold: {diff.threshold_a}", fg="red")
+         secho(f"+ threshold: {diff.threshold_b}", fg="green")
+     else:
+         secho(f" threshold: {diff.threshold_a}", dim=True)
+
+     if diff.success_a != diff.success_b:
+         secho(f"- success: {diff.success_a}", fg="red")
+         secho(f"+ success: {diff.success_b}", fg="green")
+     else:
+         secho(f" success: {diff.success_a}", dim=True)
+
+
+ def summarize(metrics: List[MetricDiff]) -> Dict[str, List[str]]:
+     """
+     Returns metric names grouped by category.
+     """
+     summary: Dict[str, List[str]] = {
+         "improved": [],
+         "regressed": [],
+         "unchanged": [],
+         "added": [],
+         "removed": [],
+     }
+
+     for m in metrics:
+         if m.status == "added":
+             summary["added"].append(m.metric)
+             continue
+
+         if m.status == "removed":
+             summary["removed"].append(m.metric)
+             continue
+
+         direction = metric_direction(m.delta)
+         summary[direction].append(m.metric)
+
+     return summary
+
+
+ def _fmt(metrics: List[str]) -> str:
+     return ", ".join(metrics) if metrics else "-"
+
+
+ def print_summary(metrics: List[MetricDiff]) -> None:
+     summary = summarize(metrics)
+
+     rows = [
+         ("↑ Improved", "improved", "green"),
+         ("↓ Regressed", "regressed", "red"),
+         ("± Unchanged", "unchanged", None),
+         ("+ Added.", "added", "green"),
+         ("- Removed.", "removed", "red"),
+     ]
+
+     secho("\nSummary:", bold=True)
+     secho(
+         " | Status | # | Metrics |",
+         dim=True,
+     )
+     secho(
+         " |--------------|---|------------------------|",
+         dim=True,
+     )
+
+     for label, key, color in rows:
+         count = len(summary[key])
+         metrics_str = _fmt(summary[key])
+
+         line = f" | {label:<12} | {count:<1} | {metrics_str:<22} |"
+
+         if color:
+             secho(line, fg=color)
+         else:
+             secho(line, dim=True)
+
+
+ def pretty_print_comparison(resp: Any, summary_only: bool = False) -> None:
+     if not summary_only:
+         for metric in resp.metrics:
+             print_metric_diff(metric)
+         return
+
+     print_summary(resp.metrics)
+
+
+ def serialize_for_diff(obj: dict) -> str:
+     """
+     Stable, git-friendly JSON representation
+     """
+     return (
+         json.dumps(
+             obj,
+             indent=4,
+             sort_keys=True,
+             ensure_ascii=False,
+         )
+         + "\n"
+     )
+
+
+ def git_diff(
+     a_text: str,
+     b_text: str,
+     *,
+     label_a: str,
+     label_b: str,
+     side_by_side: bool = False,
+ ) -> None:
+     """
+     Show diff between two text blobs.
+
+     - side_by_side: uses git difftool with vimdiff (interactive)
+     - fallback to git diff -U3 or difflib if git is not available
+     """
+     import shutil
+     import subprocess
+     from pathlib import Path
+     from tempfile import TemporaryDirectory
+
+     import typer
+
+     git = shutil.which("git")
+     vimdiff = shutil.which("vimdiff")
+
+     with TemporaryDirectory() as tmp:
+         a = Path(tmp) / label_a
+         b = Path(tmp) / label_b
+
+         a.write_text(a_text)
+         b.write_text(b_text)
+
+         # --- Side-by-side with vimdiff ---
+         if side_by_side:
+             if not git:
+                 typer.secho(
+                     "❌ Git is required for side-by-side diffs", fg="red", bold=True
+                 )
+                 return
+             if not vimdiff:
+                 typer.secho(
+                     "❌ Vimdiff is not installed. Please install vim or vimdiff to use side-by-side mode.",
+                     fg="red",
+                     bold=True,
+                 )
+                 return
+
+             cmd = [
+                 git,
+                 "difftool",
+                 "--no-index",
+                 "--tool=vimdiff",
+                 "--no-prompt", # skip Y/n prompt
+                 str(a),
+                 str(b),
+             ]
+
+             subprocess.run(cmd, check=False)
+             return
+
+         # (default)
+         if git:
+             cmd = [
+                 git,
+                 "diff",
+                 "--no-index",
+                 "--color=always",
+                 "-U3",
+                 str(a),
+                 str(b),
+             ]
+             subprocess.run(cmd, check=False)
+             return
+
+         # Fallback
+         import difflib
+
+         diff = difflib.unified_diff(
+             a_text.splitlines(),
+             b_text.splitlines(),
+             fromfile=label_a,
+             tofile=label_b,
+             lineterm="",
+         )
+         for line in diff:
+             typer.echo(line)
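
The helpers in this new module are usable on their own; a minimal sketch of diffing two payloads directly with serialize_for_diff and git_diff, where the sample payloads are invented:

    # sketch: diffing two arbitrary run payloads with the new helpers (sample data is invented)
    from rakam_eval_sdk.utils.print import git_diff, serialize_for_diff

    run_a = {"tag": "baseline", "metrics": {"accuracy": 0.91}}
    run_b = {"tag": "candidate", "metrics": {"accuracy": 0.94}}

    git_diff(
        serialize_for_diff(run_a),   # sorted, indented JSON keeps the diff stable
        serialize_for_diff(run_b),
        label_a="baseline.full.json",
        label_b="candidate.full.json",
        side_by_side=False,          # True opens `git difftool --tool=vimdiff` instead
    )

Because serialize_for_diff sorts keys and appends a trailing newline, the same payload always serializes to identical text, which keeps the git diff output free of spurious changes.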