rakam-eval-sdk 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rakam_eval_sdk/cli.py +306 -87
- rakam_eval_sdk/client.py +85 -50
- rakam_eval_sdk/schema.py +21 -0
- {rakam_eval_sdk-0.2.2.dist-info → rakam_eval_sdk-0.2.4.dist-info}/METADATA +1 -1
- rakam_eval_sdk-0.2.4.dist-info/RECORD +10 -0
- rakam_eval_sdk-0.2.2.dist-info/RECORD +0 -10
- {rakam_eval_sdk-0.2.2.dist-info → rakam_eval_sdk-0.2.4.dist-info}/WHEEL +0 -0
- {rakam_eval_sdk-0.2.2.dist-info → rakam_eval_sdk-0.2.4.dist-info}/entry_points.txt +0 -0
rakam_eval_sdk/cli.py
CHANGED
@@ -6,16 +6,18 @@ import uuid
 from datetime import datetime
 from pathlib import Path
 from pprint import pprint
-from typing import Any, Dict, Optional
+from typing import Any, Dict, Optional

 import typer
 from dotenv import load_dotenv
 from rich.console import Console
 from rich.panel import Panel
 from rich.pretty import Pretty
+from typer import secho

 from rakam_eval_sdk.client import DeepEvalClient
 from rakam_eval_sdk.decorators import eval_run
+from rakam_eval_sdk.schema import MetricDiff, TestCaseComparison
 from rakam_eval_sdk.utils.decorator_utils import (
     find_decorated_functions,
     load_module_from_path,
@@ -35,55 +37,88 @@ metrics_app = typer.Typer(help="Metrics utilities")
 app.add_typer(metrics_app, name="metrics")


+def extract_metric_names(config: Any) -> list[tuple[str, str | None]]:
+    """
+    Returns [(type, name)] from EvalConfig / SchemaEvalConfig
+    """
+    if not hasattr(config, "metrics"):
+        return []
+
+    results: list[tuple[str, str | None]] = []
+
+    for metric in config.metrics or []:
+        metric_type = getattr(metric, "type", None)
+        metric_name = getattr(metric, "name", None)
+        if metric_type:
+            results.append((metric_type, metric_name))
+
+    return results
+
+
 @metrics_app.command("list")
-def
-
-
-
-
+def metrics(
+    directory: Path = typer.Argument(
+        Path("./eval"),
+        exists=True,
+        file_okay=False,
+        dir_okay=True,
+        help="Directory to scan (default: ./eval)",
     ),
-
+    recursive: bool = typer.Option(
+        False,
+        "-r",
+        "--recursive",
+        help="Recursively search for Python files",
+    ),
+) -> None:
     """
-    List
+    List all metric types used by loaded eval configs.
     """
-
+    files = directory.rglob("*.py") if recursive else directory.glob("*.py")
+    TARGET_DECORATOR = eval_run.__name__

-
-
-        offset=0,
-        raise_exception=True,
-    )
+    all_metrics: set[tuple[str, str | None]] = set()
+    found_any = False

-
-
-
+    for file in sorted(files):
+        functions = find_decorated_functions(file, TARGET_DECORATOR)
+        if not functions:
+            continue

-
+        typer.echo(f"\n📄 {file}")

-
-
-
-
-
-            name = metric.get("name")
-            if name:
-                metric_names.add(name)
+        try:
+            module = load_module_from_path(file)
+        except Exception as e:
+            typer.echo(f" ❌ Failed to import module: {e}")
+            continue

-
+        for fn_name in functions:
+            try:
+                func = getattr(module, fn_name)
+                result = func()

-
+                metrics = extract_metric_names(result)
+                if not metrics:
+                    continue

-
-
-
+                found_any = True
+                for metric_type, metric_name in metrics:
+                    all_metrics.add((metric_type, metric_name))

-
-
-
-
+                    if metric_name:
+                        typer.echo(f" • {metric_type} (alias: {metric_name})")
+                    else:
+                        typer.echo(f" • {metric_type}")
+
+            except Exception as e:
+                typer.echo(f" ❌ Failed to inspect {fn_name}: {e}")
+
+    if not found_any:
+        typer.echo("\nNo metrics found.")
+        raise typer.Exit(code=0)

-
-    typer.echo(f"- {name}")
+    typer.echo(f"\n✅ {len(all_metrics)} unique metrics found")


 @list_app.command("evals")
@@ -130,19 +165,22 @@ def list_runs(
     """
     client = DeepEvalClient()

-
+    response = client.list_evaluation_testcases(
         limit=limit,
         offset=offset,
         raise_exception=True,
     )

-
+    items = response.get("items", [])
+    total = response.get("total", 0)
+
+    if not items:
         typer.echo("No evaluation runs found.")
         return

-    typer.echo(f"[id]
-
-    for run in
+    typer.echo(f"[id] {'tag':<20}{'label':<20}created_at")
+
+    for run in items:
         run_id = run.get("id")
         label = run.get("label") or "-"
         uid = run.get("tag") or "-"
@@ -156,8 +194,12 @@ def list_runs(
         except ValueError:
             pass

-        typer.echo(
-
+        typer.echo(f"[{run_id}] {uid:<20} {label:<20} {created_at}")
+
+    shown = offset + len(items)
+    if shown < total:
+        typer.echo()
+        typer.echo(f"Showing {shown} of {total} runs. Use --limit to see more.")


 @app.command()
@@ -165,12 +207,12 @@ def show(
     id: Optional[int] = typer.Option(
         None,
         "--id",
-        help="
+        help="Run ID",
     ),
     tag: Optional[str] = typer.Option(
         None,
         "--tag",
-        help="
+        help="Run tag",
     ),
     raw: bool = typer.Option(
         False,
@@ -377,8 +419,7 @@ def _print_and_save(
         return

     if out.exists() and not overwrite:
-        typer.echo(
-            f"❌ File already exists: {out} (use --overwrite to replace)")
+        typer.echo(f"❌ File already exists: {out} (use --overwrite to replace)")
         raise typer.Exit(code=1)

     out.parent.mkdir(parents=True, exist_ok=True)
@@ -389,20 +430,124 @@ def _print_and_save(
     typer.echo(f"💾 Result saved to {out}")


+def pct_change(a: float | None, b: float | None) -> str | None:
+    if a is None or b is None or a == 0:
+        return None
+    return f"{((b - a) / a) * 100:+.2f}%"
+
+
+def metric_direction(delta: float | None) -> str:
+    if delta is None:
+        return "unchanged"
+    if delta > 0:
+        return "improved"
+    if delta < 0:
+        return "regressed"
+    return "unchanged"
+
+
+def print_metric_diff(diff: MetricDiff):
+    secho(f"\nMetric: {diff.metric}", bold=True)
+
+    if diff.status == "added":
+        secho(f"+ score: {diff.score_b}", fg="green")
+        secho(f"+ threshold: {diff.threshold_b}", fg="green")
+        secho(f"+ success: {diff.success_b}", fg="green")
+        return
+
+    if diff.status == "removed":
+        secho(f"- score: {diff.score_a}", fg="red")
+        secho(f"- threshold: {diff.threshold_a}", fg="red")
+        secho(f"- success: {diff.success_a}", fg="red")
+        return
+
+    # unchanged / changed
+    if diff.score_a != diff.score_b:
+        direction = metric_direction(diff.delta)
+        color = "green" if direction == "improved" else "red"
+        pct = pct_change(diff.score_a, diff.score_b)
+
+        secho(f"- score: {diff.score_a}", fg="red")
+        secho(
+            f"+ score: {diff.score_b}" + (f" ({pct})" if pct else ""),
+            fg=color,
+        )
+    else:
+        secho(f" score: {diff.score_a}", dim=True)
+
+    if diff.threshold_a != diff.threshold_b:
+        secho(f"- threshold: {diff.threshold_a}", fg="red")
+        secho(f"+ threshold: {diff.threshold_b}", fg="green")
+    else:
+        secho(f" threshold: {diff.threshold_a}", dim=True)
+
+    if diff.success_a != diff.success_b:
+        secho(f"- success: {diff.success_a}", fg="red")
+        secho(f"+ success: {diff.success_b}", fg="green")
+    else:
+        secho(f" success: {diff.success_a}", dim=True)
+
+
+def summarize(metrics: Any) -> Dict[str, int]:
+    summary = {
+        "improved": 0,
+        "regressed": 0,
+        "unchanged": 0,
+        "added": 0,
+        "removed": 0,
+    }
+
+    for m in metrics:
+        if m.status in ("added", "removed"):
+            summary[m.status] += 1
+        else:
+            direction = metric_direction(m.delta)
+            summary[direction] += 1
+
+    return summary
+
+
+def pretty_print_comparison(resp: Any, summary_only: bool = False):
+    if not summary_only:
+        for metric in resp.metrics:
+            print_metric_diff(metric)
+        return
+
+    print_summary(resp.metrics)
+
+
+def print_summary(metrics: Any):
+    summary = summarize(metrics)
+
+    secho("\nSummary:", bold=True)
+    secho(f" ↑ Improved: {summary['improved']}", fg="green")
+    secho(f" ↓ Regressed: {summary['regressed']}", fg="red")
+    secho(f" ± Unchanged: {summary['unchanged']}", dim=True)
+    secho(f" + Added: {summary['added']}", fg="green")
+    secho(f" - Removed: {summary['removed']}", fg="red")
+
+
 @app.command()
 def compare(
-
-
-
+    tag: list[str] = typer.Option(
+        [],
+        "--tag",
+        help="Label identifying a reference testcase",
     ),
-
-
-
+    run: list[int] = typer.Option(
+        [],
+        "--run",
+        help="Run ID identifying an evaluation testcase",
     ),
     pretty: bool = typer.Option(
         True,
         "--pretty/--raw",
-        help="Pretty
+        help="Pretty diff output (default) or raw JSON",
+    ),
+    summary: bool = typer.Option(
+        False,
+        "--summary",
+        help="Show summary only (no per-metric diff)",
     ),
     raise_exception: bool = typer.Option(
         False,
@@ -422,29 +567,85 @@ def compare(
     ),
 ) -> None:
     """
-    Compare two
+    Compare two evaluation testcases (runs and/or labels).
+
+    Output:
+    - Unified diff-style view by default
+    - Raw JSON with --raw
+    - Summary of improvements / regressions
     """
+
+    targets: list[tuple[str, str | int]] = []
+
+    for r in run:
+        targets.append(("run", r))
+
+    for t in tag:
+        targets.append(("label", t))
+
+    if len(targets) != 2:
+        typer.secho(
+            "❌ Provide exactly two targets using --run and/or --tag",
+            fg="red",
+            bold=True,
+        )
+        raise typer.Exit(code=1)
+
+    (type_a, value_a), (type_b, value_b) = targets
+
+    typer.secho(
+        f"🔍 Comparing {type_a} '{value_a}' ↔ {type_b} '{value_b}'",
+        bold=True,
+    )
+
     client = DeepEvalClient()

-
+    kwargs = {"raise_exception": raise_exception}
+
+    if type_a == "run":
+        kwargs["testcase_a_id"] = value_a
+    else:
+        kwargs["testcase_a_tag"] = value_a
+
+    if type_b == "run":
+        kwargs["testcase_b_id"] = value_b
+    else:
+        kwargs["testcase_b_tag"] = value_b

     try:
-        resp = client.compare_testcases(
-            testcase_a_id=testcase_a_id,
-            testcase_b_id=testcase_b_id,
-            raise_exception=raise_exception,
-        )
+        resp = client.compare_testcases(**kwargs)
     except Exception as e:
-        typer.
+        typer.secho(f"❌ Request failed: {e}", fg="red")
         raise typer.Exit(code=1)

     if not resp:
-        typer.
+        typer.secho("⚠️ No response received", fg="yellow")
         raise typer.Exit(code=1)
-    _print_and_save(resp, pretty, out, overwrite)

+    if out:
+        if out.exists() and not overwrite:
+            typer.secho(
+                f"❌ File already exists: {out} (use --overwrite)",
+                fg="red",
+            )
+            raise typer.Exit(code=1)

-
+        out.write_text(json.dumps(resp, indent=2))
+        typer.secho(f"💾 Saved raw output to {out}", fg="green")
+
+    if not pretty:
+        typer.echo(json.dumps(resp, indent=2))
+        return
+
+    comparison = TestCaseComparison(**resp)
+
+    pretty_print_comparison(
+        comparison,
+        summary_only=summary,
+    )
+
+
+@app.command(hidden=True)
 def compare_label_latest(
     label_a: str = typer.Argument(
         ...,
@@ -500,7 +701,7 @@ def compare_label_latest(
     _print_and_save(resp, pretty, out, overwrite)


-@app.command()
+@app.command(hidden=True)
 def compare_last(
     label: str = typer.Argument(
         ...,
@@ -551,44 +752,62 @@ def compare_last(
     _print_and_save(resp, pretty, out, overwrite)


-@
-def
-    run_id: int = typer.
+@app.command("tag")
+def tag_command(
+    run_id: Optional[int] = typer.Option(
+        None,
+        "--run",
+        help="Evaluation run ID",
+    ),
     tag: Optional[str] = typer.Option(
         None,
         "--tag",
-        "
-        help="Tag to add or update",
+        help="Tag to assign to the run",
     ),
-
-
-        "--
-        help="
+    delete: Optional[str] = typer.Option(
+        None,
+        "--delete",
+        help="Delete a tag",
     ),
 ):
     """
-
+    Assign a tag to a run or delete a tag.
     """
-    if not tag and not remove:
-        typer.echo("❌ You must provide --tag or --remove")
-        raise typer.Exit(code=1)

-
-
-
+    # --- validation ---
+    if delete:
+        if run_id or tag:
+            typer.echo("❌ --delete cannot be used with --run or --tag")
+            raise typer.Exit(code=1)
+    else:
+        if not run_id or not tag:
+            typer.echo("❌ Use --run and --tag together, or --delete")
+            raise typer.Exit(code=1)

     client = DeepEvalClient()

+    # --- delete mode ---
+    if delete:
+        result = client.update_evaluation_testcase_tag(
+            testcase_id=run_id,
+            tag=delete,
+            raise_exception=True,
+        )
+        typer.echo("🗑️ Tag deleted successfully")
+        typer.echo(f"Tag: {delete}")
+        return
+
+    # --- assign/update mode ---
     result = client.update_evaluation_testcase_tag(
         testcase_id=run_id,
-        tag=
+        tag=tag,
         raise_exception=True,
     )

-
-    typer.echo(f"✅ Tag {action} successfully")
+    typer.echo("✅ Tag assigned successfully")
     typer.echo(f"Run ID: {run_id}")
-    typer.echo(f"Tag: {result.get('tag')
+    typer.echo(f"Tag: {result.get('tag')}")
+

 def main() -> None:
     app()
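
The diff above adds a compare command driven by repeatable --run/--tag options, a tag command for assigning or deleting tags, and a metrics list scanner. A minimal sketch of how these commands might be exercised from Python via Typer's test runner follows; the run IDs, tag names, and the ./eval directory are placeholders, and a configured base URL plus EVALFRAMEWORK_API_KEY are assumed.

# Hedged usage sketch for the new CLI commands (placeholder IDs, tags, and paths).
from typer.testing import CliRunner

from rakam_eval_sdk.cli import app

runner = CliRunner()

# Compare run 42 against the testcase tagged "baseline", printing only the summary block.
result = runner.invoke(app, ["compare", "--run", "42", "--tag", "baseline", "--summary"])
print(result.output)

# Assign a tag to a run, then delete that tag again.
runner.invoke(app, ["tag", "--run", "42", "--tag", "nightly"])
runner.invoke(app, ["tag", "--delete", "nightly"])

# List metric types used by @eval_run-decorated configs under ./eval (the directory must exist).
runner.invoke(app, ["metrics", "list", "./eval", "--recursive"])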
rakam_eval_sdk/client.py
CHANGED
@@ -1,7 +1,6 @@
-from typing import Optional, Dict
 import os
 import random
-from typing import Any, Dict, List, Optional, Union, cast, overload
+from typing import Any, Dict, List, Literal, Optional, Union, cast, overload

 import requests

@@ -13,7 +12,6 @@ from .schema import (
     SchemaMetricConfig,
     TextInputItem,
 )
-from typing import Optional, Literal, cast

 HTTPMethod = Literal["GET", "POST", "PATCH", "PUT", "DELETE"]

@@ -42,8 +40,7 @@ class DeepEvalClient:
         )
         self.base_url = raw_url.rstrip("/")
         self.api_token = (
-            api_token or settings_token or os.getenv(
-                "EVALFRAMEWORK_API_KEY", "")
+            api_token or settings_token or os.getenv("EVALFRAMEWORK_API_KEY", "")
         )
         self.timeout = timeout

@@ -52,10 +49,10 @@ class DeepEvalClient:
         method: HTTPMethod,
         endpoint: str,
         *,
-        json:
-        params:
+        json: Dict | None = None,
+        params: Dict | None = None,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         url = f"{self.base_url}{endpoint}"

         headers = {
@@ -92,14 +89,17 @@ class DeepEvalClient:
                 "raw": resp.text,
             }

-    def _get(self, endpoint: str, params:
-        return self._request("GET", endpoint, params=params, **kw)
+    def _get(self, endpoint: str, params: Dict, *args, **kw):
+        return self._request("GET", endpoint, params=params, *args, **kw)
+
+    def _post(self, endpoint: str, payload: Dict, *args, **kw):
+        return self._request("POST", endpoint, json=payload, *args, **kw)

-    def
-        return self._request("
+    def _patch(self, endpoint: str, payload: Dict, *args, **kw):
+        return self._request("PATCH", endpoint, json=payload, *args, **kw)

-    def
-        return self._request("
+    def _delete(self, endpoint: str, payload: Dict, *args, **kw):
+        return self._request("DELETE", endpoint, json=payload, *args, **kw)

     def update_evaluation_testcase_tag(
         self,
@@ -114,8 +114,14 @@ class DeepEvalClient:
         - tag="smoke" → add / update tag
         - tag=None → remove tag
         """
+        if testcase_id is None:
+            return self._delete(
+                endpoint=f"/deepeval/tag/{tag}",
+                payload={},
+                raise_exception=raise_exception,
+            )
         return self._patch(
-            f"/
+            endpoint=f"/deepeval/{testcase_id}/tag",
             payload={"tag": tag},
             raise_exception=raise_exception,
         )
@@ -126,13 +132,14 @@ class DeepEvalClient:
         limit: int = 10,
         offset: int = 0,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """
         List evaluation testcases for the current API token only.
         Sorted by created_at DESC (newest first).
+        Returns items + pagination metadata.
         """
         return self._get(
-            "/eval-framework/deepeval/evaluation-testcases/token",
+            endpoint="/eval-framework/deepeval/evaluation-testcases/token",
             params={
                 "limit": limit,
                 "offset": offset,
@@ -150,7 +157,7 @@ class DeepEvalClient:
         Fetch a single evaluation testcase by numeric ID.
         """
         return self._get(
-            f"/eval-framework/deepeval/id/{testcase_id}",
+            endpoint=f"/eval-framework/deepeval/id/{testcase_id}",
             params={},
             raise_exception=raise_exception,
         )
@@ -165,7 +172,7 @@ class DeepEvalClient:
         Fetch a single evaluation testcase by tag.
         """
         return self._get(
-            f"/eval-framework/deepeval/tag/{tag}",
+            endpoint=f"/eval-framework/deepeval/tag/{tag}",
             params={},
             raise_exception=raise_exception,
         )
@@ -190,19 +197,39 @@ class DeepEvalClient:
     def compare_testcases(
         self,
         *,
-        testcase_a_id: int,
-
+        testcase_a_id: int | None = None,
+        testcase_a_tag: str | None = None,
+        testcase_b_id: int | None = None,
+        testcase_b_tag: str | None = None,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """
-        Compare two evaluation testcases.
+        Compare two evaluation testcases using IDs or tags.
+        Exactly one identifier (id or tag) must be provided per testcase.
         """
+
+        def validate(id_, tag, name: str):
+            if bool(id_) == bool(tag):
+                raise ValueError(f"Provide exactly one of {name}_id or {name}_tag")
+
+        validate(testcase_a_id, testcase_a_tag, "testcase_a")
+        validate(testcase_b_id, testcase_b_tag, "testcase_b")
+
+        params: dict[str, int | str] = {}
+
+        if testcase_a_id is not None:
+            params["testcase_a_id"] = testcase_a_id
+        else:
+            params["testcase_a_tag"] = testcase_a_tag  # type: ignore
+
+        if testcase_b_id is not None:
+            params["testcase_b_id"] = testcase_b_id
+        else:
+            params["testcase_b_tag"] = testcase_b_tag  # type: ignore
+
         return self._get(
-            "/eval-framework/deepeval/evaluation-testcases/compare",
-            params=
-                "testcase_a_id": testcase_a_id,
-                "testcase_b_id": testcase_b_id,
-            },
+            endpoint="/eval-framework/deepeval/evaluation-testcases/compare",
+            params=params,
             raise_exception=raise_exception,
         )

@@ -212,12 +239,12 @@ class DeepEvalClient:
         label_a: str,
         label_b: str,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """
         Compare the latest evaluation testcases for two labels.
         """
         return self._get(
-            "/eval-framework/deepeval/evaluation-testcases/compare-latest",
+            endpoint="/eval-framework/deepeval/evaluation-testcases/compare-latest",
             params={
                 "label_a": label_a,
                 "label_b": label_b,
@@ -230,12 +257,12 @@ class DeepEvalClient:
         *,
         label: str,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """
         Compare the last two evaluation testcases for a given label.
         """
         return self._get(
-            "/eval-framework/deepeval/evaluation-testcases/compare-last-two",
+            endpoint="/eval-framework/deepeval/evaluation-testcases/compare-last-two",
             params={
                 "label": label,
             },
@@ -248,7 +275,7 @@ class DeepEvalClient:
         config: EvalConfig,
         *,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]: ...

     @overload
     def text_eval(
@@ -259,7 +286,7 @@ class DeepEvalClient:
         component: str = "unknown",
         label: str | None = None,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]: ...

     def text_eval(
         self,
@@ -270,7 +297,7 @@ class DeepEvalClient:
         component: str = "unknown",
         label: str | None = None,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         if config is None:
             config = EvalConfig(
                 data=data,
@@ -280,7 +307,9 @@ class DeepEvalClient:
             )

         return self._post(
-            "/deepeval/text-eval",
+            endpoint="/deepeval/text-eval",
+            payload=config.model_dump(),
+            raise_exception=raise_exception,
         )

     def text_eval_background(
@@ -290,12 +319,16 @@ class DeepEvalClient:
         raise_exception: bool = False,
         component: str = "unknown",
         label: Union[str, None] = None,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """Run background text evaluation (async job)."""
         payload = EvalConfig.model_construct(
             data=data, metrics=metrics, component=component, version=label
         ).model_dump()
-        return self._post(
+        return self._post(
+            endpoint="/deepeval/text-eval/background",
+            payload=payload,
+            raise_exception=raise_exception,
+        )

     @overload
     def schema_eval(
@@ -306,7 +339,7 @@ class DeepEvalClient:
         component: str = "unknown",
         label: str | None = None,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]: ...

     @overload
     def schema_eval(
@@ -314,7 +347,7 @@ class DeepEvalClient:
         config: SchemaEvalConfig,
         *,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]: ...

     def schema_eval(
         self,
@@ -325,7 +358,7 @@ class DeepEvalClient:
         component: str = "unknown",
         label: str | None = None,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         if config is None:
             if data is None or metrics is None:
                 raise ValueError(
@@ -340,9 +373,9 @@ class DeepEvalClient:
             )

         return self._post(
-            "/deepeval/schema-eval",
-            config.model_dump(),
-            raise_exception,
+            endpoint="/deepeval/schema-eval",
+            payload=config.model_dump(),
+            raise_exception=raise_exception,
         )

     def schema_eval_background(
@@ -352,13 +385,15 @@ class DeepEvalClient:
         raise_exception: bool = False,
         component: str = "unknown",
         label: Union[str, None] = None,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """Run background schema evaluation (async job)."""
         payload = SchemaEvalConfig.model_construct(
             data=data, metrics=metrics, component=component, version=label
         ).model_dump()
         return self._post(
-            "/deepeval/schema-eval/background",
+            endpoint="/deepeval/schema-eval/background",
+            payload=payload,
+            raise_exception=raise_exception,
         )

     def maybe_text_eval(
@@ -369,7 +404,7 @@ class DeepEvalClient:
         raise_exception: bool = False,
         component: str = "unknown",
         label: Union[str, None] = None,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """Randomly run text_eval based on a probability between 0 and 1."""
         self._validate_chance(chance)
         return (
@@ -392,7 +427,7 @@ class DeepEvalClient:
         raise_exception: bool = False,
         component: str = "unknown",
         label: Union[str, None] = None,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """Randomly run text_eval_background based on a probability between 0 and 1."""
         self._validate_chance(chance)
         return (
@@ -411,7 +446,7 @@ class DeepEvalClient:
         raise_exception: bool = False,
         component: str = "unknown",
         label: Union[str, None] = None,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """Randomly run schema_eval based on a probability between 0 and 1."""
         self._validate_chance(chance)
         return (
@@ -434,7 +469,7 @@ class DeepEvalClient:
         raise_exception: bool = False,
         component: str = "unknown",
         label: Union[str, None] = None,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """Randomly run text_eval_background based on a probability between 0 and 1."""
         self._validate_chance(chance)
         return (
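
For reference, a short sketch of the updated DeepEvalClient surface follows, using the keyword arguments shown in the diff above; the numeric IDs and tag names are invented, and a configured base URL plus EVALFRAMEWORK_API_KEY are assumed.

# Hedged sketch of the 0.2.4 client API; the IDs and tags below are placeholders.
from rakam_eval_sdk.client import DeepEvalClient

client = DeepEvalClient()

# list_evaluation_testcases now returns items plus pagination metadata.
page = client.list_evaluation_testcases(limit=5, offset=0, raise_exception=True)
print(page["total"], [run["id"] for run in page["items"]])

# compare_testcases accepts IDs and/or tags; exactly one identifier per testcase.
diff = client.compare_testcases(
    testcase_a_id=42,
    testcase_b_tag="baseline",
    raise_exception=True,
)

# Tag a run; per the new code path, passing testcase_id=None deletes the tag instead.
client.update_evaluation_testcase_tag(testcase_id=42, tag="nightly", raise_exception=True)
client.update_evaluation_testcase_tag(testcase_id=None, tag="nightly", raise_exception=True)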
rakam_eval_sdk/schema.py
CHANGED
@@ -129,3 +129,24 @@ class SchemaEvalConfig(BaseModel):
     label: Union[str, None] = None
     data: List[SchemaInputItem]
     metrics: List[SchemaMetricConfig] = Field(default_factory=list)
+
+
+class MetricDiff(BaseModel):
+    metric: str
+    score_a: Optional[float]
+    score_b: Optional[float]
+    delta: Optional[float]
+
+    success_a: Optional[bool]
+    success_b: Optional[bool]
+
+    threshold_a: Optional[float]
+    threshold_b: Optional[float]
+
+    status: str  # "unchanged" | "changed" | "added" | "removed"
+
+
+class TestCaseComparison(BaseModel):
+    testcase_a_id: int
+    testcase_b_id: int
+    metrics: List[MetricDiff]
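
A small illustration of the new comparison models is given below; every ID, score, threshold, and metric name is fabricated, and the pretty printer import relies on the cli.py helpers added in this release.

# Hedged example of the new schema models; all values are made up.
from rakam_eval_sdk.cli import pretty_print_comparison
from rakam_eval_sdk.schema import MetricDiff, TestCaseComparison

comparison = TestCaseComparison(
    testcase_a_id=41,
    testcase_b_id=42,
    metrics=[
        MetricDiff(
            metric="answer_relevancy",
            score_a=0.71, score_b=0.78, delta=0.07,
            success_a=True, success_b=True,
            threshold_a=0.7, threshold_b=0.7,
            status="changed",
        ),
        MetricDiff(
            metric="faithfulness",
            score_a=None, score_b=0.9, delta=None,
            success_a=None, success_b=True,
            threshold_a=None, threshold_b=0.8,
            status="added",
        ),
    ],
)

# Render the same summary block the `compare --summary` command prints.
pretty_print_comparison(comparison, summary_only=True)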
rakam_eval_sdk-0.2.4.dist-info/RECORD
ADDED
@@ -0,0 +1,10 @@
+rakam_eval_sdk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+rakam_eval_sdk/cli.py,sha256=IvC45LyALCCoxZ7ZNifVCbPs6UveGtUCfjr1q9e1A_Y,21658
+rakam_eval_sdk/client.py,sha256=7spK7P3h5uiA3Q3erOHOwf04N1_RT1pysWyt_E-cz0Q,14666
+rakam_eval_sdk/decorators.py,sha256=_9VFQmoYWd6cqnNryZJWEwYHQRxY7vIOam4z45zBk3c,1794
+rakam_eval_sdk/schema.py,sha256=YfZgHtPh41xFYdQTsbS0OSvNdI8yIv5zoh76v21hemo,3971
+rakam_eval_sdk/utils/decorator_utils.py,sha256=g0TjXtG9o4hwhUAFP8GJsXAkjhZhzeseTAg-YBFjj2g,1763
+rakam_eval_sdk-0.2.4.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
+rakam_eval_sdk-0.2.4.dist-info/entry_points.txt,sha256=tNhwmM_UGELb3h0zOfgCrtTheUkP-k8jGv0rTOfRSps,56
+rakam_eval_sdk-0.2.4.dist-info/METADATA,sha256=XOTmtgO7NiG8NuLn4B_yJujUJONSJCvwGQ9ak3zIhv4,6019
+rakam_eval_sdk-0.2.4.dist-info/RECORD,,
rakam_eval_sdk-0.2.2.dist-info/RECORD
DELETED
@@ -1,10 +0,0 @@
-rakam_eval_sdk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-rakam_eval_sdk/cli.py,sha256=LQBaTJGpXGRMNPkkqw_NzK4d1dbF8p4A4_8SN_XALgI,15500
-rakam_eval_sdk/client.py,sha256=e15d7ycm8G7oJuX_Ie9cB39TK33taMjFvVwc_4kLMww,13255
-rakam_eval_sdk/decorators.py,sha256=_9VFQmoYWd6cqnNryZJWEwYHQRxY7vIOam4z45zBk3c,1794
-rakam_eval_sdk/schema.py,sha256=woXu5GT1wuP_d7G11idopwpGmk0nV1SrxGShgWk-RA8,3530
-rakam_eval_sdk/utils/decorator_utils.py,sha256=g0TjXtG9o4hwhUAFP8GJsXAkjhZhzeseTAg-YBFjj2g,1763
-rakam_eval_sdk-0.2.2.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
-rakam_eval_sdk-0.2.2.dist-info/entry_points.txt,sha256=tNhwmM_UGELb3h0zOfgCrtTheUkP-k8jGv0rTOfRSps,56
-rakam_eval_sdk-0.2.2.dist-info/METADATA,sha256=2KBfQDA4cIq634ZFZ0z--FtOtty_PG_T8Pgh8a9CLJk,6019
-rakam_eval_sdk-0.2.2.dist-info/RECORD,,
{rakam_eval_sdk-0.2.2.dist-info → rakam_eval_sdk-0.2.4.dist-info}/WHEEL
File without changes
{rakam_eval_sdk-0.2.2.dist-info → rakam_eval_sdk-0.2.4.dist-info}/entry_points.txt
File without changes