rakam-eval-sdk 0.2.1__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {rakam_eval_sdk-0.2.1 → rakam_eval_sdk-0.2.3}/PKG-INFO +1 -1
- {rakam_eval_sdk-0.2.1 → rakam_eval_sdk-0.2.3}/pyproject.toml +1 -1
- {rakam_eval_sdk-0.2.1 → rakam_eval_sdk-0.2.3}/src/rakam_eval_sdk/cli.py +191 -69
- {rakam_eval_sdk-0.2.1 → rakam_eval_sdk-0.2.3}/src/rakam_eval_sdk/client.py +106 -66
- {rakam_eval_sdk-0.2.1 → rakam_eval_sdk-0.2.3}/src/rakam_eval_sdk/schema.py +0 -2
- {rakam_eval_sdk-0.2.1 → rakam_eval_sdk-0.2.3}/README.md +0 -0
- {rakam_eval_sdk-0.2.1 → rakam_eval_sdk-0.2.3}/src/rakam_eval_sdk/__init__.py +0 -0
- {rakam_eval_sdk-0.2.1 → rakam_eval_sdk-0.2.3}/src/rakam_eval_sdk/decorators.py +0 -0
- {rakam_eval_sdk-0.2.1 → rakam_eval_sdk-0.2.3}/src/rakam_eval_sdk/utils/decorator_utils.py +0 -0
{rakam_eval_sdk-0.2.1 → rakam_eval_sdk-0.2.3}/src/rakam_eval_sdk/cli.py

@@ -35,59 +35,92 @@ metrics_app = typer.Typer(help="Metrics utilities")
 app.add_typer(metrics_app, name="metrics")


+def extract_metric_names(config: Any) -> list[tuple[str, str | None]]:
+    """
+    Returns [(type, name)] from EvalConfig / SchemaEvalConfig
+    """
+    if not hasattr(config, "metrics"):
+        return []
+
+    results: list[tuple[str, str | None]] = []
+
+    for metric in config.metrics or []:
+        metric_type = getattr(metric, "type", None)
+        metric_name = getattr(metric, "name", None)
+        if metric_type:
+            results.append((metric_type, metric_name))
+
+    return results
+
+
 @metrics_app.command("list")
-def
+def metrics(
+    directory: Path = typer.Argument(
+        Path("./eval"),
+        exists=True,
+        file_okay=False,
+        dir_okay=True,
+        help="Directory to scan (default: ./eval)",
     ),
+    recursive: bool = typer.Option(
+        False,
+        "-r",
+        "--recursive",
+        help="Recursively search for Python files",
+    ),
+) -> None:
     """
-    List
+    List all metric types used by loaded eval configs.
     """
+    files = directory.rglob("*.py") if recursive else directory.glob("*.py")
+    TARGET_DECORATOR = eval_run.__name__

-        offset=0,
-        raise_exception=True,
-    )
+    all_metrics: set[tuple[str, str | None]] = set()
+    found_any = False

+    for file in sorted(files):
+        functions = find_decorated_functions(file, TARGET_DECORATOR)
+        if not functions:
+            continue

+        typer.echo(f"\n📄 {file}")

-        name = metric.get("name")
-        if name:
-            metric_names.add(name)
+        try:
+            module = load_module_from_path(file)
+        except Exception as e:
+            typer.echo(f" ❌ Failed to import module: {e}")
+            continue

+        for fn_name in functions:
+            try:
+                func = getattr(module, fn_name)
+                result = func()

+                metrics = extract_metric_names(result)
+                if not metrics:
+                    continue

+                found_any = True
+                for metric_type, metric_name in metrics:
+                    all_metrics.add((metric_type, metric_name))

+                    if metric_name:
+                        typer.echo(f" • {metric_type} (alias: {metric_name})")
+                    else:
+                        typer.echo(f" • {metric_type}")

+            except Exception as e:
+                typer.echo(f" ❌ Failed to inspect {fn_name}: {e}")
+
+    if not found_any:
+        typer.echo("\nNo metrics found.")
+        raise typer.Exit(code=0)
+
+    typer.echo(f"\n✅ {len(all_metrics)} unique metrics found")


-@list_app.command("
-def
+@list_app.command("evals")
+def list_evals(
     directory: Path = typer.Argument(
         Path("./eval"),
         exists=True,
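As a point of reference (not part of the diff): `extract_metric_names` above only relies on a `metrics` attribute whose items expose `type` and `name`, so a plain namespace object is enough to sketch its behaviour. The metric names below are illustrative, and the sketch assumes the function from the new cli.py is in scope.

    # Illustrative sketch; metric names are made up for the example.
    from types import SimpleNamespace

    config = SimpleNamespace(metrics=[
        SimpleNamespace(type="answer_relevancy", name="relevancy"),
        SimpleNamespace(type="faithfulness", name=None),
    ])

    assert extract_metric_names(config) == [
        ("answer_relevancy", "relevancy"),
        ("faithfulness", None),
    ]
    assert extract_metric_names(object()) == []  # no `metrics` attribute at all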
@@ -140,12 +173,12 @@ def list_runs(
         typer.echo("No evaluation runs found.")
         return

-    typer.echo(f"[id] " f"{'
+    typer.echo(f"[id] " f"{'tag':<20}" f"{'label':<20}" f"created_at")
     # pretty CLI output
     for run in runs:
         run_id = run.get("id")
         label = run.get("label") or "-"
-        uid = run.get("
+        uid = run.get("tag") or "-"
         created_at = run.get("created_at")

         if created_at:
@@ -156,20 +189,21 @@ def list_runs(
             except ValueError:
                 pass

-        typer.echo(
+        typer.echo(
+            f"[{run_id}] " f"{uid:<20} " f"{label:<20} " f"{created_at}")


-@
-def
+@app.command()
+def show(
     id: Optional[int] = typer.Option(
         None,
         "--id",
-        help="
+        help="Run ID",
     ),
-
+    tag: Optional[str] = typer.Option(
         None,
-        "--
-        help="
+        "--tag",
+        help="Run tag",
     ),
     raw: bool = typer.Option(
         False,
@@ -178,12 +212,12 @@ def show_testcase(
     ),
 ):
     """
-    Show a single evaluation testcase by ID or
+    Show a single evaluation testcase by ID or tag.
     """
-    if not id and not
+    if not id and not tag:
         raise typer.BadParameter("You must provide either --id or --uid")

-    if id and
+    if id and tag:
         raise typer.BadParameter("Provide only one of --id or --uid")

     client = DeepEvalClient()
@@ -192,8 +226,8 @@ def show_testcase(
         result = client.get_evaluation_testcase_by_id(id)
         identifier = f"id={id}"
     else:
-        result = client.
-        identifier = f"
+        result = client.get_evaluation_testcase_by_tag(tag)
+        identifier = f"tag={tag}"

     if not result:
         console.print(
@@ -376,7 +410,8 @@ def _print_and_save(
         return

     if out.exists() and not overwrite:
-        typer.echo(
+        typer.echo(
+            f"❌ File already exists: {out} (use --overwrite to replace)")
         raise typer.Exit(code=1)

     out.parent.mkdir(parents=True, exist_ok=True)
@@ -388,14 +423,16 @@ def _print_and_save(


 @app.command()
-def
+def compare(
+    tag: list[str] = typer.Option(
+        [],
+        "--tag",
+        help="Tag identifying a reference testcase",
     ),
+    run: list[int] = typer.Option(
+        [],
+        "--run",
+        help="Run ID identifying an evaluation testcase",
     ),
     pretty: bool = typer.Option(
         True,
@@ -420,18 +457,45 @@ def compare_testcases(
     ),
 ) -> None:
     """
-    Compare two
+    Compare two evaluation testcases using runs and/or labels.
     """
+
+    targets = []
+
+    for r in run:
+        targets.append(("run", r))
+
+    for l in tag:
+        targets.append(("label", l))
+
+    if len(targets) != 2:
+        typer.echo(
+            "❌ Provide exactly two targets using --run and/or --label"
+        )
+        raise typer.Exit(code=1)
+
     client = DeepEvalClient()

+    (type_a, value_a), (type_b, value_b) = targets
+
+    typer.echo(
+        f"🔍 Comparing {type_a} '{value_a}' ↔ {type_b} '{value_b}'"
+    )
+
+    kwargs = {"raise_exception": raise_exception}
+
+    if type_a == "run":
+        kwargs["testcase_a_id"] = value_a
+    else:
+        kwargs["testcase_a_tag"] = value_a
+
+    if type_b == "run":
+        kwargs["testcase_b_id"] = value_b
+    else:
+        kwargs["testcase_b_tag"] = value_b

     try:
-        resp = client.compare_testcases(
-            testcase_a_id=testcase_a_id,
-            testcase_b_id=testcase_b_id,
-            raise_exception=raise_exception,
-        )
+        resp = client.compare_testcases(**kwargs)
     except Exception as e:
         typer.echo(f"❌ Request failed: {e}")
         raise typer.Exit(code=1)
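A hedged sketch (not part of the diff) of how the new `compare` options map onto the client call; the flag values are illustrative and `raise_exception` comes from an option outside this hunk.

    # e.g. `compare --run 12 --tag baseline`
    # run = [12], tag = ["baseline"]
    # targets -> [("run", 12), ("label", "baseline")], so the call reduces to:
    client.compare_testcases(
        testcase_a_id=12,
        testcase_b_tag="baseline",
        raise_exception=raise_exception,
    )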
@@ -439,10 +503,11 @@ def compare_testcases(
     if not resp:
         typer.echo("⚠️ No response received")
         raise typer.Exit(code=1)
+
     _print_and_save(resp, pretty, out, overwrite)


-@app.command()
+@app.command(hidden=True)
 def compare_label_latest(
     label_a: str = typer.Argument(
         ...,
@@ -498,7 +563,7 @@ def compare_label_latest(
     _print_and_save(resp, pretty, out, overwrite)


-@app.command()
+@app.command(hidden=True)
 def compare_last(
     label: str = typer.Argument(
         ...,
@@ -549,6 +614,63 @@ def compare_last(
     _print_and_save(resp, pretty, out, overwrite)


+@app.command("tag")
+def tag_command(
+    run_id: Optional[int] = typer.Option(
+        None,
+        "--run",
+        help="Evaluation run ID",
+    ),
+    tag: Optional[str] = typer.Option(
+        None,
+        "--tag",
+        help="Tag to assign to the run",
+    ),
+    delete: Optional[str] = typer.Option(
+        None,
+        "--delete",
+        help="Delete a tag",
+    ),
+):
+    """
+    Assign a tag to a run or delete a tag.
+    """
+
+    # --- validation ---
+    if delete:
+        if run_id or tag:
+            typer.echo("❌ --delete cannot be used with --run or --tag")
+            raise typer.Exit(code=1)
+    else:
+        if not run_id or not tag:
+            typer.echo("❌ Use --run and --tag together, or --delete")
+            raise typer.Exit(code=1)
+
+    client = DeepEvalClient()
+
+    # --- delete mode ---
+    if delete:
+        result = client.update_evaluation_testcase_tag(
+            testcase_id=run_id,
+            tag=delete,
+            raise_exception=True,
+        )
+        typer.echo("🗑️ Tag deleted successfully")
+        typer.echo(f"Tag: {delete}")
+        return
+
+    # --- assign/update mode ---
+    result = client.update_evaluation_testcase_tag(
+        testcase_id=run_id,
+        tag=tag,
+        raise_exception=True,
+    )
+
+    typer.echo("✅ Tag assigned successfully")
+    typer.echo(f"Run ID: {run_id}")
+    typer.echo(f"Tag: {result.get('tag')}")
+
+
 def main() -> None:
     app()
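Sketching the two modes of the new `tag` command (values illustrative, not from the diff): assignment always passes the run ID, while delete mode passes `run_id=None`, which `update_evaluation_testcase_tag` in the client changes below routes to the tag DELETE endpoint.

    # `tag --run 42 --tag smoke` -> update_evaluation_testcase_tag(testcase_id=42, tag="smoke", raise_exception=True)
    # `tag --delete smoke`       -> update_evaluation_testcase_tag(testcase_id=None, tag="smoke", raise_exception=True)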
{rakam_eval_sdk-0.2.1 → rakam_eval_sdk-0.2.3}/src/rakam_eval_sdk/client.py

@@ -1,6 +1,6 @@
 import os
 import random
-from typing import Any, Dict, List, Optional, Union, cast, overload
+from typing import Any, Dict, List, Literal, Optional, Union, cast, overload

 import requests

@@ -13,6 +13,8 @@ from .schema import (
     TextInputItem,
 )

+HTTPMethod = Literal["GET", "POST", "PATCH", "PUT", "DELETE"]
+

 class DeepEvalClient:
     """
@@ -45,21 +47,31 @@ class DeepEvalClient:

     def _request(
         self,
+        method: HTTPMethod,
         endpoint: str,
+        *,
+        json: dict | None = None,
+        params: dict | None = None,
         raise_exception: bool = False,
     ) -> Optional[dict]:
-        """Internal helper to send POST requests with standard headers and error handling."""
         url = f"{self.base_url}{endpoint}"
+
         headers = {
             "accept": "application/json",
-            "Content-Type": "application/json",
             "X-API-Token": self.api_token,
         }

+        if json is not None:
+            headers["Content-Type"] = "application/json"
+
         try:
-            resp = requests.
+            resp = requests.request(
+                method=method,
+                url=url,
+                headers=headers,
+                json=json,
+                params=params,
+                timeout=self.timeout,
             )
             if raise_exception:
                 resp.raise_for_status()
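A minimal sketch (not part of the diff) of the two call shapes the generalized `_request` now supports; the endpoints echo ones used elsewhere in this client and the payload is illustrative. `Content-Type` is only attached when a JSON body is present.

    # GET: query parameters only, no body, no Content-Type header
    self._request("GET", "/eval-framework/deepeval/id/1", params={})

    # POST: JSON body, Content-Type: application/json added automatically
    self._request("POST", "/deepeval/text-eval", json={"component": "unknown"})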
@@ -73,41 +85,47 @@ class DeepEvalClient:
         except ValueError:
             if raise_exception:
                 raise
-            return {
+            return {
+                "error": "Invalid JSON response",
+                "raw": resp.text,
+            }

-    def _get(
+    def _get(self, endpoint: str, params: dict, *args, **kw):
+        return self._request("GET", endpoint, params=params, *args, **kw)
+
+    def _post(self, endpoint: str, payload: dict, *args, **kw):
+        return self._request("POST", endpoint, json=payload, *args, **kw)
+
+    def _patch(self, endpoint: str, payload: dict, *args, **kw):
+        return self._request("PATCH", endpoint, json=payload, *args, **kw)
+
+    def _delete(self, endpoint: str, payload: dict, *args, **kw):
+        return self._request("DELETE", endpoint, json=payload, *args, **kw)
+
+    def update_evaluation_testcase_tag(
         self,
+        *,
+        testcase_id: int,
+        tag: Optional[str],
         raise_exception: bool = False,
-    ) -> Optional[
-        """
-        headers = {
-            "accept": "application/json",
-            "X-API-Token": self.api_token,
-        }
+    ) -> Optional[Dict]:
+        """
+        Add, update, or remove a tag from an evaluation testcase.

+        - tag="smoke" → add / update tag
+        - tag=None → remove tag
+        """
+        if testcase_id is None:
+            return self._delete(
+                endpoint=f"/deepeval/tag/{tag}",
+                payload={},
+                raise_exception=raise_exception,
             )
-            return {"error": str(e)}
-
-        try:
-            return cast(dict, resp.json())
-        except ValueError:
-            if raise_exception:
-                raise
-            return {"error": "Invalid JSON response", "raw": resp.text}
+        return self._patch(
+            endpoint=f"/deepeval/{testcase_id}/tag",
+            payload={"tag": tag},
+            raise_exception=raise_exception,
+        )

     def list_evaluation_testcases(
         self,
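A hedged usage sketch of the tag API above (IDs and tag names illustrative, not from the diff). Routing is driven by `testcase_id`: passing `None` hits the DELETE endpoint even though the parameter is annotated as `int`.

    client = DeepEvalClient()

    # add or update: PATCH /deepeval/42/tag with {"tag": "smoke"}
    client.update_evaluation_testcase_tag(testcase_id=42, tag="smoke")

    # remove: DELETE /deepeval/tag/smoke
    client.update_evaluation_testcase_tag(testcase_id=None, tag="smoke")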
@@ -121,7 +139,7 @@ class DeepEvalClient:
         Sorted by created_at DESC (newest first).
         """
         return self._get(
-            "/eval-framework/deepeval/evaluation-testcases/token",
+            endpoint="/eval-framework/deepeval/evaluation-testcases/token",
             params={
                 "limit": limit,
                 "offset": offset,
@@ -139,22 +157,22 @@ class DeepEvalClient:
         Fetch a single evaluation testcase by numeric ID.
         """
         return self._get(
-            f"/eval-framework/deepeval/id/{testcase_id}",
+            endpoint=f"/eval-framework/deepeval/id/{testcase_id}",
             params={},
             raise_exception=raise_exception,
         )

-    def
+    def get_evaluation_testcase_by_tag(
         self,
+        tag: str,
         *,
         raise_exception: bool = False,
     ) -> Optional[Dict]:
         """
-        Fetch a single evaluation testcase by
+        Fetch a single evaluation testcase by tag.
         """
         return self._get(
-            f"/eval-framework/deepeval/
+            endpoint=f"/eval-framework/deepeval/tag/{tag}",
             params={},
             raise_exception=raise_exception,
         )
@@ -163,35 +181,57 @@ class DeepEvalClient:
         self,
         *,
         id: Optional[int] = None,
+        tag: Optional[str] = None,
         raise_exception: bool = False,
     ) -> Optional[Dict]:
         if id is not None:
             return self.get_evaluation_testcase_by_id(
                 id, raise_exception=raise_exception
             )
-        if
-            return self.
+        if tag is not None:
+            return self.get_evaluation_testcase_by_tag(
+                tag, raise_exception=raise_exception
             )
-        raise ValueError("Either id or
+        raise ValueError("Either id or tag must be provided")

     def compare_testcases(
         self,
         *,
-        testcase_a_id: int,
+        testcase_a_id: int | None = None,
+        testcase_a_tag: str | None = None,
+        testcase_b_id: int | None = None,
+        testcase_b_tag: str | None = None,
         raise_exception: bool = False,
     ) -> Optional[dict]:
         """
-        Compare two evaluation testcases.
+        Compare two evaluation testcases using IDs or tags.
+        Exactly one identifier (id or tag) must be provided per testcase.
         """
+
+        def validate(id_, tag, name: str):
+            if bool(id_) == bool(tag):
+                raise ValueError(
+                    f"Provide exactly one of {name}_id or {name}_tag"
+                )
+
+        validate(testcase_a_id, testcase_a_tag, "testcase_a")
+        validate(testcase_b_id, testcase_b_tag, "testcase_b")
+
+        params: dict[str, int | str] = {}
+
+        if testcase_a_id is not None:
+            params["testcase_a_id"] = testcase_a_id
+        else:
+            params["testcase_a_tag"] = testcase_a_tag  # type: ignore
+
+        if testcase_b_id is not None:
+            params["testcase_b_id"] = testcase_b_id
+        else:
+            params["testcase_b_tag"] = testcase_b_tag  # type: ignore
+
         return self._get(
-            "/eval-framework/deepeval/evaluation-testcases/compare",
-            params=
-                "testcase_a_id": testcase_a_id,
-                "testcase_b_id": testcase_b_id,
-            },
+            endpoint="/eval-framework/deepeval/evaluation-testcases/compare",
+            params=params,
             raise_exception=raise_exception,
         )
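A hedged sketch (values illustrative) of the new mixed-identifier comparison: each side takes exactly one of the `_id`/`_tag` pair, and whichever identifiers are chosen become query parameters on the compare endpoint.

    # Compare run 12 against whichever testcase carries the "baseline" tag:
    resp = client.compare_testcases(testcase_a_id=12, testcase_b_tag="baseline")
    # -> GET /eval-framework/deepeval/evaluation-testcases/compare
    #        ?testcase_a_id=12&testcase_b_tag=baseline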
@@ -206,7 +246,7 @@ class DeepEvalClient:
         Compare the latest evaluation testcases for two labels.
         """
         return self._get(
-            "/eval-framework/deepeval/evaluation-testcases/compare-latest",
+            endpoint="/eval-framework/deepeval/evaluation-testcases/compare-latest",
             params={
                 "label_a": label_a,
                 "label_b": label_b,
@@ -224,7 +264,7 @@ class DeepEvalClient:
         Compare the last two evaluation testcases for a given label.
         """
         return self._get(
-            "/eval-framework/deepeval/evaluation-testcases/compare-last-two",
+            endpoint="/eval-framework/deepeval/evaluation-testcases/compare-last-two",
             params={
                 "label": label,
             },
@@ -268,8 +308,8 @@ class DeepEvalClient:
             label=label,
         )

-        return self.
-            "/deepeval/text-eval", config.model_dump(), raise_exception
+        return self._post(
+            endpoint="/deepeval/text-eval", payload=config.model_dump(), raise_exception=raise_exception
         )

     def text_eval_background(
@@ -284,7 +324,7 @@ class DeepEvalClient:
         payload = EvalConfig.model_construct(
             data=data, metrics=metrics, component=component, version=label
         ).model_dump()
-        return self.
+        return self._post(endpoint="/deepeval/text-eval/background", payload=payload, raise_exception=raise_exception)

     @overload
     def schema_eval(
@@ -328,10 +368,10 @@ class DeepEvalClient:
             label=label,
         )

-        return self.
-            "/deepeval/schema-eval",
-            config.model_dump(),
-            raise_exception,
+        return self._post(
+            endpoint="/deepeval/schema-eval",
+            payload=config.model_dump(),
+            raise_exception=raise_exception,
         )

     def schema_eval_background(
@@ -346,8 +386,8 @@ class DeepEvalClient:
         payload = SchemaEvalConfig.model_construct(
             data=data, metrics=metrics, component=component, version=label
         ).model_dump()
-        return self.
-            "/deepeval/schema-eval/background", payload, raise_exception
+        return self._post(
+            endpoint="/deepeval/schema-eval/background", payload=payload, raise_exception=raise_exception
         )

     def maybe_text_eval(
{rakam_eval_sdk-0.2.1 → rakam_eval_sdk-0.2.3}/src/rakam_eval_sdk/schema.py

@@ -117,7 +117,6 @@ class SchemaInputItem(InputItem):

 class EvalConfig(BaseModel):
     __eval_config__ = "text_eval"
-    unique_id: Union[str, None] = None
     component: str = "unknown"
     label: Union[str, None] = None
     data: List[TextInputItem]
@@ -127,7 +126,6 @@ class EvalConfig(BaseModel):
 class SchemaEvalConfig(BaseModel):
     __eval_config__ = "schema_eval"
     component: str = "unknown"
-    unique_id: Union[str, None] = None
     label: Union[str, None] = None
     data: List[SchemaInputItem]
     metrics: List[SchemaMetricConfig] = Field(default_factory=list)
The remaining files (README.md, src/rakam_eval_sdk/__init__.py, src/rakam_eval_sdk/decorators.py, src/rakam_eval_sdk/utils/decorator_utils.py) are unchanged between 0.2.1 and 0.2.3.