rakam-eval-sdk 0.2.2__tar.gz → 0.2.3__tar.gz

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: rakam-eval-sdk
- Version: 0.2.2
+ Version: 0.2.3
  Summary: Evaluation Framework SDK
  Author: Mohamed Bachar Touil
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "uv_build"

  [project]
  name = "rakam-eval-sdk"
- version = "0.2.2"
+ version = "0.2.3"
  description = "Evaluation Framework SDK"
  readme = "README.md"
  requires-python = ">=3.8"
@@ -35,55 +35,88 @@ metrics_app = typer.Typer(help="Metrics utilities")
  app.add_typer(metrics_app, name="metrics")


+ def extract_metric_names(config: Any) -> list[tuple[str, str | None]]:
+     """
+     Returns [(type, name)] from EvalConfig / SchemaEvalConfig
+     """
+     if not hasattr(config, "metrics"):
+         return []
+
+     results: list[tuple[str, str | None]] = []
+
+     for metric in config.metrics or []:
+         metric_type = getattr(metric, "type", None)
+         metric_name = getattr(metric, "name", None)
+         if metric_type:
+             results.append((metric_type, metric_name))
+
+     return results
+
+
  @metrics_app.command("list")
- def list_metrics(
-     limit: int = typer.Option(
-         20,
-         "--limit",
-         help="Number of testcases to inspect for metrics",
+ def metrics(
+     directory: Path = typer.Argument(
+         Path("./eval"),
+         exists=True,
+         file_okay=False,
+         dir_okay=True,
+         help="Directory to scan (default: ./eval)",
      ),
- ):
+     recursive: bool = typer.Option(
+         False,
+         "-r",
+         "--recursive",
+         help="Recursively search for Python files",
+     ),
+ ) -> None:
      """
-     List unique metric names found in evaluation testcases.
+     List all metric types used by loaded eval configs.
      """
-     client = DeepEvalClient()
+     files = directory.rglob("*.py") if recursive else directory.glob("*.py")
+     TARGET_DECORATOR = eval_run.__name__

-     testcases = client.list_evaluation_testcases(
-         limit=limit,
-         offset=0,
-         raise_exception=True,
-     )
+     all_metrics: set[tuple[str, str | None]] = set()
+     found_any = False

-     if not testcases:
-         typer.echo("No evaluation testcases found.")
-         return
+     for file in sorted(files):
+         functions = find_decorated_functions(file, TARGET_DECORATOR)
+         if not functions:
+             continue

-     metric_names: set[str] = set()
+         typer.echo(f"\n📄 {file}")

-     def collect_metrics(entries: Sequence[Dict] | None):
-         if not entries:
-             return
-         for entry in entries:
-             for metric in entry.get("metrics", []) or []:
-                 name = metric.get("name")
-                 if name:
-                     metric_names.add(name)
+         try:
+             module = load_module_from_path(file)
+         except Exception as e:
+             typer.echo(f" ❌ Failed to import module: {e}")
+             continue

-     for tc in testcases:
+         for fn_name in functions:
+             try:
+                 func = getattr(module, fn_name)
+                 result = func()

-         collect_metrics(tc.get("result"))
+                 metrics = extract_metric_names(result)
+                 if not metrics:
+                     continue

-     if not metric_names:
-         typer.echo("No metrics found.")
-         return
+                 found_any = True
+                 for metric_type, metric_name in metrics:
+                     all_metrics.add((metric_type, metric_name))

-     typer.echo(
-         f"📊 Found {len(metric_names)} unique metrics "
-         f"(from latest {limit} testcases)\n"
-     )
+                     if metric_name:
+                         typer.echo(f" {metric_type} (alias: {metric_name})")
+                     else:
+                         typer.echo(f" • {metric_type}")
+
+             except Exception as e:
+                 typer.echo(f" ❌ Failed to inspect {fn_name}: {e}")
+
+     if not found_any:
+         typer.echo("\nNo metrics found.")
+         raise typer.Exit(code=0)

-     for name in sorted(metric_names):
-         typer.echo(f"- {name}")
+     typer.echo(f"\n✅ {len(all_metrics)} unique metrics found")


  @list_app.command("evals")
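
The `extract_metric_names` helper introduced above relies only on duck typing (`hasattr` and `getattr`), so its behaviour can be illustrated without the SDK's real config classes. A minimal sketch, using `types.SimpleNamespace` stand-ins and made-up metric type strings; it assumes `extract_metric_names` from the CLI module above is in scope:

from types import SimpleNamespace

# Stand-in for an EvalConfig / SchemaEvalConfig returned by an @eval_run function.
# The metric type and alias values below are illustrative only.
config = SimpleNamespace(
    metrics=[
        SimpleNamespace(type="answer_relevancy", name="relevancy"),
        SimpleNamespace(type="faithfulness", name=None),
    ]
)

print(extract_metric_names(config))
# -> [('answer_relevancy', 'relevancy'), ('faithfulness', None)]
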
@@ -165,12 +198,12 @@ def show(
      id: Optional[int] = typer.Option(
          None,
          "--id",
-         help="Numeric evaluation testcase ID",
+         help="Run ID",
      ),
      tag: Optional[str] = typer.Option(
          None,
          "--tag",
-         help="Evaluation testcase tag",
+         help="Run tag",
      ),
      raw: bool = typer.Option(
          False,
@@ -391,13 +424,15 @@ def _print_and_save(

  @app.command()
  def compare(
-     testcase_a_id: int = typer.Argument(
-         ...,
-         help="ID of the first testcase",
+     tag: list[str] = typer.Option(
+         [],
+         "--tag",
+         help="Tag identifying a reference testcase",
      ),
-     testcase_b_id: int = typer.Argument(
-         ...,
-         help="ID of the second testcase",
+     run: list[int] = typer.Option(
+         [],
+         "--run",
+         help="Run ID identifying an evaluation testcase",
      ),
      pretty: bool = typer.Option(
          True,
@@ -422,18 +457,45 @@ def compare(
      ),
  ) -> None:
      """
-     Compare two DeepEval evaluation testcases.
+     Compare two evaluation testcases using runs and/or labels.
      """
+
+     targets = []
+
+     for r in run:
+         targets.append(("run", r))
+
+     for l in tag:
+         targets.append(("label", l))
+
+     if len(targets) != 2:
+         typer.echo(
+             "❌ Provide exactly two targets using --run and/or --label"
+         )
+         raise typer.Exit(code=1)
+
      client = DeepEvalClient()

-     typer.echo(f"🔍 Comparing testcases {testcase_a_id} {testcase_b_id}")
+     (type_a, value_a), (type_b, value_b) = targets
+
+     typer.echo(
+         f"🔍 Comparing {type_a} '{value_a}' ↔ {type_b} '{value_b}'"
+     )
+
+     kwargs = {"raise_exception": raise_exception}
+
+     if type_a == "run":
+         kwargs["testcase_a_id"] = value_a
+     else:
+         kwargs["testcase_a_tag"] = value_a
+
+     if type_b == "run":
+         kwargs["testcase_b_id"] = value_b
+     else:
+         kwargs["testcase_b_tag"] = value_b

      try:
-         resp = client.compare_testcases(
-             testcase_a_id=testcase_a_id,
-             testcase_b_id=testcase_b_id,
-             raise_exception=raise_exception,
-         )
+         resp = client.compare_testcases(**kwargs)
      except Exception as e:
          typer.echo(f"❌ Request failed: {e}")
          raise typer.Exit(code=1)
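
The option handling above collects every `--run` and `--tag` value into a `(kind, value)` target list, requires exactly two targets, and then routes each one to the matching `compare_testcases` keyword. A self-contained sketch of that mapping with hypothetical values (run ID 12, tag "baseline"), outside of Typer:

run = [12]          # values collected from repeated --run options (hypothetical)
tag = ["baseline"]  # values collected from repeated --tag options (hypothetical)

targets = [("run", r) for r in run] + [("label", t) for t in tag]
assert len(targets) == 2, "exactly two targets are required"

(type_a, value_a), (type_b, value_b) = targets
kwargs = {"raise_exception": False}
kwargs["testcase_a_id" if type_a == "run" else "testcase_a_tag"] = value_a
kwargs["testcase_b_id" if type_b == "run" else "testcase_b_tag"] = value_b

print(kwargs)
# -> {'raise_exception': False, 'testcase_a_id': 12, 'testcase_b_tag': 'baseline'}
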
@@ -441,10 +503,11 @@ def compare(
      if not resp:
          typer.echo("⚠️ No response received")
          raise typer.Exit(code=1)
+
      _print_and_save(resp, pretty, out, overwrite)


- @app.command()
+ @app.command(hidden=True)
  def compare_label_latest(
      label_a: str = typer.Argument(
          ...,
@@ -500,7 +563,7 @@ def compare_label_latest(
      _print_and_save(resp, pretty, out, overwrite)


- @app.command()
+ @app.command(hidden=True)
  def compare_last(
      label: str = typer.Argument(
          ...,
@@ -551,44 +614,62 @@ def compare_last(
      _print_and_save(resp, pretty, out, overwrite)


- @list_app.command("tag")
- def update_run_tag(
-     run_id: int = typer.Argument(..., help="Evaluation run ID"),
+ @app.command("tag")
+ def tag_command(
+     run_id: Optional[int] = typer.Option(
+         None,
+         "--run",
+         help="Evaluation run ID",
+     ),
      tag: Optional[str] = typer.Option(
          None,
          "--tag",
-         "-t",
-         help="Tag to add or update",
+         help="Tag to assign to the run",
      ),
-     remove: bool = typer.Option(
-         False,
-         "--remove",
-         help="Remove tag from the run",
+     delete: Optional[str] = typer.Option(
+         None,
+         "--delete",
+         help="Delete a tag",
      ),
  ):
      """
-     Add, update, or remove a tag from an evaluation run.
+     Assign a tag to a run or delete a tag.
      """
-     if not tag and not remove:
-         typer.echo("❌ You must provide --tag or --remove")
-         raise typer.Exit(code=1)

-     if tag and remove:
-         typer.echo("❌ Use either --tag or --remove, not both")
-         raise typer.Exit(code=1)
+     # --- validation ---
+     if delete:
+         if run_id or tag:
+             typer.echo("❌ --delete cannot be used with --run or --tag")
+             raise typer.Exit(code=1)
+     else:
+         if not run_id or not tag:
+             typer.echo("❌ Use --run and --tag together, or --delete")
+             raise typer.Exit(code=1)

      client = DeepEvalClient()

+     # --- delete mode ---
+     if delete:
+         result = client.update_evaluation_testcase_tag(
+             testcase_id=run_id,
+             tag=delete,
+             raise_exception=True,
+         )
+         typer.echo("🗑️ Tag deleted successfully")
+         typer.echo(f"Tag: {delete}")
+         return
+
+     # --- assign/update mode ---
      result = client.update_evaluation_testcase_tag(
          testcase_id=run_id,
-         tag=None if remove else tag,
+         tag=tag,
          raise_exception=True,
      )

-     action = "removed" if remove else "updated"
-     typer.echo(f"✅ Tag {action} successfully")
+     typer.echo(" Tag assigned successfully")
      typer.echo(f"Run ID: {run_id}")
-     typer.echo(f"Tag: {result.get('tag') or '-'}")
+     typer.echo(f"Tag: {result.get('tag')}")
+

  def main() -> None:
      app()
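
Taken together, the CLI changes in this release rework the user-facing surface: `metrics list` now scans a directory of eval files instead of querying the API, `compare` takes `--run`/`--tag` options instead of positional IDs, the older compare commands are hidden, and a top-level `tag` command replaces `list tag`. A hedged sketch of exercising the new commands with Typer's test runner; the import path of `app` is an assumption, since file paths are not shown in this diff:

from typer.testing import CliRunner

from rakam_eval_sdk.cli import app  # assumed import path; not shown in this diff

runner = CliRunner()

# Assign a tag to a run (hypothetical run ID and tag value).
runner.invoke(app, ["tag", "--run", "12", "--tag", "baseline"])

# Delete a tag by name.
runner.invoke(app, ["tag", "--delete", "baseline"])

# Compare a pinned run against whatever currently carries the "baseline" tag.
runner.invoke(app, ["compare", "--run", "12", "--tag", "baseline"])
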
@@ -1,7 +1,6 @@
- from typing import Optional, Dict
  import os
  import random
- from typing import Any, Dict, List, Optional, Union, cast, overload
+ from typing import Any, Dict, List, Literal, Optional, Union, cast, overload

  import requests

@@ -13,7 +12,6 @@ from .schema import (
      SchemaMetricConfig,
      TextInputItem,
  )
- from typing import Optional, Literal, cast

  HTTPMethod = Literal["GET", "POST", "PATCH", "PUT", "DELETE"]

@@ -92,14 +90,17 @@ class DeepEvalClient:
              "raw": resp.text,
          }

-     def _get(self, endpoint: str, params: dict, **kw):
-         return self._request("GET", endpoint, params=params, **kw)
+     def _get(self, endpoint: str, params: dict, *args, **kw):
+         return self._request("GET", endpoint, params=params, *args, **kw)

-     def _post(self, endpoint: str, payload: dict, **kw):
-         return self._request("POST", endpoint, json=payload, **kw)
+     def _post(self, endpoint: str, payload: dict, *args, **kw):
+         return self._request("POST", endpoint, json=payload, *args, **kw)

-     def _patch(self, endpoint: str, payload: dict, **kw):
-         return self._request("PATCH", endpoint, json=payload, **kw)
+     def _patch(self, endpoint: str, payload: dict, *args, **kw):
+         return self._request("PATCH", endpoint, json=payload, *args, **kw)
+
+     def _delete(self, endpoint: str, payload: dict, *args, **kw):
+         return self._request("DELETE", endpoint, json=payload, *args, **kw)

      def update_evaluation_testcase_tag(
          self,
@@ -114,8 +115,14 @@ class DeepEvalClient:
          - tag="smoke" → add / update tag
          - tag=None → remove tag
          """
+         if testcase_id is None:
+             return self._delete(
+                 endpoint=f"/deepeval/tag/{tag}",
+                 payload={},
+                 raise_exception=raise_exception,
+             )
          return self._patch(
-             f"/evaluation-testcases/{testcase_id}/tag",
+             endpoint=f"/deepeval/{testcase_id}/tag",
              payload={"tag": tag},
              raise_exception=raise_exception,
          )
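
As the branch above shows, `update_evaluation_testcase_tag` now dispatches on `testcase_id`: with an ID it PATCHes the run's tag via `/deepeval/{testcase_id}/tag`, and with `testcase_id=None` it sends a DELETE to `/deepeval/tag/{tag}`. A minimal sketch of both call styles, with a hypothetical run ID and tag and an assumed client import path:

from rakam_eval_sdk import DeepEvalClient  # assumed import path; not shown in this diff

client = DeepEvalClient()

# Assign or update the tag on run 12 (hypothetical values).
client.update_evaluation_testcase_tag(testcase_id=12, tag="baseline")

# Remove the tag entirely: passing testcase_id=None routes to the DELETE endpoint.
client.update_evaluation_testcase_tag(testcase_id=None, tag="baseline", raise_exception=True)
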
@@ -132,7 +139,7 @@ class DeepEvalClient:
          Sorted by created_at DESC (newest first).
          """
          return self._get(
-             "/eval-framework/deepeval/evaluation-testcases/token",
+             endpoint="/eval-framework/deepeval/evaluation-testcases/token",
              params={
                  "limit": limit,
                  "offset": offset,
@@ -150,7 +157,7 @@ class DeepEvalClient:
          Fetch a single evaluation testcase by numeric ID.
          """
          return self._get(
-             f"/eval-framework/deepeval/id/{testcase_id}",
+             endpoint=f"/eval-framework/deepeval/id/{testcase_id}",
              params={},
              raise_exception=raise_exception,
          )
@@ -165,7 +172,7 @@ class DeepEvalClient:
          Fetch a single evaluation testcase by tag.
          """
          return self._get(
-             f"/eval-framework/deepeval/tag/{tag}",
+             endpoint=f"/eval-framework/deepeval/tag/{tag}",
              params={},
              raise_exception=raise_exception,
          )
@@ -190,19 +197,41 @@ class DeepEvalClient:
      def compare_testcases(
          self,
          *,
-         testcase_a_id: int,
-         testcase_b_id: int,
+         testcase_a_id: int | None = None,
+         testcase_a_tag: str | None = None,
+         testcase_b_id: int | None = None,
+         testcase_b_tag: str | None = None,
          raise_exception: bool = False,
      ) -> Optional[dict]:
          """
-         Compare two evaluation testcases.
+         Compare two evaluation testcases using IDs or tags.
+         Exactly one identifier (id or tag) must be provided per testcase.
          """
+
+         def validate(id_, tag, name: str):
+             if bool(id_) == bool(tag):
+                 raise ValueError(
+                     f"Provide exactly one of {name}_id or {name}_tag"
+                 )
+
+         validate(testcase_a_id, testcase_a_tag, "testcase_a")
+         validate(testcase_b_id, testcase_b_tag, "testcase_b")
+
+         params: dict[str, int | str] = {}
+
+         if testcase_a_id is not None:
+             params["testcase_a_id"] = testcase_a_id
+         else:
+             params["testcase_a_tag"] = testcase_a_tag # type: ignore
+
+         if testcase_b_id is not None:
+             params["testcase_b_id"] = testcase_b_id
+         else:
+             params["testcase_b_tag"] = testcase_b_tag # type: ignore
+
          return self._get(
-             "/eval-framework/deepeval/evaluation-testcases/compare",
-             params={
-                 "testcase_a_id": testcase_a_id,
-                 "testcase_b_id": testcase_b_id,
-             },
+             endpoint="/eval-framework/deepeval/evaluation-testcases/compare",
+             params=params,
              raise_exception=raise_exception,
          )
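
With the widened signature above, each side of the comparison accepts either a numeric ID or a tag, and the inner `validate` helper raises `ValueError` unless exactly one identifier is given per side. A short sketch of the accepted call shapes, with hypothetical IDs and tag and an assumed client import path:

from rakam_eval_sdk import DeepEvalClient  # assumed import path; not shown in this diff

client = DeepEvalClient()

# ID vs ID, as before.
client.compare_testcases(testcase_a_id=12, testcase_b_id=13)

# Mixed: a pinned run against whatever currently carries the "baseline" tag.
client.compare_testcases(testcase_a_id=12, testcase_b_tag="baseline")

# Rejected before any request: both an ID and a tag for side A raises ValueError.
# client.compare_testcases(testcase_a_id=12, testcase_a_tag="baseline", testcase_b_id=13)
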

@@ -217,7 +246,7 @@ class DeepEvalClient:
          Compare the latest evaluation testcases for two labels.
          """
          return self._get(
-             "/eval-framework/deepeval/evaluation-testcases/compare-latest",
+             endpoint="/eval-framework/deepeval/evaluation-testcases/compare-latest",
              params={
                  "label_a": label_a,
                  "label_b": label_b,
@@ -235,7 +264,7 @@ class DeepEvalClient:
          Compare the last two evaluation testcases for a given label.
          """
          return self._get(
-             "/eval-framework/deepeval/evaluation-testcases/compare-last-two",
+             endpoint="/eval-framework/deepeval/evaluation-testcases/compare-last-two",
              params={
                  "label": label,
              },
@@ -280,7 +309,7 @@ class DeepEvalClient:
          )

          return self._post(
-             "/deepeval/text-eval", config.model_dump(), raise_exception
+             endpoint="/deepeval/text-eval", payload=config.model_dump(), raise_exception=raise_exception
          )

      def text_eval_background(
@@ -295,7 +324,7 @@ class DeepEvalClient:
          payload = EvalConfig.model_construct(
              data=data, metrics=metrics, component=component, version=label
          ).model_dump()
-         return self._post("/deepeval/text-eval/background", payload, raise_exception)
+         return self._post(endpoint="/deepeval/text-eval/background", payload=payload, raise_exception=raise_exception)

      @overload
      def schema_eval(
@@ -340,9 +369,9 @@ class DeepEvalClient:
          )

          return self._post(
-             "/deepeval/schema-eval",
-             config.model_dump(),
-             raise_exception,
+             endpoint="/deepeval/schema-eval",
+             payload=config.model_dump(),
+             raise_exception=raise_exception,
          )

      def schema_eval_background(
@@ -358,7 +387,7 @@ class DeepEvalClient:
              data=data, metrics=metrics, component=component, version=label
          ).model_dump()
          return self._post(
-             "/deepeval/schema-eval/background", payload, raise_exception
+             endpoint="/deepeval/schema-eval/background", payload=payload, raise_exception=raise_exception
          )

      def maybe_text_eval(
File without changes