rakam-eval-sdk 0.1.16__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rakam_eval_sdk/cli.py CHANGED
@@ -1,17 +1,557 @@
1
1
  # cli.py
2
- import typer
2
+ import json
3
+ import os
4
+ import sys
5
+ import uuid
6
+ from datetime import datetime
3
7
  from pathlib import Path
8
+ from pprint import pprint
9
+ from typing import Any, Dict, Optional, Sequence
10
+
11
+ import typer
12
+ from dotenv import load_dotenv
13
+ from rich.console import Console
14
+ from rich.panel import Panel
15
+ from rich.pretty import Pretty
16
+
17
+ from rakam_eval_sdk.client import DeepEvalClient
18
+ from rakam_eval_sdk.decorators import eval_run
19
+ from rakam_eval_sdk.utils.decorator_utils import (
20
+ find_decorated_functions,
21
+ load_module_from_path,
22
+ )
23
+
24
+ load_dotenv()
25
+ app = typer.Typer(help="CLI tools for evaluation utilities")
26
+ console = Console()
27
+
28
+ # add root of the project to sys.path
29
+ PROJECT_ROOT = os.path.abspath(".")
30
+ if PROJECT_ROOT not in sys.path:
31
+ sys.path.insert(0, PROJECT_ROOT)
32
+ list_app = typer.Typer(help="List resources")
33
+ app.add_typer(list_app, name="list")
34
+ metrics_app = typer.Typer(help="Metrics utilities")
35
+ app.add_typer(metrics_app, name="metrics")
36
+
37
+
38
+ @metrics_app.command("list")
39
+ def list_metrics(
40
+ limit: int = typer.Option(
41
+ 20,
42
+ "--limit",
43
+ help="Number of testcases to inspect for metrics",
44
+ ),
45
+ ):
46
+ """
47
+ List unique metric names found in evaluation testcases.
48
+ """
49
+ client = DeepEvalClient()
50
+
51
+ testcases = client.list_evaluation_testcases(
52
+ limit=limit,
53
+ offset=0,
54
+ raise_exception=True,
55
+ )
56
+
57
+ if not testcases:
58
+ typer.echo("No evaluation testcases found.")
59
+ return
60
+
61
+ metric_names: set[str] = set()
62
+
63
+ def collect_metrics(entries: Sequence[Dict] | None):
64
+ if not entries:
65
+ return
66
+ for entry in entries:
67
+ for metric in entry.get("metrics", []) or []:
68
+ name = metric.get("name")
69
+ if name:
70
+ metric_names.add(name)
71
+
72
+ for tc in testcases:
73
+
74
+ collect_metrics(tc.get("result"))
75
+
76
+ if not metric_names:
77
+ typer.echo("No metrics found.")
78
+ return
79
+
80
+ typer.echo(
81
+ f"📊 Found {len(metric_names)} unique metrics "
82
+ f"(from latest {limit} testcases)\n"
83
+ )
84
+
85
+ for name in sorted(metric_names):
86
+ typer.echo(f"- {name}")
87
+
88
+
89
+ @list_app.command("eval")
90
+ def list(
91
+ directory: Path = typer.Argument(
92
+ Path("./eval"),
93
+ exists=True,
94
+ file_okay=False,
95
+ dir_okay=True,
96
+ help="Directory to scan (default: ./eval)",
97
+ ),
98
+ recursive: bool = typer.Option(
99
+ False,
100
+ "--recursive",
101
+ "-r",
102
+ help="Recursively search for Python files",
103
+ ),
104
+ ) -> None:
105
+ """
106
+ Find functions decorated with @eval_run.
107
+ """
108
+ TARGET_DECORATOR = eval_run.__name__
109
+ files = directory.rglob("*.py") if recursive else directory.glob("*.py")
110
+
111
+ found = False
112
+
113
+ for file in sorted(files):
114
+ functions = find_decorated_functions(file, TARGET_DECORATOR)
115
+ for fn in functions:
116
+ found = True
117
+ typer.echo(f"{file}:{fn}")
118
+
119
+ if not found:
120
+ typer.echo(f"No @{TARGET_DECORATOR} functions found.")
121
+
122
+
123
+ @list_app.command("runs")
124
+ def list_runs(
125
+ limit: int = typer.Option(20, help="Max number of runs"),
126
+ offset: int = typer.Option(0, help="Pagination offset"),
127
+ ):
128
+ """
129
+ List evaluation runs (newest first).
130
+ """
131
+ client = DeepEvalClient()
132
+
133
+ runs = client.list_evaluation_testcases(
134
+ limit=limit,
135
+ offset=offset,
136
+ raise_exception=True,
137
+ )
138
+
139
+ if not runs:
140
+ typer.echo("No evaluation runs found.")
141
+ return
142
+
143
+ typer.echo(f"[id] " f"{'unique_id':<20}" f"{'label':<20}" f"created_at")
144
+ # pretty CLI output
145
+ for run in runs:
146
+ run_id = run.get("id")
147
+ label = run.get("label") or "-"
148
+ uid = run.get("unique_id") or "-"
149
+ created_at = run.get("created_at")
150
+
151
+ if created_at:
152
+ try:
153
+ created_at = datetime.fromisoformat(created_at).strftime(
154
+ "%Y-%m-%d %H:%M:%S"
155
+ )
156
+ except ValueError:
157
+ pass
158
+
159
+ typer.echo(f"[{run_id}] " f"{uid:<20} " f"{label:<20} " f"{created_at}")
160
+
161
+
162
+ @list_app.command("show")
163
+ def show_testcase(
164
+ id: Optional[int] = typer.Option(
165
+ None,
166
+ "--id",
167
+ help="Numeric evaluation testcase ID",
168
+ ),
169
+ uid: Optional[str] = typer.Option(
170
+ None,
171
+ "--uid",
172
+ help="Evaluation testcase unique_id",
173
+ ),
174
+ raw: bool = typer.Option(
175
+ False,
176
+ "--raw",
177
+ help="Print raw JSON instead of formatted output",
178
+ ),
179
+ ):
180
+ """
181
+ Show a single evaluation testcase by ID or unique_id.
182
+ """
183
+ if not id and not uid:
184
+ raise typer.BadParameter("You must provide either --id or --uid")
185
+
186
+ if id and uid:
187
+ raise typer.BadParameter("Provide only one of --id or --uid")
188
+
189
+ client = DeepEvalClient()
190
+
191
+ if id:
192
+ result = client.get_evaluation_testcase_by_id(id)
193
+ identifier = f"id={id}"
194
+ else:
195
+ result = client.get_evaluation_testcase_by_unique_id(uid)
196
+ identifier = f"unique_id={uid}"
197
+
198
+ if not result:
199
+ console.print(
200
+ Panel(
201
+ f"No response received for {identifier}",
202
+ title="Error",
203
+ style="red",
204
+ )
205
+ )
206
+ raise typer.Exit(code=1)
207
+
208
+ if isinstance(result, dict) and result.get("error"):
209
+ console.print(
210
+ Panel(
211
+ result["error"],
212
+ title="Error",
213
+ style="red",
214
+ )
215
+ )
216
+ raise typer.Exit(code=1)
217
+
218
+ if raw:
219
+ console.print(Pretty(result))
220
+ raise typer.Exit()
221
+
222
+ console.print(
223
+ Panel.fit(
224
+ Pretty(result),
225
+ title="Evaluation TestCase",
226
+ subtitle=identifier,
227
+ )
228
+ )
229
+
230
+
231
+ def validate_eval_result(result: Any, fn_name: str) -> str:
232
+ eval_config = getattr(result, "__eval_config__", None)
233
+
234
+ if not isinstance(eval_config, str):
235
+ expected = "EvalConfig or SchemaEvalConfig"
236
+ actual = type(result).__name__
237
+
238
+ typer.echo(
239
+ f" ❌ Invalid return type from `{fn_name}`\n"
240
+ f" Expected: {expected}\n"
241
+ f" Got: {actual}"
242
+ )
243
+ return ""
244
+
245
+ return eval_config
246
+
247
+
248
+ @app.command()
249
+ def run(
250
+ directory: Path = typer.Argument(
251
+ Path("./eval"),
252
+ exists=True,
253
+ file_okay=False,
254
+ dir_okay=True,
255
+ help="Directory to scan (default: ./eval)",
256
+ ),
257
+ recursive: bool = typer.Option(
258
+ False,
259
+ "-r",
260
+ "--recursive",
261
+ help="Recursively search for Python files",
262
+ ),
263
+ dry_run: bool = typer.Option(
264
+ False,
265
+ "--dry-run",
266
+ help="Only list functions without executing them",
267
+ ),
268
+ save_runs: bool = typer.Option(
269
+ False,
270
+ "--save-runs",
271
+ help="Save each evaluation run result to a JSON file",
272
+ ),
273
+ output_dir: Path = typer.Option(
274
+ Path("./eval_runs"),
275
+ "--output-dir",
276
+ help="Directory where run results are saved",
277
+ ),
278
+ ) -> None:
279
+ """
280
+ Find and execute all functions decorated with @eval_run.
281
+ """
282
+ files = directory.rglob("*.py") if recursive else directory.glob("*.py")
283
+ TARGET_DECORATOR = eval_run.__name__
284
+
285
+ executed_any = False
286
+
287
+ if save_runs and not dry_run:
288
+ output_dir.mkdir(parents=True, exist_ok=True)
289
+
290
+ for file in sorted(files):
291
+ functions = find_decorated_functions(file, TARGET_DECORATOR)
292
+ if not functions:
293
+ continue
294
+
295
+ typer.echo(f"\n📄 {file}")
296
+
297
+ module = None
298
+ if not dry_run:
299
+ try:
300
+ module = load_module_from_path(file)
301
+ except Exception as e:
302
+ typer.echo(f" ❌ Failed to import module: {e}")
303
+ continue
304
+
305
+ for fn_name in functions:
306
+ typer.echo(f" ▶ {fn_name}")
307
+
308
+ if dry_run:
309
+ continue
310
+
311
+ try:
312
+ func = getattr(module, fn_name)
313
+ result = func()
314
+
315
+ eval_type = validate_eval_result(result, fn_name)
316
+ if not eval_type:
317
+ continue
318
+
319
+ client = DeepEvalClient()
320
+
321
+ if eval_type == "text_eval":
322
+ resp = client.text_eval(config=result)
323
+ else:
324
+ resp = client.schema_eval(config=result)
325
+
326
+ typer.echo(f"{resp}")
327
+ executed_any = True
328
+ typer.echo(f" ✅ Returned {type(result).__name__}")
329
+
330
+ if save_runs:
331
+ run_id = (
332
+ resp["id"]
333
+ if resp is not None and "id" in resp
334
+ else uuid.uuid4().hex[:8]
335
+ )
336
+
337
+ output_path = output_dir / f"run_{fn_name}_{run_id}.json"
338
+
339
+ def to_json_safe(obj: Any) -> Any:
340
+ if hasattr(obj, "model_dump"):
341
+ return obj.model_dump()
342
+ if hasattr(obj, "dict"):
343
+ return obj.dict()
344
+ return obj
345
+
346
+ with output_path.open("w", encoding="utf-8") as f:
347
+ json.dump(
348
+ to_json_safe(resp),
349
+ f,
350
+ indent=2,
351
+ ensure_ascii=False,
352
+ )
353
+
354
+ typer.echo(f" 💾 Saved run → {output_path}")
355
+
356
+ except Exception as e:
357
+ typer.echo(f" ❌ Execution failed: {e}")
358
+
359
+ if not executed_any and not dry_run:
360
+ typer.echo("\nNo @eval_run functions executed.")
361
+
362
+
363
+ def _print_and_save(
364
+ resp: dict,
365
+ pretty: bool,
366
+ out: Path | None,
367
+ overwrite: bool,
368
+ ) -> None:
369
+ if pretty:
370
+ typer.echo(typer.style("📊 Result:", bold=True))
371
+ pprint(resp)
372
+ else:
373
+ typer.echo(resp)
374
+
375
+ if out is None:
376
+ return
4
377
 
5
- app = typer.Typer()
378
+ if out.exists() and not overwrite:
379
+ typer.echo(f"❌ File already exists: {out} (use --overwrite to replace)")
380
+ raise typer.Exit(code=1)
381
+
382
+ out.parent.mkdir(parents=True, exist_ok=True)
383
+
384
+ with out.open("w", encoding="utf-8") as f:
385
+ json.dump(resp, f, indent=2, ensure_ascii=False)
386
+
387
+ typer.echo(f"💾 Result saved to {out}")
388
+
389
+
390
+ @app.command()
391
+ def compare_testcases(
392
+ testcase_a_id: int = typer.Argument(
393
+ ...,
394
+ help="ID of the first testcase",
395
+ ),
396
+ testcase_b_id: int = typer.Argument(
397
+ ...,
398
+ help="ID of the second testcase",
399
+ ),
400
+ pretty: bool = typer.Option(
401
+ True,
402
+ "--pretty/--raw",
403
+ help="Pretty-print the response",
404
+ ),
405
+ raise_exception: bool = typer.Option(
406
+ False,
407
+ "--raise",
408
+ help="Raise HTTP exceptions instead of swallowing them",
409
+ ),
410
+ out: Path | None = typer.Option(
411
+ None,
412
+ "-o",
413
+ "--out",
414
+ help="Optional file path to save the result as JSON",
415
+ ),
416
+ overwrite: bool = typer.Option(
417
+ False,
418
+ "--overwrite",
419
+ help="Overwrite output file if it already exists",
420
+ ),
421
+ ) -> None:
422
+ """
423
+ Compare two DeepEval evaluation testcases.
424
+ """
425
+ client = DeepEvalClient()
426
+
427
+ typer.echo(f"🔍 Comparing testcases {testcase_a_id} ↔ {testcase_b_id}")
428
+
429
+ try:
430
+ resp = client.compare_testcases(
431
+ testcase_a_id=testcase_a_id,
432
+ testcase_b_id=testcase_b_id,
433
+ raise_exception=raise_exception,
434
+ )
435
+ except Exception as e:
436
+ typer.echo(f"❌ Request failed: {e}")
437
+ raise typer.Exit(code=1)
438
+
439
+ if not resp:
440
+ typer.echo("⚠️ No response received")
441
+ raise typer.Exit(code=1)
442
+ _print_and_save(resp, pretty, out, overwrite)
443
+
444
+
445
+ @app.command()
446
+ def compare_label_latest(
447
+ label_a: str = typer.Argument(
448
+ ...,
449
+ help="First label (latest run will be used)",
450
+ ),
451
+ label_b: str = typer.Argument(
452
+ ...,
453
+ help="Second label (latest run will be used)",
454
+ ),
455
+ pretty: bool = typer.Option(
456
+ True,
457
+ "--pretty/--raw",
458
+ help="Pretty-print the response",
459
+ ),
460
+ raise_exception: bool = typer.Option(
461
+ False,
462
+ "--raise",
463
+ help="Raise HTTP exceptions instead of swallowing them",
464
+ ),
465
+ out: Path | None = typer.Option(
466
+ None,
467
+ "-o",
468
+ "--out",
469
+ help="Optional file path to save the result as JSON",
470
+ ),
471
+ overwrite: bool = typer.Option(
472
+ False,
473
+ "--overwrite",
474
+ help="Overwrite output file if it already exists",
475
+ ),
476
+ ) -> None:
477
+ """
478
+ Compare the latest evaluation runs for two labels.
479
+ """
480
+ client = DeepEvalClient()
481
+
482
+ typer.echo(f"🔍 Comparing latest runs: '{label_a}' ↔ '{label_b}'")
483
+
484
+ try:
485
+ resp = client.compare_latest_by_labels(
486
+ label_a=label_a,
487
+ label_b=label_b,
488
+ raise_exception=raise_exception,
489
+ )
490
+ except Exception as e:
491
+ typer.echo(f"❌ Request failed: {e}")
492
+ raise typer.Exit(code=1)
493
+
494
+ if not resp:
495
+ typer.echo("⚠️ No response received")
496
+ raise typer.Exit(code=1)
497
+
498
+ _print_and_save(resp, pretty, out, overwrite)
6
499
 
7
500
 
8
501
  @app.command()
9
- def read(file: Path):
10
- """Read a Python file"""
11
- if file.suffix != ".py":
12
- raise typer.BadParameter("Must be a .py file")
13
- typer.echo(file.read_text())
502
+ def compare_last(
503
+ label: str = typer.Argument(
504
+ ...,
505
+ help="Label whose last two runs will be compared",
506
+ ),
507
+ pretty: bool = typer.Option(
508
+ True,
509
+ "--pretty/--raw",
510
+ help="Pretty-print the response",
511
+ ),
512
+ raise_exception: bool = typer.Option(
513
+ False,
514
+ "--raise",
515
+ help="Raise HTTP exceptions instead of swallowing them",
516
+ ),
517
+ out: Path | None = typer.Option(
518
+ None,
519
+ "-o",
520
+ "--out",
521
+ help="Optional file path to save the result as JSON",
522
+ ),
523
+ overwrite: bool = typer.Option(
524
+ False,
525
+ "--overwrite",
526
+ help="Overwrite output file if it already exists",
527
+ ),
528
+ ) -> None:
529
+ """
530
+ Compare the last two evaluation runs of a label.
531
+ """
532
+ client = DeepEvalClient()
533
+
534
+ typer.echo(f"🔍 Comparing last two runs for label '{label}'")
14
535
 
536
+ try:
537
+ resp = client.compare_last_two_by_label(
538
+ label=label,
539
+ raise_exception=raise_exception,
540
+ )
541
+ except Exception as e:
542
+ typer.echo(f"❌ Request failed: {e}")
543
+ raise typer.Exit(code=1)
15
544
 
16
- def main():
545
+ if not resp:
546
+ typer.echo("⚠️ No response received")
547
+ raise typer.Exit(code=1)
548
+
549
+ _print_and_save(resp, pretty, out, overwrite)
550
+
551
+
552
+ def main() -> None:
17
553
  app()
554
+
555
+
556
+ if __name__ == "__main__":
557
+ main()
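
For orientation, the new `run` command scans `./eval`, imports each module, calls every function decorated with `@eval_run`, and dispatches the returned config through `DeepEvalClient.text_eval` or `DeepEvalClient.schema_eval` depending on its `__eval_config__` marker. A minimal sketch of a module it could discover follows; the empty `data`/`metrics` lists are placeholders only, since the `TextInputItem` and `MetricConfig` shapes are not part of this diff.

# eval/example_eval.py -- hypothetical module picked up by `rakam_eval run`
from rakam_eval_sdk.decorators import eval_run
from rakam_eval_sdk.schema import EvalConfig


@eval_run
def baseline_text_eval() -> EvalConfig:
    # EvalConfig.__eval_config__ == "text_eval", so the CLI routes this
    # result to DeepEvalClient.text_eval(config=...).
    return EvalConfig(
        component="qa-bot",   # illustrative component name
        label="baseline",     # replaces the old `version` field
        data=[],              # would hold TextInputItem entries in a real run
        metrics=[],           # would hold MetricConfig entries in a real run
    )
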
rakam_eval_sdk/client.py CHANGED
@@ -1,6 +1,6 @@
1
1
  import os
2
2
  import random
3
- from typing import Any, List, Optional, Union, cast
3
+ from typing import Any, Dict, List, Optional, Union, cast, overload
4
4
 
5
5
  import requests
6
6
 
@@ -74,19 +74,202 @@ class DeepEvalClient:
74
74
  raise
75
75
  return {"error": "Invalid JSON response", "raw": resp.text}
76
76
 
77
+ def _get(
78
+ self,
79
+ endpoint: str,
80
+ params: dict,
81
+ raise_exception: bool = False,
82
+ ) -> Optional[dict]:
83
+ """Internal helper to send GET requests with standard headers and error handling."""
84
+ url = f"{self.base_url}{endpoint}"
85
+ headers = {
86
+ "accept": "application/json",
87
+ "X-API-Token": self.api_token,
88
+ }
89
+
90
+ try:
91
+ resp = requests.get(
92
+ url,
93
+ headers=headers,
94
+ params=params,
95
+ timeout=self.timeout,
96
+ )
97
+ if raise_exception:
98
+ resp.raise_for_status()
99
+ except requests.RequestException as e:
100
+ if raise_exception:
101
+ raise
102
+ return {"error": str(e)}
103
+
104
+ try:
105
+ return cast(dict, resp.json())
106
+ except ValueError:
107
+ if raise_exception:
108
+ raise
109
+ return {"error": "Invalid JSON response", "raw": resp.text}
110
+
111
+ def list_evaluation_testcases(
112
+ self,
113
+ *,
114
+ limit: int = 10,
115
+ offset: int = 0,
116
+ raise_exception: bool = False,
117
+ ) -> Optional[List[Dict]]:
118
+ """
119
+ List evaluation testcases for the current API token only.
120
+ Sorted by created_at DESC (newest first).
121
+ """
122
+ return self._get(
123
+ "/eval-framework/deepeval/evaluation-testcases/token",
124
+ params={
125
+ "limit": limit,
126
+ "offset": offset,
127
+ },
128
+ raise_exception=raise_exception,
129
+ )
130
+
131
+ def get_evaluation_testcase_by_id(
132
+ self,
133
+ testcase_id: int,
134
+ *,
135
+ raise_exception: bool = False,
136
+ ) -> Optional[Dict]:
137
+ """
138
+ Fetch a single evaluation testcase by numeric ID.
139
+ """
140
+ return self._get(
141
+ f"/eval-framework/deepeval/id/{testcase_id}",
142
+ params={},
143
+ raise_exception=raise_exception,
144
+ )
145
+
146
+ def get_evaluation_testcase_by_unique_id(
147
+ self,
148
+ unique_id: str,
149
+ *,
150
+ raise_exception: bool = False,
151
+ ) -> Optional[Dict]:
152
+ """
153
+ Fetch a single evaluation testcase by unique_id.
154
+ """
155
+ return self._get(
156
+ f"/eval-framework/deepeval/uid/{unique_id}",
157
+ params={},
158
+ raise_exception=raise_exception,
159
+ )
160
+
161
+ def get_evaluation_testcase(
162
+ self,
163
+ *,
164
+ id: Optional[int] = None,
165
+ unique_id: Optional[str] = None,
166
+ raise_exception: bool = False,
167
+ ) -> Optional[Dict]:
168
+ if id is not None:
169
+ return self.get_evaluation_testcase_by_id(
170
+ id, raise_exception=raise_exception
171
+ )
172
+ if unique_id is not None:
173
+ return self.get_evaluation_testcase_by_unique_id(
174
+ unique_id, raise_exception=raise_exception
175
+ )
176
+ raise ValueError("Either id or unique_id must be provided")
177
+
178
+ def compare_testcases(
179
+ self,
180
+ *,
181
+ testcase_a_id: int,
182
+ testcase_b_id: int,
183
+ raise_exception: bool = False,
184
+ ) -> Optional[dict]:
185
+ """
186
+ Compare two evaluation testcases.
187
+ """
188
+ return self._get(
189
+ "/eval-framework/deepeval/evaluation-testcases/compare",
190
+ params={
191
+ "testcase_a_id": testcase_a_id,
192
+ "testcase_b_id": testcase_b_id,
193
+ },
194
+ raise_exception=raise_exception,
195
+ )
196
+
197
+ def compare_latest_by_labels(
198
+ self,
199
+ *,
200
+ label_a: str,
201
+ label_b: str,
202
+ raise_exception: bool = False,
203
+ ) -> Optional[dict]:
204
+ """
205
+ Compare the latest evaluation testcases for two labels.
206
+ """
207
+ return self._get(
208
+ "/eval-framework/deepeval/evaluation-testcases/compare-latest",
209
+ params={
210
+ "label_a": label_a,
211
+ "label_b": label_b,
212
+ },
213
+ raise_exception=raise_exception,
214
+ )
215
+
216
+ def compare_last_two_by_label(
217
+ self,
218
+ *,
219
+ label: str,
220
+ raise_exception: bool = False,
221
+ ) -> Optional[dict]:
222
+ """
223
+ Compare the last two evaluation testcases for a given label.
224
+ """
225
+ return self._get(
226
+ "/eval-framework/deepeval/evaluation-testcases/compare-last-two",
227
+ params={
228
+ "label": label,
229
+ },
230
+ raise_exception=raise_exception,
231
+ )
232
+
233
+ @overload
234
+ def text_eval(
235
+ self,
236
+ config: EvalConfig,
237
+ *,
238
+ raise_exception: bool = False,
239
+ ) -> Optional[dict]: ...
240
+
241
+ @overload
77
242
  def text_eval(
78
243
  self,
244
+ *,
79
245
  data: List[TextInputItem],
80
246
  metrics: List[MetricConfig],
247
+ component: str = "unknown",
248
+ label: str | None = None,
81
249
  raise_exception: bool = False,
250
+ ) -> Optional[dict]: ...
251
+
252
+ def text_eval(
253
+ self,
254
+ config: EvalConfig | None = None,
255
+ *,
256
+ data: List[TextInputItem] | None = None,
257
+ metrics: List[MetricConfig] | None = None,
82
258
  component: str = "unknown",
83
- version: Union[str, None] = None,
259
+ label: str | None = None,
260
+ raise_exception: bool = False,
84
261
  ) -> Optional[dict]:
85
- """Run synchronous text evaluation."""
86
- payload = EvalConfig.model_construct(
87
- data=data, metrics=metrics, component=component, version=version
88
- ).model_dump()
89
- return self._request("/deepeval/text-eval", payload, raise_exception)
262
+ if config is None:
263
+ config = EvalConfig(
264
+ data=data,
265
+ metrics=metrics,
266
+ component=component,
267
+ label=label,
268
+ )
269
+
270
+ return self._request(
271
+ "/deepeval/text-eval", config.model_dump(), raise_exception
272
+ )
90
273
 
91
274
  def text_eval_background(
92
275
  self,
@@ -94,27 +277,61 @@ class DeepEvalClient:
94
277
  metrics: List[MetricConfig],
95
278
  raise_exception: bool = False,
96
279
  component: str = "unknown",
97
- version: Union[str, None] = None,
280
+ label: Union[str, None] = None,
98
281
  ) -> Optional[dict]:
99
282
  """Run background text evaluation (async job)."""
100
283
  payload = EvalConfig.model_construct(
101
- data=data, metrics=metrics, component=component, version=version
284
+ data=data, metrics=metrics, component=component, label=label
102
285
  ).model_dump()
103
286
  return self._request("/deepeval/text-eval/background", payload, raise_exception)
104
287
 
288
+ @overload
105
289
  def schema_eval(
106
290
  self,
291
+ *,
107
292
  data: List[SchemaInputItem],
108
293
  metrics: List[SchemaMetricConfig],
294
+ component: str = "unknown",
295
+ label: str | None = None,
296
+ raise_exception: bool = False,
297
+ ) -> Optional[dict]: ...
298
+
299
+ @overload
300
+ def schema_eval(
301
+ self,
302
+ config: SchemaEvalConfig,
303
+ *,
109
304
  raise_exception: bool = False,
305
+ ) -> Optional[dict]: ...
306
+
307
+ def schema_eval(
308
+ self,
309
+ config: SchemaEvalConfig | None = None,
310
+ *,
311
+ data: List[SchemaInputItem] | None = None,
312
+ metrics: List[SchemaMetricConfig] | None = None,
110
313
  component: str = "unknown",
111
- version: Union[str, None] = None,
314
+ label: str | None = None,
315
+ raise_exception: bool = False,
112
316
  ) -> Optional[dict]:
113
- """Run synchronous schema evaluation."""
114
- payload = SchemaEvalConfig.model_construct(
115
- data=data, metrics=metrics, component=component, version=version
116
- ).model_dump()
117
- return self._request("/deepeval/schema-eval", payload, raise_exception)
317
+ if config is None:
318
+ if data is None or metrics is None:
319
+ raise ValueError(
320
+ "Either `config` or both `data` and `metrics` must be provided"
321
+ )
322
+
323
+ config = SchemaEvalConfig(
324
+ data=data,
325
+ metrics=metrics,
326
+ component=component,
327
+ label=label,
328
+ )
329
+
330
+ return self._request(
331
+ "/deepeval/schema-eval",
332
+ config.model_dump(),
333
+ raise_exception,
334
+ )
118
335
 
119
336
  def schema_eval_background(
120
337
  self,
@@ -122,11 +339,11 @@ class DeepEvalClient:
122
339
  metrics: List[SchemaMetricConfig],
123
340
  raise_exception: bool = False,
124
341
  component: str = "unknown",
125
- version: Union[str, None] = None,
342
+ label: Union[str, None] = None,
126
343
  ) -> Optional[dict]:
127
344
  """Run background schema evaluation (async job)."""
128
345
  payload = SchemaEvalConfig.model_construct(
129
- data=data, metrics=metrics, component=component, version=version
346
+ data=data, metrics=metrics, component=component, label=label
130
347
  ).model_dump()
131
348
  return self._request(
132
349
  "/deepeval/schema-eval/background", payload, raise_exception
@@ -139,13 +356,17 @@ class DeepEvalClient:
139
356
  chance: float,
140
357
  raise_exception: bool = False,
141
358
  component: str = "unknown",
142
- version: Union[str, None] = None,
359
+ label: Union[str, None] = None,
143
360
  ) -> Optional[dict]:
144
361
  """Randomly run text_eval based on a probability between 0 and 1."""
145
362
  self._validate_chance(chance)
146
363
  return (
147
364
  self.text_eval(
148
- data, metrics, raise_exception, component=component, version=version
365
+ data=data,
366
+ metrics=metrics,
367
+ raise_exception=raise_exception,
368
+ component=component,
369
+ label=label,
149
370
  )
150
371
  if random.random() <= chance
151
372
  else None
@@ -158,13 +379,13 @@ class DeepEvalClient:
158
379
  chance: float,
159
380
  raise_exception: bool = False,
160
381
  component: str = "unknown",
161
- version: Union[str, None] = None,
382
+ label: Union[str, None] = None,
162
383
  ) -> Optional[dict]:
163
384
  """Randomly run text_eval_background based on a probability between 0 and 1."""
164
385
  self._validate_chance(chance)
165
386
  return (
166
387
  self.text_eval_background(
167
- data, metrics, raise_exception, component=component, version=version
388
+ data, metrics, raise_exception, component=component, label=label
168
389
  )
169
390
  if random.random() <= chance
170
391
  else None
@@ -177,13 +398,17 @@ class DeepEvalClient:
177
398
  chance: float,
178
399
  raise_exception: bool = False,
179
400
  component: str = "unknown",
180
- version: Union[str, None] = None,
401
+ label: Union[str, None] = None,
181
402
  ) -> Optional[dict]:
182
403
  """Randomly run schema_eval based on a probability between 0 and 1."""
183
404
  self._validate_chance(chance)
184
405
  return (
185
406
  self.schema_eval(
186
- data, metrics, raise_exception, component=component, version=version
407
+ data=data,
408
+ metrics=metrics,
409
+ raise_exception=raise_exception,
410
+ component=component,
411
+ label=label,
187
412
  )
188
413
  if random.random() <= chance
189
414
  else None
@@ -196,13 +421,13 @@ class DeepEvalClient:
196
421
  chance: float,
197
422
  raise_exception: bool = False,
198
423
  component: str = "unknown",
199
- version: Union[str, None] = None,
424
+ label: Union[str, None] = None,
200
425
  ) -> Optional[dict]:
201
426
  """Randomly run text_eval_background based on a probability between 0 and 1."""
202
427
  self._validate_chance(chance)
203
428
  return (
204
429
  self.schema_eval_background(
205
- data, metrics, raise_exception, component=component, version=version
430
+ data, metrics, raise_exception, component=component, label=label
206
431
  )
207
432
  if random.random() <= chance
208
433
  else None
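
As a quick illustration of the reworked client surface (a sketch based only on the signatures above, not on package documentation): `text_eval` and `schema_eval` now accept either a prebuilt config object or keyword-only `data`/`metrics`, and three comparison helpers wrap the new GET endpoints. The constructor arguments of `DeepEvalClient` are not shown in this diff; the CLI instantiates it with no arguments, so the sketch does the same.

from rakam_eval_sdk.client import DeepEvalClient
from rakam_eval_sdk.schema import EvalConfig

client = DeepEvalClient()  # base URL / API token come from the client's defaults (not shown here)

# Config-object form, as used by the CLI `run` command.
resp = client.text_eval(
    config=EvalConfig(component="qa-bot", label="baseline", data=[], metrics=[])
)

# Keyword form; `data` and `metrics` are keyword-only in the new overloads.
resp = client.text_eval(data=[], metrics=[], component="qa-bot", label="baseline")

# Comparison helpers added in 0.2.0 (all keyword-only).
by_id = client.compare_testcases(testcase_a_id=1, testcase_b_id=2)
latest = client.compare_latest_by_labels(label_a="baseline", label_b="candidate")
last_two = client.compare_last_two_by_label(label="baseline")
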
rakam_eval_sdk/decorators.py ADDED
@@ -0,0 +1,70 @@
1
+ import functools
2
+ import os
3
+ import time
4
+ from typing import Callable, Dict, Optional, ParamSpec, TypeVar, Union, overload
5
+
6
+ import psutil
7
+
8
+ P = ParamSpec("P")
9
+ R = TypeVar("R")
10
+
11
+
12
+ @overload
13
+ def eval_run(func: Callable[P, R]) -> Callable[P, R]: ...
14
+
15
+
16
+ @overload
17
+ def eval_run(
18
+ func: None = None,
19
+ **decorator_kwargs: Dict[str, object],
20
+ ) -> Callable[[Callable[P, R]], Callable[P, R]]: ...
21
+
22
+
23
+ def eval_run(
24
+ func: Optional[Callable[P, R]] = None,
25
+ **decorator_kwargs: Dict[str, object],
26
+ ) -> Union[
27
+ Callable[P, R],
28
+ Callable[[Callable[P, R]], Callable[P, R]],
29
+ ]:
30
+ # used as @eval_run
31
+ if callable(func):
32
+ return _wrap(func)
33
+
34
+ # used as @eval_run(...)
35
+ def decorator(real_func: Callable[P, R]) -> Callable[P, R]:
36
+ return _wrap(real_func)
37
+
38
+ return decorator
39
+
40
+
41
+ def _wrap(func: Callable[P, R]) -> Callable[P, R]:
42
+ @functools.wraps(func)
43
+ def inner(*args: P.args, **kwargs: P.kwargs) -> R:
44
+ process = psutil.Process(os.getpid())
45
+
46
+ start_time = time.perf_counter()
47
+ start_cpu = process.cpu_times()
48
+ start_mem = process.memory_info().rss
49
+
50
+ try:
51
+ return func(*args, **kwargs)
52
+ finally:
53
+ end_time = time.perf_counter()
54
+ end_cpu = process.cpu_times()
55
+ end_mem = process.memory_info().rss
56
+
57
+ elapsed = end_time - start_time
58
+ cpu_used = (end_cpu.user + end_cpu.system) - (
59
+ start_cpu.user + start_cpu.system
60
+ )
61
+ mem_delta_mb = (end_mem - start_mem) / (1024 * 1024)
62
+
63
+ print(
64
+ f"[eval_run] {func.__module__}.{func.__name__} | "
65
+ f"time={elapsed:.4f}s | "
66
+ f"cpu={cpu_used:.4f}s | "
67
+ f"mem_delta={mem_delta_mb:.2f}MB"
68
+ )
69
+
70
+ return inner
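
The decorator only measures and prints resource usage around the wrapped call; it does not alter the return value. A small sketch of what that looks like (the printed numbers are illustrative, the format comes from `_wrap` above):

import time

from rakam_eval_sdk.decorators import eval_run


@eval_run
def slow_step() -> str:
    time.sleep(0.2)
    return "done"


slow_step()
# Prints something along the lines of:
# [eval_run] __main__.slow_step | time=0.2003s | cpu=0.0001s | mem_delta=0.01MB
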
rakam_eval_sdk/schema.py CHANGED
@@ -39,7 +39,7 @@ class CorrectnessConfig(MetricConfigBase):
39
39
  "Minor formatting differences like '$1,250.00' vs '$1250.00' are acceptable."
40
40
  ]
41
41
  )
42
- criteria: Optional[str] = (None,)
42
+ criteria: Optional[str] = None
43
43
  params: List[Literal["actual_output", "expected_output"]] = Field(
44
44
  default=["actual_output", "expected_output"]
45
45
  )
@@ -116,14 +116,18 @@ class SchemaInputItem(InputItem):
116
116
 
117
117
 
118
118
  class EvalConfig(BaseModel):
119
+ __eval_config__ = "text_eval"
120
+ unique_id: Union[str, None] = None
119
121
  component: str = "unknown"
120
- version: Union[str, None] = None
122
+ label: Union[str, None] = None
121
123
  data: List[TextInputItem]
122
124
  metrics: List[MetricConfig] = Field(default_factory=list)
123
125
 
124
126
 
125
127
  class SchemaEvalConfig(BaseModel):
128
+ __eval_config__ = "schema_eval"
126
129
  component: str = "unknown"
127
- version: Union[str, None] = None
130
+ unique_id: Union[str, None] = None
131
+ label: Union[str, None] = None
128
132
  data: List[SchemaInputItem]
129
133
  metrics: List[SchemaMetricConfig] = Field(default_factory=list)
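
The practical effect of the schema change is that runs are now tagged with `label` (plus an optional `unique_id`) instead of `version`, and each config class carries an `__eval_config__` marker the CLI uses for routing. A before/after sketch with illustrative values:

from rakam_eval_sdk.schema import EvalConfig

# 0.1.x (no longer valid in 0.2.0):
#   EvalConfig(component="qa-bot", version="v3", data=[...], metrics=[...])

# 0.2.0:
cfg = EvalConfig(component="qa-bot", label="v3", data=[], metrics=[])
assert cfg.__eval_config__ == "text_eval"  # checked by the CLI before calling text_eval()
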
rakam_eval_sdk/utils/decorator_utils.py ADDED
@@ -0,0 +1,58 @@
1
+ import ast
2
+ import importlib
3
+ import importlib.util
4
+ from pathlib import Path
5
+ from types import ModuleType
6
+ from typing import List
7
+
8
+
9
+ class DecoratedFunctionVisitor(ast.NodeVisitor):
10
+ def __init__(self, decorator_name: str):
11
+ self.decorator_name = decorator_name
12
+ self.results: List[str] = []
13
+
14
+ def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
15
+ for deco in node.decorator_list:
16
+ if self._matches(deco):
17
+ self.results.append(node.name)
18
+ self.generic_visit(node)
19
+
20
+ def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
21
+ for deco in node.decorator_list:
22
+ if self._matches(deco):
23
+ self.results.append(node.name)
24
+ self.generic_visit(node)
25
+
26
+ def _matches(self, deco: ast.expr) -> bool:
27
+ # @deco
28
+ if isinstance(deco, ast.Name):
29
+ return deco.id == self.decorator_name
30
+
31
+ # @module.deco
32
+ if isinstance(deco, ast.Attribute):
33
+ return deco.attr == self.decorator_name
34
+
35
+ # @deco(...)
36
+ if isinstance(deco, ast.Call):
37
+ return self._matches(deco.func)
38
+
39
+ return False
40
+
41
+
42
+ def find_decorated_functions(
43
+ file_path: Path,
44
+ decorator_name: str,
45
+ ) -> List[str]:
46
+ tree = ast.parse(file_path.read_text(encoding="utf-8"))
47
+ visitor = DecoratedFunctionVisitor(decorator_name)
48
+ visitor.visit(tree)
49
+ return visitor.results
50
+
51
+
52
+ def load_module_from_path(file_path: Path) -> ModuleType:
53
+ spec = importlib.util.spec_from_file_location(file_path.stem, file_path)
54
+ if spec is None or spec.loader is None:
55
+ raise ImportError(f"Cannot import {file_path}")
56
+ module = importlib.util.module_from_spec(spec)
57
+ spec.loader.exec_module(module)
58
+ return module
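
These two helpers split discovery from execution: `find_decorated_functions` walks the AST without importing anything, and `load_module_from_path` performs the actual import. A usage sketch (the file path is hypothetical):

from pathlib import Path

from rakam_eval_sdk.utils.decorator_utils import (
    find_decorated_functions,
    load_module_from_path,
)

path = Path("eval/example_eval.py")

# Static discovery: parses the source with `ast`, no import side effects.
names = find_decorated_functions(path, "eval_run")

# Execution: import the module and call each discovered function,
# which is essentially what the `run` command does.
module = load_module_from_path(path)
for name in names:
    getattr(module, name)()
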
rakam_eval_sdk-0.1.16.dist-info/METADATA → rakam_eval_sdk-0.2.0.dist-info/METADATA CHANGED
@@ -1,9 +1,11 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: rakam-eval-sdk
3
- Version: 0.1.16
3
+ Version: 0.2.0
4
4
  Summary: Evaluation Framework SDK
5
5
  Author: Mohamed Bachar Touil
6
6
  License: MIT
7
+ Requires-Dist: dotenv>=0.9.9
8
+ Requires-Dist: psutil>=7.2.1
7
9
  Requires-Dist: pydantic>=2.10.6
8
10
  Requires-Dist: requests
9
11
  Requires-Dist: typer>=0.20.1
rakam_eval_sdk-0.2.0.dist-info/RECORD ADDED
@@ -0,0 +1,10 @@
1
+ rakam_eval_sdk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ rakam_eval_sdk/cli.py,sha256=0Ym7C83xirGEf0YCh4-agwfjYPJ3yieeS02nXJ_iE-Y,14486
3
+ rakam_eval_sdk/client.py,sha256=JQ-vCJmMLqXql7nNGYBy5dlkZsCq05gOOQhucwwexC8,13060
4
+ rakam_eval_sdk/decorators.py,sha256=_9VFQmoYWd6cqnNryZJWEwYHQRxY7vIOam4z45zBk3c,1794
5
+ rakam_eval_sdk/schema.py,sha256=P4LlnaInXWTq-ve6qPTTxPyzmj3j_1gcqV9i7CYRYec,3608
6
+ rakam_eval_sdk/utils/decorator_utils.py,sha256=g0TjXtG9o4hwhUAFP8GJsXAkjhZhzeseTAg-YBFjj2g,1763
7
+ rakam_eval_sdk-0.2.0.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
8
+ rakam_eval_sdk-0.2.0.dist-info/entry_points.txt,sha256=tNhwmM_UGELb3h0zOfgCrtTheUkP-k8jGv0rTOfRSps,56
9
+ rakam_eval_sdk-0.2.0.dist-info/METADATA,sha256=1l6TbYR49zIKpDyNjgqJFLQ_b1mAvBHKizAWF085-9M,6016
10
+ rakam_eval_sdk-0.2.0.dist-info/RECORD,,
rakam_eval_sdk-0.2.0.dist-info/entry_points.txt ADDED
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ rakam_eval = rakam_eval_sdk.cli:main
3
+
@@ -1,8 +0,0 @@
1
- rakam_eval_sdk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- rakam_eval_sdk/cli.py,sha256=dn1KXh-_VLpIvLFHnpHPfAL33ICoAJ9Y2iCOikkJcxY,277
3
- rakam_eval_sdk/client.py,sha256=q-Y11maLVKaEnq4OSyFCqrP3JgFS1xpyp9-bZhFssIA,7123
4
- rakam_eval_sdk/schema.py,sha256=MQfF0SEHf2wzeXJNTsMs-yDbN0vZJQbN_crfpPXsTk8,3467
5
- rakam_eval_sdk-0.1.16.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
6
- rakam_eval_sdk-0.1.16.dist-info/entry_points.txt,sha256=NzE2wDRB4Kt-TblkjSD37abcfP4B5STBOyygEhGTLdU,51
7
- rakam_eval_sdk-0.1.16.dist-info/METADATA,sha256=s7N_RsRR87-6aQhNmCnuDeMDrL6ZOm0vr7iR2cS5FwU,5959
8
- rakam_eval_sdk-0.1.16.dist-info/RECORD,,
@@ -1,3 +0,0 @@
1
- [console_scripts]
2
- mycli = rakam_eval_sdk.cli:main
3
-