PyPI - atlas-eval - Versions diffs - 0.1.0__tar.gz - Mend

atlas-eval 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

atlas_eval-0.1.0/PKG-INFO +51 -0
atlas_eval-0.1.0/README.md +36 -0
atlas_eval-0.1.0/atlas_eval.egg-info/PKG-INFO +51 -0
atlas_eval-0.1.0/atlas_eval.egg-info/SOURCES.txt +7 -0
atlas_eval-0.1.0/atlas_eval.egg-info/dependency_links.txt +1 -0
atlas_eval-0.1.0/atlas_eval.egg-info/top_level.txt +1 -0
atlas_eval-0.1.0/atlas_eval.py +249 -0
atlas_eval-0.1.0/pyproject.toml +25 -0
atlas_eval-0.1.0/setup.cfg +4 -0

atlas_eval-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,51 @@
+Metadata-Version: 2.4
+Name: atlas-eval
+Version: 0.1.0
+Summary: Evaluate your AI agent or LLM/SLM on the Bharat AI Index / Atlas Agent Arena.
+Author: Atlas AI Labs
+License: MIT
+Project-URL: Homepage, https://github.com/atlas-ai-labs/bharat-ai-index
+Project-URL: Documentation, https://github.com/atlas-ai-labs/bharat-ai-index/tree/main/sdk
+Keywords: llm,agent,evaluation,benchmark,india,sovereign-ai,agno,langchain
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Intended Audience :: Developers
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+# atlas-eval (Python)
+Evaluate your AI agent / LLM on the Bharat AI Index — Atlas Agent Arena.
+> Requires a running Atlas instance (default `http://localhost:3000`). Start one with
+> `npm run dev` in the repo, or `docker run -p 3000:3000 ... bharat-ai-index`.
+## Install
+```bash
+# until published to PyPI:
+pip install -e sdk/python        # from the repo root
+# or copy sdk/python/atlas_eval.py into your project
+```
+## Use
+```python
+from atlas_eval import evaluate_agent, AccuracyEval, PerformanceEval, ReliabilityEval
+# whole-agent benchmark (sandboxed tasks, agentic score)
+def my_agent(ctx):
+    return {"tool": "fs_read", "args": {"path": "invoice_a.txt"}}
+report = evaluate_agent(my_agent, base_url="http://localhost:3000")
+# code-first evals (Agno-style)
+AccuracyEval(agent=lambda q: "New Delhi", input="Capital of India?",
+             expected_output="New Delhi").run(print_results=True)
+PerformanceEval(func=lambda: my_model("ping"), num_iterations=5).run(print_results=True)
+ReliabilityEval(tool_calls=["search"], expected_tool_calls=["search"]).run(print_results=True)
+```
+`PerformanceEval` and `ReliabilityEval` run fully locally (no server). `evaluate_agent`
+and `AccuracyEval` call the Atlas instance (and a configured judge model for accuracy).
+See the top-level [`sdk/README.md`](../README.md) for the protocol and JS equivalents.

atlas_eval-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,36 @@
+# atlas-eval (Python)
+Evaluate your AI agent / LLM on the Bharat AI Index — Atlas Agent Arena.
+> Requires a running Atlas instance (default `http://localhost:3000`). Start one with
+> `npm run dev` in the repo, or `docker run -p 3000:3000 ... bharat-ai-index`.
+## Install
+```bash
+# until published to PyPI:
+pip install -e sdk/python        # from the repo root
+# or copy sdk/python/atlas_eval.py into your project
+```
+## Use
+```python
+from atlas_eval import evaluate_agent, AccuracyEval, PerformanceEval, ReliabilityEval
+# whole-agent benchmark (sandboxed tasks, agentic score)
+def my_agent(ctx):
+    return {"tool": "fs_read", "args": {"path": "invoice_a.txt"}}
+report = evaluate_agent(my_agent, base_url="http://localhost:3000")
+# code-first evals (Agno-style)
+AccuracyEval(agent=lambda q: "New Delhi", input="Capital of India?",
+             expected_output="New Delhi").run(print_results=True)
+PerformanceEval(func=lambda: my_model("ping"), num_iterations=5).run(print_results=True)
+ReliabilityEval(tool_calls=["search"], expected_tool_calls=["search"]).run(print_results=True)
+```
+`PerformanceEval` and `ReliabilityEval` run fully locally (no server). `evaluate_agent`
+and `AccuracyEval` call the Atlas instance (and a configured judge model for accuracy).
+See the top-level [`sdk/README.md`](../README.md) for the protocol and JS equivalents.

atlas_eval-0.1.0/atlas_eval.egg-info/PKG-INFO ADDED Viewed

@@ -0,0 +1,51 @@
+Metadata-Version: 2.4
+Name: atlas-eval
+Version: 0.1.0
+Summary: Evaluate your AI agent or LLM/SLM on the Bharat AI Index / Atlas Agent Arena.
+Author: Atlas AI Labs
+License: MIT
+Project-URL: Homepage, https://github.com/atlas-ai-labs/bharat-ai-index
+Project-URL: Documentation, https://github.com/atlas-ai-labs/bharat-ai-index/tree/main/sdk
+Keywords: llm,agent,evaluation,benchmark,india,sovereign-ai,agno,langchain
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Intended Audience :: Developers
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+# atlas-eval (Python)
+Evaluate your AI agent / LLM on the Bharat AI Index — Atlas Agent Arena.
+> Requires a running Atlas instance (default `http://localhost:3000`). Start one with
+> `npm run dev` in the repo, or `docker run -p 3000:3000 ... bharat-ai-index`.
+## Install
+```bash
+# until published to PyPI:
+pip install -e sdk/python        # from the repo root
+# or copy sdk/python/atlas_eval.py into your project
+```
+## Use
+```python
+from atlas_eval import evaluate_agent, AccuracyEval, PerformanceEval, ReliabilityEval
+# whole-agent benchmark (sandboxed tasks, agentic score)
+def my_agent(ctx):
+    return {"tool": "fs_read", "args": {"path": "invoice_a.txt"}}
+report = evaluate_agent(my_agent, base_url="http://localhost:3000")
+# code-first evals (Agno-style)
+AccuracyEval(agent=lambda q: "New Delhi", input="Capital of India?",
+             expected_output="New Delhi").run(print_results=True)
+PerformanceEval(func=lambda: my_model("ping"), num_iterations=5).run(print_results=True)
+ReliabilityEval(tool_calls=["search"], expected_tool_calls=["search"]).run(print_results=True)
+```
+`PerformanceEval` and `ReliabilityEval` run fully locally (no server). `evaluate_agent`
+and `AccuracyEval` call the Atlas instance (and a configured judge model for accuracy).
+See the top-level [`sdk/README.md`](../README.md) for the protocol and JS equivalents.

atlas_eval-0.1.0/atlas_eval.egg-info/SOURCES.txt ADDED Viewed

@@ -0,0 +1,7 @@
+README.md
+atlas_eval.py
+pyproject.toml
+atlas_eval.egg-info/PKG-INFO
+atlas_eval.egg-info/SOURCES.txt
+atlas_eval.egg-info/dependency_links.txt
+atlas_eval.egg-info/top_level.txt

atlas_eval-0.1.0/atlas_eval.egg-info/dependency_links.txt ADDED Viewed

@@ -0,0 +1 @@
+

atlas_eval-0.1.0/atlas_eval.egg-info/top_level.txt ADDED Viewed

@@ -0,0 +1 @@
+atlas_eval

atlas_eval-0.1.0/atlas_eval.py ADDED Viewed

@@ -0,0 +1,249 @@
+"""Atlas eval SDK (Python) — evaluate YOUR agentic system in a few lines.
+You write your agent as one function: given the step context, return the next action.
+`evaluate_agent` spins up a tiny local endpoint for it, tells a running Atlas instance
+to benchmark it on the Agent Arena, and returns the full scorecard. No framework lock-in
+— wrap LangChain, LangGraph, Agno, or your own loop.
+    from atlas_eval import evaluate_agent
+    def my_agent(ctx):
+        # ctx = { goal, tools, history, lastObservation, step, maxSteps }
+        return {"tool": "fs_read", "args": {"path": "invoice_a.txt"}}
+    report = evaluate_agent(my_agent, base_url="http://localhost:3000")
+    print(report["agenticScore"], report["successRate"])
+Only depends on the Python standard library.
+"""
+from __future__ import annotations
+import json
+import socketserver
+import threading
+import time
+import urllib.error
+import urllib.request
+from dataclasses import dataclass
+from http.server import BaseHTTPRequestHandler
+from typing import Any, Callable, Dict, List, Optional
+__version__ = "0.1.0"
+_ATLAS_HINT = (
+    "Is Atlas running? Start it with `npm run dev` (http://localhost:3000) or "
+    "`docker run -p 3000:3000 bharat-ai-index`, then pass base_url=…"
+)
+def _post_json(url: str, payload: dict, timeout: int = 120) -> dict:
+    """POST JSON with friendly errors for the common 'server not running' case."""
+    req = urllib.request.Request(
+        url, data=json.dumps(payload).encode(), headers={"Content-Type": "application/json"})
+    try:
+        with urllib.request.urlopen(req, timeout=timeout) as resp:
+            return json.loads(resp.read())
+    except urllib.error.HTTPError as e:
+        detail = e.read().decode("utf-8", "ignore")[:300]
+        raise RuntimeError(f"Atlas error HTTP {e.code} at {url}: {detail}") from None
+    except urllib.error.URLError as e:
+        raise RuntimeError(f"Could not reach Atlas at {url}. {_ATLAS_HINT} (reason: {e.reason})") from None
def check(base_url: str = "http://localhost:3000") -> dict:
    """Preflight probe: fetch Atlas' ``/api/health`` and return the decoded JSON.

    Any failure (connection, HTTP status, decoding) is reported as one
    ``RuntimeError`` that carries the start-up hint and the underlying error.
    """
    try:
        health_url = base_url.rstrip("/") + "/api/health"
        with urllib.request.urlopen(health_url, timeout=10) as response:
            body = response.read()
        return json.loads(body)
    except Exception as err:  # noqa: BLE001
        raise RuntimeError(f"Atlas not reachable at {base_url}. {_ATLAS_HINT} ({err})") from None
Context = Dict[str, Any]
Action = Dict[str, Any]
AgentFn = Callable[[Context], Action]


class _AgentServer(socketserver.TCPServer):
    """TCP server for the agent endpoint that can rebind a just-released port."""

    allow_reuse_address = True


def _make_agent_handler(decide: AgentFn) -> type:
    """Build the request-handler class that answers each POST by calling `decide`."""

    class Handler(BaseHTTPRequestHandler):
        def do_POST(self):  # noqa: N802
            # Everything that can fail on a single step lives inside the try:
            # a bad Content-Length, a malformed JSON body, an exception in the
            # agent, or an action that cannot be serialised. In all of those
            # cases a well-formed "finish" action is returned instead of
            # dropping the connection.
            try:
                length = int(self.headers.get("Content-Length") or 0)
                ctx = json.loads(self.rfile.read(length) or b"{}")
                action = decide(ctx)
                payload = json.dumps({"action": action}).encode()
            except Exception as exc:  # noqa: BLE001 - never crash the eval on a bad step
                fallback = {"tool": "finish", "args": {"answer": f"agent error: {exc}"}}
                payload = json.dumps({"action": fallback}).encode()
            self.send_response(200)
            self.send_header("Content-Type", "application/json")
            self.send_header("Content-Length", str(len(payload)))
            self.end_headers()
            self.wfile.write(payload)

        def log_message(self, *args):  # silence per-request logging
            return

    return Handler


def evaluate_agent(
    decide: AgentFn,
    base_url: str = "http://localhost:3000",
    label: str = "Python Agent",
    timeout: int = 600,
) -> Dict[str, Any]:
    """Run the Atlas agentic benchmark against your `decide` function and return the run.

    A temporary HTTP endpoint is bound to an ephemeral port on 127.0.0.1 and
    served from a daemon thread; its URL is posted to ``/api/agent/run`` on the
    Atlas instance. Because that URL is a loopback address, Atlas has to be
    able to reach this machine's 127.0.0.1.

    Args:
        decide: Called with the step context of each request; returns the next action.
        base_url: Root URL of the Atlas instance.
        label: Sent to Atlas as ``agentLabel``.
        timeout: Seconds to wait for the benchmark request to complete.

    Returns:
        The ``"run"`` entry of the Atlas response, or the whole response when
        that key is absent.

    Raises:
        RuntimeError: Propagated from the request helper when Atlas cannot be
            reached or answers with an error.
    """
    httpd = _AgentServer(("127.0.0.1", 0), _make_agent_handler(decide))
    port = httpd.server_address[1]
    thread = threading.Thread(target=httpd.serve_forever, daemon=True)
    thread.start()
    try:
        agent_url = f"http://127.0.0.1:{port}/act"
        out = _post_json(
            f"{base_url.rstrip('/')}/api/agent/run",
            {"agentUrl": agent_url, "agentLabel": label},
            timeout=timeout,
        )
        return out.get("run", out)
    finally:
        # Stop the serve loop first, then release the listening socket.
        httpd.shutdown()
        httpd.server_close()


def serve_agent(decide: AgentFn, port: int = 4700) -> None:
    """Long-running mode: expose your agent at /act and let Atlas call it on demand.

    Blocks in ``serve_forever`` until interrupted. The listening socket is
    closed on the way out, including when the loop is interrupted.

    Args:
        decide: Called with the step context of each request; returns the next action.
        port: TCP port to bind on 127.0.0.1.
    """
    with _AgentServer(("127.0.0.1", port), _make_agent_handler(decide)) as httpd:
        # Report the port that was actually bound (differs from `port` only for port=0).
        print(f"agent listening on http://127.0.0.1:{httpd.server_address[1]}/act")
        httpd.serve_forever()
+# ──────────────────────────────────────────────────────────────────────────────
+# Code-first evals (Agno-style): write an eval against your agent and .run() it.
+# ──────────────────────────────────────────────────────────────────────────────
+def _agent_output(agent: Any, text: str) -> str:
+    """Accept a callable agent(input)->str or an object exposing .run(input)."""
+    if callable(agent):
+        out = agent(text)
+    elif hasattr(agent, "run"):
+        out = agent.run(text)
+    else:
+        raise TypeError("agent must be callable or expose a .run() method")
+    return out if isinstance(out, str) else str(getattr(out, "content", out))
@dataclass
class AccuracyResult:
    """Outcome of one `AccuracyEval` run."""

    score: float        # 0..10 (LLM-as-judge)
    passed: bool        # "passed" flag from the judge response
    reason: str         # "reason" text from the judge response
    output: str         # the agent answer that was sent to the judge


class AccuracyEval:
    """Measure correctness of your agent's answer with LLM-as-judge (via Atlas).

        AccuracyEval(agent=my_agent, input="What is the capital of India?",
                     expected_output="New Delhi").run(print_results=True)
    """

    def __init__(self, agent, input, expected_output=None, guidelines=None,
                 threshold=7.0, base_url="http://localhost:3000"):
        self.agent = agent
        self.input = input
        self.expected_output = expected_output
        self.guidelines = guidelines
        self.threshold = threshold
        self.base_url = base_url.rstrip("/")

    def _judge_payload(self, output: str) -> Dict[str, Any]:
        """Build the request body for the judge endpoint."""
        # Only a missing expectation becomes "". Falsy but meaningful
        # expectations such as 0 or False must reach the judge unchanged.
        expected = "" if self.expected_output is None else self.expected_output
        return {
            "input": self.input, "output": output,
            "expected": expected,
            "guidelines": self.guidelines or "", "threshold": self.threshold,
        }

    def run(self, print_results: bool = False) -> AccuracyResult:
        """Get the agent's answer, have it judged, and return the verdict.

        Args:
            print_results: When true, print a one-line summary.

        Returns:
            An `AccuracyResult` built from the judge response; fields missing
            from the response fall back to 0 / False / "".
        """
        output = _agent_output(self.agent, self.input)
        j = _post_json(f"{self.base_url}/api/eval/judge",
                       self._judge_payload(output), timeout=120)
        # `or 0` also covers an explicit null score, not just a missing key.
        res = AccuracyResult(float(j.get("score") or 0), bool(j.get("passed", False)),
                             str(j.get("reason", "")), output)
        if print_results:
            print(f"[accuracy] {res.score:.1f}/10  passed={res.passed}  :: {res.reason}")
        return res
@dataclass
class PerformanceResult:
    """Latency statistics, in milliseconds, over the timed iterations."""

    avg_ms: float       # arithmetic mean
    min_ms: float       # fastest iteration
    max_ms: float       # slowest iteration
    p95_ms: float       # 95th percentile (nearest-rank)
    iterations: int     # number of timed iterations


class PerformanceEval:
    """Measure runtime latency of an agent call over N iterations.

        PerformanceEval(func=lambda: my_agent("ping"), num_iterations=5).run(print_results=True)
    """

    def __init__(self, func: Callable[[], Any], num_iterations: int = 5, warmup: int = 1):
        self.func = func
        self.num_iterations = num_iterations
        self.warmup = warmup

    @staticmethod
    def _summarize(times: List[float]) -> PerformanceResult:
        """Reduce a non-empty list of millisecond samples to a `PerformanceResult`."""
        ordered = sorted(times)
        count = len(ordered)
        # Nearest-rank percentile: rank = ceil(0.95 * count), computed in
        # integer arithmetic so no float rounding can shift the index.
        rank = -(-95 * count // 100)
        return PerformanceResult(sum(ordered) / count, ordered[0], ordered[-1],
                                 ordered[rank - 1], count)

    def run(self, print_results: bool = False) -> PerformanceResult:
        """Call `func` `warmup` times untimed, then time `num_iterations` calls.

        Args:
            print_results: When true, print a one-line summary.

        Returns:
            The aggregated `PerformanceResult`.

        Raises:
            ValueError: If `num_iterations` is smaller than 1, since there
                would be no samples to aggregate.
        """
        if self.num_iterations < 1:
            raise ValueError("num_iterations must be at least 1")
        for _ in range(self.warmup):
            self.func()
        times: List[float] = []
        for _ in range(self.num_iterations):
            t0 = time.perf_counter()
            self.func()
            times.append((time.perf_counter() - t0) * 1000)
        res = self._summarize(times)
        if print_results:
            print(f"[performance] avg={res.avg_ms:.0f}ms  p95={res.p95_ms:.0f}ms  over {res.iterations} runs")
        return res
@dataclass
class ReliabilityResult:
    """Outcome of one `ReliabilityEval` run."""

    passed: bool            # True when no expected call is missing
    missing: List[str]      # expected names that were not made
    made: List[str]         # names extracted from the supplied tool calls
    expected: List[str]     # the expected names, as given


class ReliabilityEval:
    """Assert your agent made the expected tool calls.

        ReliabilityEval(tool_calls=["search","finish"], expected_tool_calls=["search"]).run()

    `tool_calls` may be names, dicts with a "name" or "tool" key (such as the
    `{"tool": ..., "args": ...}` actions an agent returns), or objects
    exposing .name / .tool.
    """

    def __init__(self, tool_calls, expected_tool_calls):
        self.tool_calls = list(tool_calls)
        self.expected = list(expected_tool_calls)

    @staticmethod
    def _tool_name(call: Any) -> Any:
        """Extract the tool name from one recorded call."""
        if isinstance(call, str):
            return call
        if isinstance(call, dict):
            # Same precedence as for objects: "name" first, then "tool".
            for key in ("name", "tool"):
                if key in call:
                    return call[key]
            return str(call)
        return getattr(call, "name", getattr(call, "tool", str(call)))

    def run(self, print_results: bool = False) -> ReliabilityResult:
        """Compare the calls that were made against the expected ones.

        Order and extra calls are ignored; the check passes when every
        expected name appears at least once among the calls made.

        Args:
            print_results: When true, print a one-line summary.
        """
        made = [self._tool_name(t) for t in self.tool_calls]
        missing = [e for e in self.expected if e not in made]
        res = ReliabilityResult(not missing, missing, made, self.expected)
        if print_results:
            print(f"[reliability] passed={res.passed}  missing={res.missing}  made={res.made}")
        return res

atlas_eval-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,25 @@
+[build-system]
+requires = ["setuptools>=68"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "atlas-eval"
+version = "0.1.0"
+description = "Evaluate your AI agent or LLM/SLM on the Bharat AI Index / Atlas Agent Arena."
+readme = "README.md"
+requires-python = ">=3.8"
+license = { text = "MIT" }
+authors = [{ name = "Atlas AI Labs" }]
+keywords = ["llm", "agent", "evaluation", "benchmark", "india", "sovereign-ai", "agno", "langchain"]
+classifiers = [
+  "Programming Language :: Python :: 3",
+  "License :: OSI Approved :: MIT License",
+  "Intended Audience :: Developers",
+]
+[project.urls]
+Homepage = "https://github.com/atlas-ai-labs/bharat-ai-index"
+Documentation = "https://github.com/atlas-ai-labs/bharat-ai-index/tree/main/sdk"
+[tool.setuptools]
+py-modules = ["atlas_eval"]

atlas_eval-0.1.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0