atlas-eval 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,51 @@
1
+ Metadata-Version: 2.4
2
+ Name: atlas-eval
3
+ Version: 0.1.0
4
+ Summary: Evaluate your AI agent or LLM/SLM on the Bharat AI Index / Atlas Agent Arena.
5
+ Author: Atlas AI Labs
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/atlas-ai-labs/bharat-ai-index
8
+ Project-URL: Documentation, https://github.com/atlas-ai-labs/bharat-ai-index/tree/main/sdk
9
+ Keywords: llm,agent,evaluation,benchmark,india,sovereign-ai,agno,langchain
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Intended Audience :: Developers
13
+ Requires-Python: >=3.8
14
+ Description-Content-Type: text/markdown
15
+
16
+ # atlas-eval (Python)
17
+
18
+ Evaluate your AI agent / LLM on the Bharat AI Index — Atlas Agent Arena.
19
+
20
+ > Requires a running Atlas instance (default `http://localhost:3000`). Start one with
21
+ > `npm run dev` in the repo, or `docker run -p 3000:3000 ... bharat-ai-index`.
22
+
23
+ ## Install
24
+
25
+ ```bash
26
+ # until published to PyPI:
27
+ pip install -e sdk/python # from the repo root
28
+ # or copy sdk/python/atlas_eval.py into your project
29
+ ```
30
+
31
+ ## Use
32
+
33
+ ```python
34
+ from atlas_eval import evaluate_agent, AccuracyEval, PerformanceEval, ReliabilityEval
35
+
36
+ # whole-agent benchmark (sandboxed tasks, agentic score)
37
+ def my_agent(ctx):
38
+ return {"tool": "fs_read", "args": {"path": "invoice_a.txt"}}
39
+ report = evaluate_agent(my_agent, base_url="http://localhost:3000")
40
+
41
+ # code-first evals (Agno-style)
42
+ AccuracyEval(agent=lambda q: "New Delhi", input="Capital of India?",
43
+ expected_output="New Delhi").run(print_results=True)
44
+ PerformanceEval(func=lambda: my_model("ping"), num_iterations=5).run(print_results=True)
45
+ ReliabilityEval(tool_calls=["search"], expected_tool_calls=["search"]).run(print_results=True)
46
+ ```
47
+
48
+ `PerformanceEval` and `ReliabilityEval` run fully locally (no server). `evaluate_agent`
49
+ and `AccuracyEval` call the Atlas instance (and a configured judge model for accuracy).
50
+
51
+ See the top-level [`sdk/README.md`](../README.md) for the protocol and JS equivalents.
@@ -0,0 +1,36 @@
1
+ # atlas-eval (Python)
2
+
3
+ Evaluate your AI agent / LLM on the Bharat AI Index — Atlas Agent Arena.
4
+
5
+ > Requires a running Atlas instance (default `http://localhost:3000`). Start one with
6
+ > `npm run dev` in the repo, or `docker run -p 3000:3000 ... bharat-ai-index`.
7
+
8
+ ## Install
9
+
10
+ ```bash
11
+ # until published to PyPI:
12
+ pip install -e sdk/python # from the repo root
13
+ # or copy sdk/python/atlas_eval.py into your project
14
+ ```
15
+
16
+ ## Use
17
+
18
+ ```python
19
+ from atlas_eval import evaluate_agent, AccuracyEval, PerformanceEval, ReliabilityEval
20
+
21
+ # whole-agent benchmark (sandboxed tasks, agentic score)
22
+ def my_agent(ctx):
23
+ return {"tool": "fs_read", "args": {"path": "invoice_a.txt"}}
24
+ report = evaluate_agent(my_agent, base_url="http://localhost:3000")
25
+
26
+ # code-first evals (Agno-style)
27
+ AccuracyEval(agent=lambda q: "New Delhi", input="Capital of India?",
28
+ expected_output="New Delhi").run(print_results=True)
29
+ PerformanceEval(func=lambda: my_model("ping"), num_iterations=5).run(print_results=True)  # my_model: any callable of your own
30
+ ReliabilityEval(tool_calls=["search"], expected_tool_calls=["search"]).run(print_results=True)
31
+ ```
32
+
33
+ `PerformanceEval` and `ReliabilityEval` run fully locally (no server). `evaluate_agent`
34
+ and `AccuracyEval` call the Atlas instance (and a configured judge model for accuracy).
35
+
36
+ See the top-level [`sdk/README.md`](../README.md) for the protocol and JS equivalents.
@@ -0,0 +1,51 @@
1
+ Metadata-Version: 2.4
2
+ Name: atlas-eval
3
+ Version: 0.1.0
4
+ Summary: Evaluate your AI agent or LLM/SLM on the Bharat AI Index / Atlas Agent Arena.
5
+ Author: Atlas AI Labs
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/atlas-ai-labs/bharat-ai-index
8
+ Project-URL: Documentation, https://github.com/atlas-ai-labs/bharat-ai-index/tree/main/sdk
9
+ Keywords: llm,agent,evaluation,benchmark,india,sovereign-ai,agno,langchain
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Intended Audience :: Developers
13
+ Requires-Python: >=3.8
14
+ Description-Content-Type: text/markdown
15
+
16
+ # atlas-eval (Python)
17
+
18
+ Evaluate your AI agent / LLM on the Bharat AI Index — Atlas Agent Arena.
19
+
20
+ > Requires a running Atlas instance (default `http://localhost:3000`). Start one with
21
+ > `npm run dev` in the repo, or `docker run -p 3000:3000 ... bharat-ai-index`.
22
+
23
+ ## Install
24
+
25
+ ```bash
26
+ # until published to PyPI:
27
+ pip install -e sdk/python # from the repo root
28
+ # or copy sdk/python/atlas_eval.py into your project
29
+ ```
30
+
31
+ ## Use
32
+
33
+ ```python
34
+ from atlas_eval import evaluate_agent, AccuracyEval, PerformanceEval, ReliabilityEval
35
+
36
+ # whole-agent benchmark (sandboxed tasks, agentic score)
37
+ def my_agent(ctx):
38
+ return {"tool": "fs_read", "args": {"path": "invoice_a.txt"}}
39
+ report = evaluate_agent(my_agent, base_url="http://localhost:3000")
40
+
41
+ # code-first evals (Agno-style)
42
+ AccuracyEval(agent=lambda q: "New Delhi", input="Capital of India?",
43
+ expected_output="New Delhi").run(print_results=True)
44
+ PerformanceEval(func=lambda: my_model("ping"), num_iterations=5).run(print_results=True)
45
+ ReliabilityEval(tool_calls=["search"], expected_tool_calls=["search"]).run(print_results=True)
46
+ ```
47
+
48
+ `PerformanceEval` and `ReliabilityEval` run fully locally (no server). `evaluate_agent`
49
+ and `AccuracyEval` call the Atlas instance (and a configured judge model for accuracy).
50
+
51
+ See the top-level [`sdk/README.md`](../README.md) for the protocol and JS equivalents.
@@ -0,0 +1,7 @@
1
+ README.md
2
+ atlas_eval.py
3
+ pyproject.toml
4
+ atlas_eval.egg-info/PKG-INFO
5
+ atlas_eval.egg-info/SOURCES.txt
6
+ atlas_eval.egg-info/dependency_links.txt
7
+ atlas_eval.egg-info/top_level.txt
@@ -0,0 +1 @@
1
+ atlas_eval
@@ -0,0 +1,249 @@
1
+ """Atlas eval SDK (Python) — evaluate YOUR agentic system in a few lines.
2
+
3
+ You write your agent as one function: given the step context, return the next action.
4
+ `evaluate_agent` spins up a tiny local endpoint for it, tells a running Atlas instance
5
+ to benchmark it on the Agent Arena, and returns the full scorecard. No framework lock-in
6
+ — wrap LangChain, LangGraph, Agno, or your own loop.
7
+
8
+ from atlas_eval import evaluate_agent
9
+
10
+ def my_agent(ctx):
11
+ # ctx = { goal, tools, history, lastObservation, step, maxSteps }
12
+ return {"tool": "fs_read", "args": {"path": "invoice_a.txt"}}
13
+
14
+ report = evaluate_agent(my_agent, base_url="http://localhost:3000")
15
+ print(report["agenticScore"], report["successRate"])
16
+
17
+ Only depends on the Python standard library.
18
+ """
19
+ from __future__ import annotations
20
+
21
+ import json
22
+ import socketserver
23
+ import threading
24
+ import time
25
+ import urllib.error
26
+ import urllib.request
27
+ from dataclasses import dataclass
28
+ from http.server import BaseHTTPRequestHandler
29
+ from typing import Any, Callable, Dict, List, Optional
30
+
31
# Package version; the same value appears as `version` in pyproject.toml.
__version__ = "0.1.0"

# Troubleshooting hint embedded in the "cannot reach Atlas" error messages
# raised by the HTTP helpers in this module.
_ATLAS_HINT = (
    "Is Atlas running? Start it with `npm run dev` (http://localhost:3000) or "
    "`docker run -p 3000:3000 bharat-ai-index`, then pass base_url=…"
)
37
+
38
+
39
def _post_json(url: str, payload: dict, timeout: int = 120) -> dict:
    """POST ``payload`` as JSON to ``url`` and return the decoded JSON reply.

    Every failure is reported as a ``RuntimeError`` carrying a readable
    message, so SDK users do not get a raw urllib / socket / json traceback.

    Args:
        url: Full endpoint URL.
        payload: JSON-serialisable request body.
        timeout: Socket timeout in seconds, passed to ``urlopen``.

    Returns:
        The parsed JSON response body.

    Raises:
        RuntimeError: on an HTTP error status, an unreachable server, a
            socket-level failure or timeout, or a response that is not JSON.
    """
    req = urllib.request.Request(
        url,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            body = resp.read()
    except urllib.error.HTTPError as e:
        # HTTPError must be handled before URLError (it is a subclass of it).
        detail = e.read().decode("utf-8", "ignore")[:300]
        raise RuntimeError(f"Atlas error HTTP {e.code} at {url}: {detail}") from None
    except urllib.error.URLError as e:
        raise RuntimeError(f"Could not reach Atlas at {url}. {_ATLAS_HINT} (reason: {e.reason})") from None
    except OSError as e:
        # A timeout or reset while waiting for / reading the response is raised
        # as a plain socket error, not wrapped in URLError.
        raise RuntimeError(
            f"Connection to Atlas at {url} failed or timed out (timeout={timeout}s): {e}"
        ) from None
    try:
        return json.loads(body)
    except ValueError:
        # Covers json.JSONDecodeError and undecodable bytes (both ValueError).
        snippet = body[:300].decode("utf-8", "ignore")
        raise RuntimeError(f"Atlas returned a non-JSON response from {url}: {snippet}") from None
51
+
52
+
53
def check(base_url: str = "http://localhost:3000") -> dict:
    """Preflight probe of the Atlas health endpoint.

    Issues a GET to ``{base_url}/api/health`` with a 10 second timeout.

    Returns:
        The decoded JSON body of the health response.

    Raises:
        RuntimeError: for any failure while fetching or decoding the response;
            the message includes a hint on how to start Atlas.
    """
    try:
        health_url = base_url.rstrip("/") + "/api/health"
        with urllib.request.urlopen(health_url, timeout=10) as response:
            raw = response.read()
            return json.loads(raw)
    except Exception as err:  # noqa: BLE001
        message = f"Atlas not reachable at {base_url}. {_ATLAS_HINT} ({err})"
        raise RuntimeError(message) from None
60
+
61
# Step context handed to the agent. The module docstring lists its keys as
# goal, tools, history, lastObservation, step, maxSteps.
Context = Dict[str, Any]
# Action returned by the agent, e.g. {"tool": "fs_read", "args": {...}}.
Action = Dict[str, Any]
# An agent is a function from one step context to the next action.
AgentFn = Callable[[Context], Action]
64
+
65
+
66
def evaluate_agent(
    decide: AgentFn,
    base_url: str = "http://localhost:3000",
    label: str = "Python Agent",
    timeout: int = 600,
) -> Dict[str, Any]:
    """Run the Atlas agentic benchmark against your `decide` function and return the run.

    A temporary HTTP server is bound to an ephemeral port on 127.0.0.1 and
    served from a daemon thread. Its ``/act`` URL is POSTed to
    ``{base_url}/api/agent/run`` together with ``label``; each POST the server
    receives is answered with ``{"action": decide(ctx)}``. The server is shut
    down and closed before this function returns or raises.

    Args:
        decide: Callable mapping a step context dict to an action dict.
        base_url: Root URL of the Atlas instance.
        label: Display name sent to Atlas as ``agentLabel``.
        timeout: Seconds to wait for the benchmark request to complete.

    Returns:
        The ``"run"`` entry of the Atlas response, or the whole response when
        that key is absent.

    Raises:
        RuntimeError: propagated from the request helper when Atlas cannot be
            reached or answers with an HTTP error.
    """

    class Handler(BaseHTTPRequestHandler):
        def do_POST(self):  # noqa: N802
            # Never crash the eval on a bad step: a malformed request body, an
            # exception inside `decide`, or an action that cannot be encoded
            # as JSON all degrade to a "finish" action describing the error.
            try:
                length = int(self.headers.get("Content-Length", 0))
                ctx = json.loads(self.rfile.read(length) or b"{}")
                payload = json.dumps({"action": decide(ctx)}).encode()
            except Exception as exc:  # noqa: BLE001
                fallback = {"tool": "finish", "args": {"answer": f"agent error: {exc}"}}
                payload = json.dumps({"action": fallback}).encode()
            self.send_response(200)
            self.send_header("Content-Type", "application/json")
            self.send_header("Content-Length", str(len(payload)))
            self.end_headers()
            self.wfile.write(payload)

        def log_message(self, *args):  # silence per-request logging
            return

    # Port 0 lets the OS pick a free port; read the real one back afterwards.
    httpd = socketserver.TCPServer(("127.0.0.1", 0), Handler)
    port = httpd.server_address[1]
    thread = threading.Thread(target=httpd.serve_forever, daemon=True)
    thread.start()
    try:
        agent_url = f"http://127.0.0.1:{port}/act"
        out = _post_json(
            f"{base_url.rstrip('/')}/api/agent/run",
            {"agentUrl": agent_url, "agentLabel": label},
            timeout=timeout,
        )
        return out.get("run", out)
    finally:
        httpd.shutdown()
        httpd.server_close()
107
+
108
+
109
def serve_agent(decide: AgentFn, port: int = 4700) -> None:
    """Long-running mode: expose your agent at /act and let Atlas call it on demand.

    Binds an HTTP server to 127.0.0.1 on ``port`` and serves until the process
    is interrupted. Each POST is answered with ``{"action": decide(ctx)}``.

    Args:
        decide: Callable mapping a step context dict to an action dict.
        port: TCP port to listen on; 0 lets the OS choose a free port.

    Raises:
        OSError: if the port cannot be bound.
    """

    class Handler(BaseHTTPRequestHandler):
        def do_POST(self):  # noqa: N802
            # A malformed request body, an exception inside `decide`, or an
            # action that cannot be encoded as JSON all degrade to a "finish"
            # action describing the error instead of dropping the connection.
            try:
                length = int(self.headers.get("Content-Length", 0))
                ctx = json.loads(self.rfile.read(length) or b"{}")
                payload = json.dumps({"action": decide(ctx)}).encode()
            except Exception as exc:  # noqa: BLE001
                fallback = {"tool": "finish", "args": {"answer": f"agent error: {exc}"}}
                payload = json.dumps({"action": fallback}).encode()
            self.send_response(200)
            self.send_header("Content-Type", "application/json")
            # Same framing as the handler used by evaluate_agent.
            self.send_header("Content-Length", str(len(payload)))
            self.end_headers()
            self.wfile.write(payload)

        def log_message(self, *args):  # silence per-request logging
            return

    class _ReusableTCPServer(socketserver.TCPServer):
        # Must be a class attribute: it is consulted while binding, so a quick
        # restart does not fail with "Address already in use".
        allow_reuse_address = True

    # The context manager closes the listening socket on any exit, including
    # KeyboardInterrupt.
    with _ReusableTCPServer(("127.0.0.1", port), Handler) as httpd:
        # Announce only after a successful bind, using the port actually bound.
        print(f"agent listening on http://127.0.0.1:{httpd.server_address[1]}/act")
        httpd.serve_forever()
131
+
132
+
133
+ # ──────────────────────────────────────────────────────────────────────────────
134
+ # Code-first evals (Agno-style): write an eval against your agent and .run() it.
135
+ # ──────────────────────────────────────────────────────────────────────────────
136
+
137
def _agent_output(agent: Any, text: str) -> str:
    """Invoke ``agent`` on ``text`` and normalise whatever it returns to ``str``.

    ``agent`` is either a callable or an object with a ``.run(text)`` method;
    a callable takes precedence when both apply. A string result is returned
    unchanged. Otherwise the result's ``content`` attribute (or the result
    itself when it has none) is passed through ``str()``.

    Raises:
        TypeError: if ``agent`` is neither callable nor has a ``run`` attribute.
    """
    if callable(agent):
        invoke = agent
    elif hasattr(agent, "run"):
        invoke = agent.run
    else:
        raise TypeError("agent must be callable or expose a .run() method")

    result = invoke(text)
    if isinstance(result, str):
        return result
    return str(getattr(result, "content", result))
146
+
147
+
148
@dataclass
class AccuracyResult:
    """Verdict produced by one ``AccuracyEval`` run."""

    # Judge score on a 0..10 scale (LLM-as-judge).
    score: float
    # Pass/fail flag taken from the judge response.
    passed: bool
    # Explanation text taken from the judge response.
    reason: str
    # The agent answer that was judged.
    output: str
154
+
155
+
156
class AccuracyEval:
    """Score the correctness of an agent's answer with LLM-as-judge (via Atlas).

        AccuracyEval(agent=my_agent, input="What is the capital of India?",
                     expected_output="New Delhi").run(print_results=True)

    The agent is called locally; its answer is then POSTed to the
    ``/api/eval/judge`` endpoint of the Atlas instance at ``base_url``.
    """

    def __init__(self, agent, input, expected_output=None, guidelines=None,
                 threshold=7.0, base_url="http://localhost:3000"):
        self.agent, self.input = agent, input
        self.expected_output, self.guidelines = expected_output, guidelines
        self.threshold = threshold
        # Stored without a trailing slash so endpoint paths can be appended.
        self.base_url = base_url.rstrip("/")

    def run(self, print_results: bool = False) -> AccuracyResult:
        """Call the agent, have Atlas judge the answer, and return the verdict.

        Missing ``expected_output`` / ``guidelines`` are sent as empty strings.
        Fields absent from the judge response default to score 0, passed
        False, and an empty reason.
        """
        answer = _agent_output(self.agent, self.input)
        request_body = {
            "input": self.input,
            "output": answer,
            "expected": self.expected_output or "",
            "guidelines": self.guidelines or "",
            "threshold": self.threshold,
        }
        verdict = _post_json(f"{self.base_url}/api/eval/judge", request_body, timeout=120)
        result = AccuracyResult(
            float(verdict.get("score", 0)),
            bool(verdict.get("passed", False)),
            str(verdict.get("reason", "")),
            answer,
        )
        if print_results:
            print(f"[accuracy] {result.score:.1f}/10 passed={result.passed} :: {result.reason}")
        return result
184
+
185
+
186
@dataclass
class PerformanceResult:
    """Latency statistics from one ``PerformanceEval`` run, in milliseconds."""

    avg_ms: float
    min_ms: float
    max_ms: float
    p95_ms: float  # nearest-rank 95th percentile of the timed iterations
    iterations: int  # number of timed iterations (warm-up calls excluded)


class PerformanceEval:
    """Measure runtime latency of an agent call over N iterations.

        PerformanceEval(func=lambda: my_agent("ping"), num_iterations=5).run(print_results=True)
    """

    def __init__(self, func: Callable[[], Any], num_iterations: int = 5, warmup: int = 1):
        self.func = func
        self.num_iterations = num_iterations
        self.warmup = warmup

    def run(self, print_results: bool = False) -> PerformanceResult:
        """Time ``func`` and return aggregate latency statistics.

        ``func`` is first called ``warmup`` times untimed, then
        ``num_iterations`` times with ``time.perf_counter`` around each call.

        Args:
            print_results: When true, print a one-line summary.

        Returns:
            A ``PerformanceResult`` with avg / min / max / p95 in milliseconds.

        Raises:
            ValueError: if ``num_iterations`` is less than 1 (there would be
                no samples to aggregate).
        """
        if self.num_iterations < 1:
            raise ValueError("num_iterations must be >= 1")

        for _ in range(self.warmup):
            self.func()

        times: List[float] = []
        for _ in range(self.num_iterations):
            t0 = time.perf_counter()
            self.func()
            times.append((time.perf_counter() - t0) * 1000)
        times.sort()

        count = len(times)
        avg = sum(times) / count
        # Nearest-rank percentile: the ceil(0.95 * n)-th smallest sample,
        # computed in integer arithmetic to avoid float rounding. The previous
        # int(0.95 * n) index was one too high (e.g. the maximum for n == 20).
        rank = (95 * count + 99) // 100
        p95 = times[rank - 1]

        res = PerformanceResult(avg, times[0], times[-1], p95, self.num_iterations)
        if print_results:
            print(f"[performance] avg={res.avg_ms:.0f}ms p95={res.p95_ms:.0f}ms over {res.iterations} runs")
        return res
221
+
222
+
223
@dataclass
class ReliabilityResult:
    """Outcome of one ``ReliabilityEval`` run."""

    passed: bool  # True when no expected tool call is missing
    missing: List[str]  # expected names that were not made, in expected order
    made: List[str]  # names extracted from the observed tool calls
    expected: List[str]  # the expected names the eval was configured with


class ReliabilityEval:
    """Assert your agent made the expected tool calls.

        ReliabilityEval(tool_calls=["search","finish"], expected_tool_calls=["search"]).run()

    `tool_calls` may be names, dicts carrying a "name" or "tool" key (such as
    the ``{"tool": ..., "args": ...}`` actions an agent returns), or objects
    exposing .name / .tool. Extra calls beyond the expected ones are allowed;
    order and multiplicity are not checked.
    """

    def __init__(self, tool_calls, expected_tool_calls):
        self.tool_calls = list(tool_calls)
        self.expected = list(expected_tool_calls)

    @staticmethod
    def _name_of(call):
        """Return the tool name carried by one recorded tool call."""
        if isinstance(call, str):
            return call
        if isinstance(call, dict):
            # Action dicts keep the name under a key, not an attribute; without
            # this branch the whole dict would be stringified and never match.
            for key in ("name", "tool"):
                if key in call:
                    return call[key]
            return str(call)
        return getattr(call, "name", getattr(call, "tool", str(call)))

    def run(self, print_results: bool = False) -> ReliabilityResult:
        """Compare made vs. expected tool calls and return the result.

        Args:
            print_results: When true, print a one-line summary.

        Returns:
            A ``ReliabilityResult``; ``passed`` is true when every expected
            name appears among the calls that were made.
        """
        made = [self._name_of(call) for call in self.tool_calls]
        missing = [name for name in self.expected if name not in made]
        res = ReliabilityResult(not missing, missing, made, self.expected)
        if print_results:
            print(f"[reliability] passed={res.passed} missing={res.missing} made={res.made}")
        return res
@@ -0,0 +1,25 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "atlas-eval"
7
+ version = "0.1.0"
8
+ description = "Evaluate your AI agent or LLM/SLM on the Bharat AI Index / Atlas Agent Arena."
9
+ readme = "README.md"
10
+ requires-python = ">=3.8"
11
+ license = { text = "MIT" }
12
+ authors = [{ name = "Atlas AI Labs" }]
13
+ keywords = ["llm", "agent", "evaluation", "benchmark", "india", "sovereign-ai", "agno", "langchain"]
14
+ classifiers = [
15
+ "Programming Language :: Python :: 3",
16
+ "License :: OSI Approved :: MIT License",
17
+ "Intended Audience :: Developers",
18
+ ]
19
+
20
+ [project.urls]
21
+ Homepage = "https://github.com/atlas-ai-labs/bharat-ai-index"
22
+ Documentation = "https://github.com/atlas-ai-labs/bharat-ai-index/tree/main/sdk"
23
+
24
+ [tool.setuptools]
25
+ py-modules = ["atlas_eval"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+