aievaluator 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
"""AI Evaluator CLI — evaluate your LLM agents from the command line."""

# NOTE(review): the wheel this file ships in is named 1.0.1 while this
# constant says 1.0.0 — confirm whether it should have been bumped as well.
__version__ = "1.0.0"
File without changes
@@ -0,0 +1,178 @@
1
+ """HTTP client for AI Evaluator Engine API."""
2
+
3
+ import json as json_mod
4
+ from typing import Any, Optional
5
+
6
+ import httpx
7
+
8
+
9
class APIError(Exception):
    """Failure reported by, or while talking to, the AI Evaluator API.

    Attributes:
        status_code: The numeric code passed by the raiser.
        message: Human-readable summary; also used as the exception text.
        detail: Optional extra payload supplied by the raiser.
    """

    def __init__(self, status_code: int, message: str, detail: Any = None):
        # Only the message goes into Exception.args, so str(err) == message.
        super().__init__(message)
        self.detail = detail
        self.message = message
        self.status_code = status_code
17
+
18
+
19
+ class APIClient:
20
+ """Thin HTTP wrapper around the AI Evaluator Engine API."""
21
+
22
+ def __init__(self, engine_url: str, api_key: Optional[str] = None, timeout: int = 300):
23
+ self.engine_url = engine_url.rstrip("/")
24
+ self.api_key = api_key
25
+ self.timeout = timeout
26
+
27
+ def _headers(self) -> dict:
28
+ h = {"Content-Type": "application/json"}
29
+ if self.api_key:
30
+ h["X-API-Key"] = self.api_key
31
+ return h
32
+
33
+ async def _request(
34
+ self,
35
+ method: str,
36
+ path: str,
37
+ json_data: Optional[dict] = None,
38
+ data: Optional[dict] = None,
39
+ files: Optional[dict] = None,
40
+ ) -> dict:
41
+ """Make an HTTP request to the engine. Raises APIError on failure."""
42
+ url = f"{self.engine_url}{path}"
43
+ async with httpx.AsyncClient(timeout=self.timeout) as client:
44
+ try:
45
+ if files:
46
+ resp = await client.request(
47
+ method, url, data=data, files=files, headers={"X-API-Key": self.api_key} if self.api_key else {},
48
+ )
49
+ else:
50
+ resp = await client.request(
51
+ method, url, json=json_data, headers=self._headers(),
52
+ )
53
+ except httpx.ConnectError:
54
+ raise APIError(0, f"Cannot connect to {self.engine_url}")
55
+ except httpx.TimeoutException:
56
+ raise APIError(0, f"Request timed out after {self.timeout}s")
57
+
58
+ if resp.status_code >= 400:
59
+ detail = None
60
+ try:
61
+ detail = resp.json()
62
+ except Exception:
63
+ detail = resp.text
64
+ raise APIError(resp.status_code, f"Engine returned HTTP {resp.status_code}", detail)
65
+
66
+ return resp.json()
67
+
68
+ async def health(self) -> dict:
69
+ """GET /health"""
70
+ return await self._request("GET", "/health")
71
+
72
+ async def get_usage(self) -> dict:
73
+ """GET /api/v1/tenants/me/usage"""
74
+ return await self._request("GET", "/api/v1/tenants/me/usage")
75
+
76
+ async def evaluate_sync(
77
+ self,
78
+ rows: list[dict],
79
+ agent_url: str,
80
+ agent_format: str = "openai",
81
+ metrics: Optional[list[str]] = None,
82
+ judge_model: Optional[str] = None,
83
+ name: Optional[str] = None,
84
+ custom_evaluators: list[dict] | None = None,
85
+ thresholds: Optional[dict[str, float]] = None,
86
+ ) -> dict:
87
+ """POST /api/v1/evaluations/sync"""
88
+ agent_json = {"url": agent_url, "format": agent_format}
89
+ body = {
90
+ "rows": rows,
91
+ "agent": agent_json,
92
+ "metrics": metrics or ["faithfulness", "g_eval"],
93
+ "custom_evaluators": custom_evaluators or [],
94
+ }
95
+ if name:
96
+ body["name"] = name
97
+ if judge_model:
98
+ body["judge_model"] = judge_model
99
+ if thresholds:
100
+ body["thresholds"] = thresholds
101
+
102
+ return await self._request("POST", "/api/v1/evaluations/sync", json_data=body)
103
+
104
+ async def evaluate_upload(
105
+ self,
106
+ file_path: str,
107
+ agent_url: str,
108
+ agent_format: str = "openai",
109
+ metrics: Optional[str] = None,
110
+ ) -> dict:
111
+ """POST /api/v1/evaluations/sync/upload (multipart form upload)."""
112
+ import os
113
+
114
+ url = f"{self.engine_url}/api/v1/evaluations/sync/upload"
115
+ async with httpx.AsyncClient(timeout=self.timeout) as client:
116
+ try:
117
+ with open(file_path, "rb") as f:
118
+ files = {"file": (os.path.basename(file_path), f, "application/json")}
119
+ data = {
120
+ "agent_endpoint": agent_url,
121
+ "agent_format": agent_format,
122
+ "metrics": metrics or "faithfulness,g_eval",
123
+ }
124
+ resp = await client.post(
125
+ url,
126
+ data=data,
127
+ files=files,
128
+ headers={"X-API-Key": self.api_key} if self.api_key else {},
129
+ )
130
+ except httpx.ConnectError:
131
+ raise APIError(0, f"Cannot connect to {self.engine_url}")
132
+ except httpx.TimeoutException:
133
+ raise APIError(0, f"Request timed out after {self.timeout}s")
134
+
135
+ if resp.status_code >= 400:
136
+ detail = None
137
+ try:
138
+ detail = resp.json()
139
+ except Exception:
140
+ detail = resp.text
141
+ raise APIError(resp.status_code, f"Engine returned HTTP {resp.status_code}", detail)
142
+
143
+ return resp.json()
144
+
145
+ async def playground_evaluate(
146
+ self,
147
+ queries: Optional[list[str]] = None,
148
+ rows: Optional[list[dict]] = None,
149
+ agent_endpoint: Optional[str] = None,
150
+ agent_config: Optional[dict] = None,
151
+ metrics: Optional[list] = None,
152
+ judge: Optional[str] = None,
153
+ ) -> dict:
154
+ """POST /api/v1/playground/evaluate (no auth required).
155
+
156
+ metrics accepts strings or dicts with thresholds:
157
+ ["g_eval"] or [{"name": "g_eval", "threshold": 0.9}]
158
+ """
159
+ body: dict = {"metrics": metrics or ["faithfulness", "g_eval"]}
160
+ if queries:
161
+ body["queries"] = queries
162
+ if rows:
163
+ body["rows"] = rows
164
+ if agent_config:
165
+ body["agent"] = agent_config
166
+ if agent_endpoint:
167
+ body["agent_endpoint"] = agent_endpoint
168
+ if judge:
169
+ body["judge"] = judge
170
+ return await self._request("POST", "/api/v1/playground/evaluate", json_data=body)
171
+
172
+ async def playground_status(self) -> dict:
173
+ """GET /api/v1/playground/status (no auth required)."""
174
+ async with httpx.AsyncClient(timeout=10) as client:
175
+ resp = await client.get(f"{self.engine_url}/api/v1/playground/status")
176
+ if resp.status_code >= 400:
177
+ return {"used": 0, "limit": 5, "remaining": 5, "resets_at": "midnight UTC"}
178
+ return resp.json()
aievaluator/cli.py ADDED
@@ -0,0 +1,532 @@
1
+ """AI Evaluator CLI โ€” main entry point.
2
+
3
+ Commands:
4
+ aievaluator login Authenticate with AI Evaluator
5
+ aievaluator whoami Show current tenant info
6
+ aievaluator quick Quick eval via playground (no API key)
7
+ aievaluator eval Full evaluation against an agent
8
+ aievaluator config Manage CLI configuration
9
+ """
10
+
11
+ import asyncio
12
+ import json as json_mod
13
+ import os
14
+ import sys
15
+ from pathlib import Path
16
+ from typing import Optional
17
+
18
+ import click
19
+
20
+ from . import __version__
21
+ from .api.client import APIClient, APIError
22
+ from .config import (
23
+ resolve_api_key,
24
+ resolve_engine_url,
25
+ resolve_default_metrics,
26
+ resolve_default_min_score,
27
+ save_config,
28
+ load_config,
29
+ get_all_config,
30
+ )
31
+ from .formatters import format_table, format_json_output, format_junit
32
+
33
+
34
+ def _parse_dataset_file(file_path: str) -> list[dict]:
35
+ """Parse a dataset file (JSON or JSONL) into a list of rows."""
36
+ with open(file_path, "r", encoding="utf-8") as f:
37
+ raw = f.read()
38
+
39
+ if file_path.endswith(".jsonl"):
40
+ rows = []
41
+ for line in raw.strip().split("\n"):
42
+ line = line.strip()
43
+ if line:
44
+ rows.append(json_mod.loads(line))
45
+ return rows
46
+ else:
47
+ data = json_mod.loads(raw)
48
+ return data if isinstance(data, list) else [data]
49
+
50
+
51
+ def _run_async(coro):
52
+ """Helper to run async coroutines from Click commands."""
53
+ return asyncio.run(coro)
54
+
55
+
56
+ def _parse_quick_metrics(metrics_str: str | None, default_threshold: float | None = None) -> list | None:
57
+ """Parse --metrics for quick command.
58
+
59
+ CU1: "faithfulness:0.90,g_eval:0.75" โ†’ [{"name":"faithfulness","threshold":0.9}, ...]
60
+ CU2: "faithfulness,g_eval" with default_threshold=0.8 โ†’ [{"name":"faithfulness","threshold":0.8}, ...]
61
+ Simple: "faithfulness,g_eval" โ†’ ["faithfulness", "g_eval"]
62
+ """
63
+ if not metrics_str:
64
+ return None
65
+ result = []
66
+ for item in metrics_str.split(","):
67
+ item = item.strip()
68
+ if ":" in item:
69
+ name, val = item.split(":", 1)
70
+ result.append({"name": name.strip(), "threshold": float(val.strip())})
71
+ elif default_threshold is not None:
72
+ result.append({"name": item, "threshold": default_threshold})
73
+ else:
74
+ result.append(item)
75
+ return result
76
+
77
+
78
+ # ═══════════════════════════════════════════════════════════════════
79
+ # CLI Group
80
+ # ═══════════════════════════════════════════════════════════════════
81
+
82
# Root command group: every sub-command below attaches itself to this object.
# The docstring doubles as the --help text, so it is kept verbatim.
@click.group()
@click.version_option(version=__version__, prog_name="AI Evaluator CLI")
def main():
    """AI Evaluator CLI โ€” evaluate your LLM agents from the command line.

    \b
    Quick start:
      aievaluator quick "What is 2+2?" --expected "4"
      aievaluator login
      aievaluator eval --agent https://my-agent.com/chat --dataset ./tests.json
    """
94
+
95
+
96
+ # ═══════════════════════════════════════════════════════════════════
97
+ # login
98
+ # ═══════════════════════════════════════════════════════════════════
99
+
100
@main.command()
@click.option("--api-key", help="API key (non-interactive mode)", default=None)
@click.option("--engine-url", help="Engine URL", default=None)
def login(api_key: Optional[str], engine_url: Optional[str]):
    """Authenticate with AI Evaluator.

    Saves your API key to the global config file (by default
    ~/.config/aievaluator/config.json; the exact path is printed on success).
    Get your key at https://aievaluator.dev/settings
    """
    # Function-scope import: needed to report where the config was really
    # written, which depends on the platform and environment.
    from .config import _global_config_path

    if not api_key:
        click.echo()
        click.echo("Enter your AI Evaluator API key:")
        click.echo("(Get one at https://aievaluator.dev/settings)")
        # NOTE(review): the key is echoed while typing (hide_input=False);
        # confirm this is intentional for a secret.
        api_key = click.prompt("API key", hide_input=False).strip()

    if not api_key:
        click.echo("โŒ API key cannot be empty.", err=True)
        sys.exit(2)

    resolved_url = resolve_engine_url(engine_url)
    client = APIClient(resolved_url, api_key)

    async def _login():
        # The usage endpoint doubles as the credential check.
        try:
            usage = await client.get_usage()
        except APIError as e:
            click.echo(f"โŒ Invalid API key or engine unreachable: {e.message}", err=True)
            sys.exit(2)

        # Save to global config
        config = load_config()
        config["api_key"] = api_key
        config["engine_url"] = resolved_url
        save_config(config)

        tenant_name = usage.get("tenant_name", "Unknown")
        tier = usage.get("tier", "unknown")
        evals_used = usage.get("evaluations_this_cycle", 0)
        evals_limit = usage.get("evaluations_limit", "โˆž")

        click.echo()
        click.echo(f"โœ… Logged in as {tenant_name} ({tier})")
        click.echo(f" Evals: {evals_used}/{evals_limit} this cycle")
        # Report the real location rather than a hard-coded ~/.config path.
        click.echo(f" Config saved to {_global_config_path()}")

    _run_async(_login())
146
+
147
+
148
+ # ═══════════════════════════════════════════════════════════════════
149
+ # whoami
150
+ # ═══════════════════════════════════════════════════════════════════
151
+
152
@main.command()
@click.option("--api-key", help="API key (overrides config)", default=None)
def whoami(api_key: Optional[str]):
    """Show current tenant info and usage."""
    resolved_key = resolve_api_key(api_key)
    if not resolved_key:
        click.echo("โŒ Not logged in. Run: aievaluator login", err=True)
        sys.exit(2)

    api = APIClient(resolve_engine_url(), resolved_key)

    # Fetch the usage record; an API failure ends the command with exit code 2.
    try:
        usage = _run_async(api.get_usage())
    except APIError as e:
        click.echo(f"โŒ {e.message}", err=True)
        sys.exit(2)

    # Read every field before printing anything.
    tenant_name = usage.get("tenant_name", "Unknown")
    tier = usage.get("tier", "unknown")
    evals_used = usage.get("evaluations_this_cycle", 0)
    evals_limit = usage.get("evaluations_limit", "โˆž")
    tokens_in = usage.get("input_tokens_this_cycle", 0)
    tokens_out = usage.get("output_tokens_this_cycle", 0)

    report = (
        f"Tenant: {tenant_name}",
        f"Tier: {tier}",
        f"Evals: {evals_used}/{evals_limit} this cycle",
        f"Tokens: โ†“{tokens_in:,} ยท โ†‘{tokens_out:,} this cycle",
    )
    click.echo()
    for line in report:
        click.echo(line)
185
+
186
+
187
+ # ═══════════════════════════════════════════════════════════════════
188
+ # quick
189
+ # ═══════════════════════════════════════════════════════════════════
190
+
191
@main.command()
@click.argument("query", required=False)
@click.option("--dataset", "dataset_file", help="JSON dataset file", type=click.Path(exists=True), default=None)
@click.option("--agent", "agent_url", help="Agent endpoint URL (default: internal chat agent)", default="/chat")
@click.option("--expected", help="Expected output for query", default=None)
@click.option("--metrics", help="Metrics: faithfulness,g_eval or faithfulness:0.90,g_eval:0.75", default=None)
@click.option("--min-score", help="Apply threshold to all metrics and enforce exit code", type=float, default=None)
@click.option("--judge", help="LLM judge model", default=None)
@click.option("--engine-url", help="Engine URL", default=None)
def quick(query, dataset_file, agent_url, expected, metrics, min_score, judge, engine_url):
    """Quick evaluation via playground (no API key required).

    \b
    Examples:
      aievaluator quick "What is 2+2?" --expected "4"
      aievaluator quick --dataset ./smoke-tests.json --agent https://my-agent.com/chat
    """
    # Exactly one data source must be given.
    if not query and not dataset_file:
        click.echo("โŒ Provide a query or --dataset", err=True)
        sys.exit(2)
    if query and dataset_file:
        click.echo("โŒ Use query OR --dataset, not both", err=True)
        sys.exit(2)

    resolved_url = resolve_engine_url(engine_url)
    client = APIClient(resolved_url)

    # Parse metrics: CU1 (metric:threshold), CU2 (--min-score applies to all).
    # A malformed threshold is a usage error (exit 2), not a traceback.
    try:
        metrics_list = _parse_quick_metrics(metrics, min_score)
    except ValueError as exc:
        click.echo(f"โŒ Invalid --metrics value: {exc}", err=True)
        sys.exit(2)

    async def _quick():
        # Check playground status first; any failure falls back to defaults.
        try:
            status = await client.playground_status()
        except Exception:
            status = {"used": 0, "limit": 5, "remaining": 5, "resets_at": "midnight UTC"}

        remaining = status.get("remaining", 5)
        limit = status.get("limit", 5)
        click.echo(f"โš ๏ธ Playground mode โ€” {remaining}/{limit} remaining (resets at {status.get('resets_at', 'midnight UTC')})")
        click.echo()

        if remaining <= 0:
            click.echo("โŒ Playground limit reached. Run `aievaluator login` for 100 free evals/month.")
            sys.exit(2)

        if query:
            rows = [{"input": query}]
            if expected:
                rows[0]["expected_output"] = expected
        else:
            # Same handling as the eval command: unreadable or invalid
            # dataset files exit with code 2 and a one-line message.
            try:
                rows = _parse_dataset_file(dataset_file)
            except (ValueError, OSError) as e:
                click.echo(f"โŒ Cannot read dataset: {e}", err=True)
                sys.exit(2)

        try:
            result = await client.playground_evaluate(
                rows=rows,
                agent_endpoint=agent_url,
                metrics=metrics_list,
                judge=judge,
            )
        except APIError as e:
            click.echo(f"โŒ {e.message}", err=True)
            if e.detail:
                click.echo(json_mod.dumps(e.detail, indent=2), err=True)
            sys.exit(2)

        # A row without a "passed" field counts as passed.
        overall_passed = all(r.get("passed", True) for r in result.get("results", []))
        format_table(result, min_score or 0.0, resolved_url)

        # CU2: exit code based on --min-score
        if min_score is not None:
            sys.exit(0 if overall_passed else 1)

    _run_async(_quick())
265
+
266
+
267
+ # ═══════════════════════════════════════════════════════════════════
268
+ # eval
269
+ # ═══════════════════════════════════════════════════════════════════
270
+
271
# The command name is given explicitly: Click versions before 8.2 would derive
# "eval-cmd" from the function name instead of the documented "eval".
@main.command("eval")
@click.option("--agent", required=True, help="Agent endpoint URL")
@click.option("--dataset", "dataset_file", help="JSON dataset file", type=click.Path(exists=True), default=None)
@click.option("--rows", help="Inline JSON array of test cases", default=None)
@click.option("--metrics", help="Metrics (comma-separated)", default=None)
@click.option("--agent-format", help="Agent API format", type=click.Choice(["openai", "claude", "custom"]), default="openai")
@click.option("--min-score", help="Minimum overall score threshold (0-1)", type=float, default=None)
@click.option("--thresholds", "thresholds_str", help="Per-metric thresholds: faithfulness:0.90,g_eval:0.75", default=None)
@click.option("--custom", "custom_str", help="Inline custom evaluator: {\"name\":\"polite\",\"prompt\":\"Check...\",\"threshold\":0.8}", default=None)
@click.option("--format", "output_format", help="Output format", type=click.Choice(["table", "json", "junit"]), default="table")
@click.option("--ci", is_flag=True, help="CI mode (no colors, no prompts)")
@click.option("--timeout", help="Timeout in seconds", type=int, default=300)
@click.option("--judge-model", help="LLM judge model", default=None)
@click.option("--name", "eval_name", help="Human-readable name for this evaluation", default=None)
@click.option("--api-key", help="API key (overrides config)", default=None)
@click.option("--engine-url", help="Engine URL", default=None)
def eval_cmd(agent, dataset_file, rows, metrics, agent_format, min_score, thresholds_str, custom_str, output_format, ci, timeout, judge_model, eval_name, api_key, engine_url):
    """Evaluate an AI agent against a dataset.

    \b
    Examples:
      aievaluator eval --agent https://my-agent.com/chat --dataset ./tests.json
      aievaluator eval --agent https://my-agent.com/chat --rows '[{"input":"Hi","expected_output":"Hello"}]'
      aievaluator eval --agent $AGENT_URL --dataset ./evals.json --ci --format junit
    """
    # Exit codes used below: 1 = score under --min-score, 2 = usage or API
    # error, 3 = engine unreachable (see _handle_api_error).
    # NOTE(review): the --ci flag is accepted but not read in this function.

    # Validate data source
    if not dataset_file and not rows:
        click.echo("โŒ Provide --dataset or --rows", err=True)
        sys.exit(2)
    if dataset_file and rows:
        click.echo("โŒ Use --dataset OR --rows, not both", err=True)
        sys.exit(2)

    key = resolve_api_key(api_key)
    if not key:
        click.echo("โŒ API key required. Run: aievaluator login", err=True)
        sys.exit(2)

    resolved_url = resolve_engine_url(engine_url)
    client = APIClient(resolved_url, key, timeout=timeout)

    # Resolve metrics. The flag and the configured default are normalised the
    # same way so "a, b" from a config file does not yield " b".
    metrics_source = metrics if metrics else resolve_default_metrics()
    metrics_list = [m.strip() for m in metrics_source.split(",") if m.strip()]

    # Resolve min_score
    if min_score is None:
        min_score = resolve_default_min_score()

    # Parse per-metric thresholds: "faithfulness:0.90,g_eval:0.75" -> {"faithfulness": 0.90, "g_eval": 0.75}
    thresholds_dict = {}
    if thresholds_str:
        for pair in thresholds_str.split(","):
            pair = pair.strip()
            if not pair:
                continue  # tolerate stray/trailing commas
            metric_name, sep, val = pair.partition(":")
            try:
                if not sep:
                    # An entry without ":" used to be dropped silently,
                    # which removed a quality gate the user asked for.
                    raise ValueError(pair)
                thresholds_dict[metric_name.strip()] = float(val.strip())
            except ValueError:
                click.echo(f"โŒ Invalid threshold value in: {pair}", err=True)
                sys.exit(2)

    # CU3: parse inline custom evaluator (one JSON object or an array of them)
    custom_evaluators = None
    if custom_str:
        try:
            custom_evaluators = json_mod.loads(custom_str)
        except json_mod.JSONDecodeError:
            custom_evaluators = None
        if isinstance(custom_evaluators, dict):
            custom_evaluators = [custom_evaluators]
        if not isinstance(custom_evaluators, list):
            # Covers both unparsable input and JSON scalars such as 5 or "x".
            click.echo(f"โŒ Invalid JSON in --custom", err=True)
            sys.exit(2)

    async def _eval():
        if dataset_file:
            try:
                rows_data = _parse_dataset_file(dataset_file)
            except (ValueError, OSError) as e:
                # ValueError covers JSON and text-decoding errors; OSError
                # covers missing or unreadable files.
                click.echo(f"โŒ Cannot read dataset: {e}", err=True)
                sys.exit(2)
        else:
            try:
                rows_data = json_mod.loads(rows)
            except json_mod.JSONDecodeError as e:
                click.echo(f"โŒ Invalid JSON in --rows: {e}", err=True)
                sys.exit(2)
            if not isinstance(rows_data, list):
                rows_data = [rows_data]

        try:
            result = await client.evaluate_sync(
                rows=rows_data,
                agent_url=agent,
                agent_format=agent_format,
                metrics=metrics_list,
                judge_model=judge_model,
                name=eval_name,
                thresholds=thresholds_dict if thresholds_dict else None,
                custom_evaluators=custom_evaluators,
            )
        except APIError as e:
            _handle_api_error(e)  # prints the error and exits; never returns

        # Format output
        if output_format == "json":
            click.echo(format_json_output(result, min_score))
        elif output_format == "junit":
            click.echo(format_junit(result, min_score))
        else:
            format_table(result, min_score, resolved_url)

        # Exit code: a missing or null score is treated as 0.
        overall_score = result.get("overall_score") or 0
        if overall_score < min_score:
            sys.exit(1)

    _run_async(_eval())
392
+
393
+
394
def _handle_api_error(e: APIError):
    """Report *e* on stderr and terminate the process.

    Dict details are pretty-printed as JSON; any other truthy detail is
    stringified and cut to 500 characters. Exit code is 3 when
    ``e.status_code`` is 0, otherwise 2.
    """
    click.echo(f"โŒ {e.message}", err=True)

    detail = e.detail
    if detail:
        rendered = json_mod.dumps(detail, indent=2) if isinstance(detail, dict) else str(detail)[:500]
        click.echo(rendered, err=True)

    exit_code = 2 if e.status_code != 0 else 3
    sys.exit(exit_code)
403
+
404
+
405
+ # ═══════════════════════════════════════════════════════════════════
406
+ # config
407
+ # ═══════════════════════════════════════════════════════════════════
408
+
409
# Sub-group of `main`; the show/set/unset commands register on it below.
@main.group()
def config():
    """Manage CLI configuration."""
413
+
414
+
415
@config.command("show")
def config_show():
    """Show current configuration."""
    merged = get_all_config()
    if not merged:
        click.echo("No configuration found. Run: aievaluator login")
        return
    # Pretty-print the merged settings as JSON.
    click.echo(json_mod.dumps(merged, indent=2))
423
+
424
+
425
@config.command("set")
@click.argument("key")
@click.argument("value")
def config_set(key: str, value: str):
    """Set a configuration value.

    \b
    Keys: engine-url, default-metrics, default-min-score
    """
    valid_keys = {"engine-url", "default-metrics", "default-min-score"}
    if key not in valid_keys:
        # sorted() keeps the message stable between runs.
        click.echo(f"โŒ Invalid key: {key}. Valid keys: {', '.join(sorted(valid_keys))}", err=True)
        sys.exit(2)

    # The config readers look up underscore names (engine_url,
    # default_metrics, default_min_score), so store under that spelling.
    # Storing the hyphenated CLI name made `config set` a no-op.
    storage_key = key.replace("-", "_")

    cfg = load_config()
    if key == "default-min-score":
        try:
            score = float(value)
        except ValueError:
            score = None
        # The range test also rejects NaN.
        if score is None or not 0.0 <= score <= 1.0:
            click.echo(f"โŒ default-min-score must be a number (0-1)", err=True)
            sys.exit(2)
        cfg[storage_key] = score
    else:
        cfg[storage_key] = value

    # Drop any entry written under the old hyphenated spelling.
    cfg.pop(key, None)
    save_config(cfg)
    click.echo(f"โœ… {key} = {value}")
450
+
451
+
452
@config.command("unset")
@click.argument("key")
def config_unset(key: str):
    """Remove a configuration value."""
    cfg = load_config()

    # Accept both the CLI spelling (engine-url) and the stored spelling
    # (engine_url), and clear whichever forms are present in the file.
    spellings = {key, key.replace("-", "_"), key.replace("_", "-")}
    present = [name for name in spellings if name in cfg]

    if present:
        for name in present:
            del cfg[name]
        save_config(cfg)
        click.echo(f"โœ… {key} removed")
    else:
        click.echo(f"{key} was not set")
463
+
464
+
465
+ # ═══════════════════════════════════════════════════════════════════
466
+ # init
467
+ # ═══════════════════════════════════════════════════════════════════
468
+
469
# Example rows that `init` writes to evals/smoke-test.json. Each row uses the
# same "input" / "expected_output" keys that `quick` builds for a single query.
_SMOKE_TEST_DATASET = [
    {"input": "What is 2+2?", "expected_output": "4"},
    {"input": "What is the capital of France?", "expected_output": "Paris"},
    {"input": "Say hello in Spanish", "expected_output": "Hola"},
]
474
+
475
+
476
@main.command()
def init():
    """Initialize a new AI Evaluator project in the current directory.

    Creates:
    - aievaluator.config.json (project-local config)
    - evals/smoke-test.json (example dataset)
    - Updates .gitignore
    """
    project_root = Path.cwd()

    # Step 1: project-local config file (never overwritten).
    project_config = project_root / "aievaluator.config.json"
    if not project_config.exists():
        default_settings = {
            "engine_url": "https://api.aievaluator.dev",
            "default_metrics": "faithfulness,g_eval",
            "default_min_score": 0.80,
        }
        project_config.write_text(json_mod.dumps(default_settings, indent=2) + "\n")
        click.echo(f"โœ… Created aievaluator.config.json")
    else:
        click.echo(f"โญ๏ธ aievaluator.config.json already exists, skipping")

    # Step 2: evals/ directory with an example dataset (never overwritten).
    dataset_dir = project_root / "evals"
    dataset_dir.mkdir(exist_ok=True)
    example_dataset = dataset_dir / "smoke-test.json"
    if not example_dataset.exists():
        example_dataset.write_text(json_mod.dumps(_SMOKE_TEST_DATASET, indent=2) + "\n")
        click.echo(f"โœ… Created evals/smoke-test.json (3 example queries)")
    else:
        click.echo(f"โญ๏ธ evals/smoke-test.json already exists, skipping")

    # Step 3: make sure the project config is listed in .gitignore.
    ignore_file = project_root / ".gitignore"
    ignore_entry = "aievaluator.config.json"
    current_lines = ignore_file.read_text().split("\n") if ignore_file.exists() else []
    if ignore_entry not in current_lines:
        # Start on a fresh line when the file does not end with a blank one.
        needs_newline = bool(current_lines) and current_lines[-1].strip() != ""
        with open(ignore_file, "a") as out:
            if needs_newline:
                out.write("\n")
            out.write(f"{ignore_entry}\n")
        click.echo(f"โœ… Added {ignore_entry} to .gitignore")

    for hint in (
        None,
        "Next steps:",
        " aievaluator quick --dataset ./evals/smoke-test.json",
        " aievaluator login (for 100 free evals/month)",
        None,
    ):
        # None stands for an empty separator line.
        if hint is None:
            click.echo()
        else:
            click.echo(hint)
525
+
526
+
527
+ # ═══════════════════════════════════════════════════════════════════
528
+ # Entry point
529
+ # ═══════════════════════════════════════════════════════════════════
530
+
531
# Invoke the Click root group when this module is executed as a script.
if __name__ == "__main__":
    main()
aievaluator/config.py ADDED
@@ -0,0 +1,115 @@
1
+ """Config manager for AI Evaluator CLI.
2
+
3
+ Handles API key resolution with priority:
4
+ 1. --api-key flag
5
+ 2. AIEVALUATOR_API_KEY env var
6
+ 3. ./aievaluator.config.json (project-local)
7
+ 4. ~/.config/aievaluator/config.json (global)
8
+ """
9
+
10
+ import json
11
+ import os
12
+ from pathlib import Path
13
+ from typing import Optional
14
+
15
+
16
+ def _global_config_path() -> Path:
17
+ """Returns the global config path, platform-aware."""
18
+ if os.name == "nt":
19
+ base = Path(os.environ.get("APPDATA", Path.home() / "AppData" / "Roaming"))
20
+ elif os.environ.get("XDG_CONFIG_HOME"):
21
+ base = Path(os.environ["XDG_CONFIG_HOME"])
22
+ else:
23
+ base = Path.home() / ".config"
24
+ return base / "aievaluator" / "config.json"
25
+
26
+
27
+ def _load_json(path: Path) -> dict:
28
+ """Load a JSON file, returning {} if not found or invalid."""
29
+ try:
30
+ with open(path) as f:
31
+ return json.load(f)
32
+ except (FileNotFoundError, json.JSONDecodeError):
33
+ return {}
34
+
35
+
36
+ def _save_json(path: Path, data: dict) -> None:
37
+ """Save data as JSON, creating parent dirs."""
38
+ path.parent.mkdir(parents=True, exist_ok=True)
39
+ with open(path, "w") as f:
40
+ json.dump(data, f, indent=2)
41
+
42
+
43
def resolve_api_key(flag_value: Optional[str] = None) -> Optional[str]:
    """Return the API key from the first source that provides one.

    Order: explicit flag value, ``AIEVALUATOR_API_KEY`` environment
    variable, project-local ``aievaluator.config.json``, global config.
    Returns whatever the global config holds under ``api_key`` (``None``
    when absent) if no earlier source has a key.
    """
    explicit = flag_value or os.environ.get("AIEVALUATOR_API_KEY")
    if explicit:
        return explicit

    # Config files are only read when neither flag nor env var is set.
    project_key = _load_json(Path("aievaluator.config.json")).get("api_key")
    if project_key:
        return project_key

    return _load_json(_global_config_path()).get("api_key")
55
+
56
+
57
def resolve_engine_url(flag_value: Optional[str] = None) -> str:
    """Return the engine base URL without a trailing slash.

    Order: explicit flag value, ``AIEVALUATOR_ENGINE_URL`` environment
    variable, project-local ``aievaluator.config.json``, global config,
    and finally ``https://api.aievaluator.dev``.
    """
    direct = flag_value or os.environ.get("AIEVALUATOR_ENGINE_URL")
    if direct:
        return direct.rstrip("/")

    # Config files are consulted lazily, project-local first.
    project_url = _load_json(Path("aievaluator.config.json")).get("engine_url")
    if project_url:
        return project_url.rstrip("/")

    user_url = _load_json(_global_config_path()).get("engine_url")
    return user_url.rstrip("/") if user_url else "https://api.aievaluator.dev"
77
+
78
+
79
def resolve_default_metrics() -> str:
    """Return the configured default metrics as a comma-separated string.

    A truthy ``default_metrics`` in the project-local config wins;
    otherwise the global config's value is used, falling back to
    ``"faithfulness,g_eval"`` when the key is absent there.
    """
    project_value = _load_json(Path("aievaluator.config.json")).get("default_metrics")
    if project_value:
        return project_value
    return _load_json(_global_config_path()).get("default_metrics", "faithfulness,g_eval")
87
+
88
+
89
def resolve_default_min_score() -> float:
    """Return the configured default minimum score as a float.

    The project-local config is used whenever it contains
    ``default_min_score``; otherwise the global config is used, with 0.0
    when the key is absent there.
    """
    project_cfg = _load_json(Path("aievaluator.config.json"))
    if "default_min_score" not in project_cfg:
        user_cfg = _load_json(_global_config_path())
        return float(user_cfg.get("default_min_score", 0.0))
    return float(project_cfg["default_min_score"])
96
+
97
+
98
def save_config(data: dict, global_: bool = True) -> None:
    """Write *data* to the global config, or to the project-local file
    ``aievaluator.config.json`` when ``global_`` is false."""
    if global_:
        target = _global_config_path()
    else:
        target = Path("aievaluator.config.json")
    _save_json(target, data)
102
+
103
+
104
def load_config(global_: bool = True) -> dict:
    """Read the global config, or the project-local file
    ``aievaluator.config.json`` when ``global_`` is false."""
    if global_:
        source = _global_config_path()
    else:
        source = Path("aievaluator.config.json")
    return _load_json(source)
108
+
109
+
110
def get_all_config() -> dict:
    """Return the global config overlaid with the project-local config.

    Keys present in both take their value from the project-local file.
    """
    user_cfg = _load_json(_global_config_path())
    project_cfg = _load_json(Path("aievaluator.config.json"))
    # Right-hand operand wins on key clashes.
    return user_cfg | project_cfg
@@ -0,0 +1,7 @@
1
+ """Formatters for CLI output."""
2
+
3
+ from .table import format_table
4
+ from .json import format_json_output
5
+ from .junit import format_junit
6
+
7
+ __all__ = ["format_table", "format_json_output", "format_junit"]
@@ -0,0 +1,28 @@
1
+ """JSON formatter."""
2
+
3
+ import json as json_mod
4
+
5
+
6
def format_json_output(data: dict, min_score: float = 0.0) -> str:
    """Serialize an evaluation response into the CLI's JSON report string.

    Args:
        data: Evaluation response. Keys read: ``results``, ``overall_score``,
            ``total_rows``, ``input_tokens``, ``output_tokens`` and
            ``evaluation_id``; each has an empty/zero fallback.
        min_score: Threshold the overall score is compared against.

    Returns:
        A JSON document (2-space indent) with the summary fields followed by
        the untouched ``results`` list. ``passed`` is true when
        ``overall_score >= min_score``; ``failed_queries`` counts rows whose
        ``passed`` flag is falsy (rows without the flag count as passed).
    """
    rows = data.get("results", [])
    score = data.get("overall_score", 0)
    row_count = data.get("total_rows", len(rows))
    failed_count = len([row for row in rows if not row.get("passed", True)])

    # Insertion order is the key order of the emitted JSON, so keep it stable.
    report = {}
    report["evaluation_id"] = data.get("evaluation_id", "")
    report["overall_score"] = score
    report["passed"] = score >= min_score
    report["min_score"] = min_score
    report["total_rows"] = row_count
    report["failed_queries"] = failed_count
    report["input_tokens"] = data.get("input_tokens", 0)
    report["output_tokens"] = data.get("output_tokens", 0)
    report["results"] = rows

    return json_mod.dumps(report, indent=2)
@@ -0,0 +1,46 @@
1
+ """JUnit XML formatter for CI/CD integration."""
2
+
3
+ import xml.etree.ElementTree as ET
4
+ from xml.dom import minidom
5
+
6
+
7
# Code points below 0x20 other than tab, newline and carriage return are not
# legal XML 1.0 characters; mapping them to None makes str.translate drop them.
_XML_ILLEGAL_CHARS = dict.fromkeys(
    c for c in range(0x20) if c not in (0x09, 0x0A, 0x0D)
)


def _xml_safe(value) -> str:
    """Return *value* as text with XML-illegal control characters removed."""
    return str(value).translate(_XML_ILLEGAL_CHARS)


def _format_score(value) -> str:
    """Render a score with two decimals, or verbatim when it is not numeric."""
    if isinstance(value, (int, float)):
        return f"{value:.2f}"
    return str(value)


def format_junit(data: dict, min_score: float = 0.0) -> str:
    """Return evaluation results as a pretty-printed JUnit XML string.

    One ``<testcase>`` is emitted per entry of ``data["results"]``; entries
    whose ``passed`` flag is falsy additionally get a ``<failure>`` child
    describing the query, expected output, agent response and scores. Rows
    without a ``passed`` key are treated as passed.

    Args:
        data: Evaluation response; only its ``results`` list is read. Each
            row may provide ``query``, ``passed``, ``scores``,
            ``expected_output`` and ``agent_response``.
        min_score: Threshold quoted in each failure message. It is not used
            to decide pass/fail here; that comes from the row's ``passed``.

    Returns:
        The XML document as text, including the XML declaration.
    """
    results = data.get("results") or []
    failures = sum(1 for row in results if not row.get("passed", True))

    testsuite = ET.Element("testsuite", {
        "name": "AI Evaluator",
        "tests": str(len(results)),
        "failures": str(failures),
        "errors": "0",
        "time": "0",
    })

    for index, row in enumerate(results, start=1):
        # `or ""` guards a null query the same way expected/got are guarded
        # below; sanitising first keeps the 80-char cut on visible text.
        query = _xml_safe(row.get("query") or "")[:80]
        testcase = ET.SubElement(testsuite, "testcase", {
            "classname": "AI Evaluator",
            "name": f"Query {index}: {query}",
            "time": "0",
        })

        if row.get("passed", True):
            continue

        scores = row.get("scores") or {}
        scores_str = ", ".join(
            f"{metric}: {_format_score(score)}" for metric, score in scores.items()
        )
        expected = row.get("expected_output", "") or ""
        got = row.get("agent_response", "") or ""

        # Agent output is free text; strip control characters so the
        # re-parse by minidom below cannot fail on a malformed document.
        failure_text = _xml_safe(
            f"Query: {query}\n"
            f"Expected: {expected}\n"
            f"Got: {got}\n"
            f"Scores: {{{scores_str}}}"
        )
        failure = ET.SubElement(testcase, "failure", {
            "message": f"Score below threshold {min_score}",
        })
        failure.text = failure_text

    xml_str = ET.tostring(testsuite, encoding="unicode")
    return minidom.parseString(xml_str).toprettyxml(indent=" ")
@@ -0,0 +1,53 @@
1
+ """Table formatter using Rich."""
2
+
3
+ from rich.console import Console
4
+ from rich.table import Table
5
+
6
+
7
def format_table(data: dict, min_score: float, engine_url: str) -> None:
    """Print evaluation results to the terminal as a Rich summary plus table.

    The summary shows the overall score as a percentage against the
    threshold, row and failure counts, token usage and, when an evaluation
    id is present, a link to the report page. The table then lists one line
    per result with the first metric's score and a pass/fail icon.

    Args:
        data: Evaluation response. Keys read: ``results`` (row dicts with
            ``query``, ``scores`` and ``passed``), ``overall_score``,
            ``total_rows``, ``input_tokens``, ``output_tokens`` and
            ``evaluation_id``. Missing keys fall back to empty/zero values.
        min_score: Threshold compared against ``overall_score``; both are
            multiplied by 100 for display in the summary line.
        engine_url: Base URL used to build the report link.
    """
    # Local import: `escape` is only needed here and rich is already a
    # dependency of this module.
    from rich.markup import escape

    console = Console()
    results = data.get("results") or []
    overall_score = data.get("overall_score", 0)
    total_rows = data.get("total_rows", len(results))
    failed = sum(1 for r in results if not r.get("passed", True))
    input_tokens = data.get("input_tokens", 0)
    output_tokens = data.get("output_tokens", 0)
    eval_id = data.get("evaluation_id", "")

    score_pct = overall_score * 100
    passed = overall_score >= min_score
    icon = "✅" if passed else "❌"

    console.print()
    console.print(" [bold]AI Evaluator — Results[/bold]")
    console.print(f" Overall Score: [bold]{score_pct:.1f}%[/bold] {icon} {'above' if passed else 'below'} threshold ({min_score*100:.0f}%)")
    console.print(f" Total rows: {total_rows}")
    console.print(f" Failed: {failed}")
    console.print(f" Tokens: ↓{input_tokens:,} · ↑{output_tokens:,}")
    if eval_id:
        report_url = f"{engine_url}/evaluations/{eval_id}/report"
        console.print(f" Dashboard: [link={report_url}]{report_url}[/link]")
    console.print()

    table = Table(show_header=True, header_style="bold")
    table.add_column("#", style="dim", width=4)
    table.add_column("Query", max_width=50)
    table.add_column("Score", justify="right", width=8)
    table.add_column("Pass", justify="center", width=6)

    for i, r in enumerate(results, start=1):
        # Queries are arbitrary user text: escape them so square brackets are
        # shown literally instead of being parsed as Rich markup, which would
        # restyle the cell or raise MarkupError on an unmatched closing tag.
        query = escape(str(r.get("query") or "")[:50])
        scores = r.get("scores") or {}
        # Only the first metric's score is shown in the Score column.
        first_score = next(iter(scores.values()), 0)
        if isinstance(first_score, (int, float)):
            score_str = f"{first_score * 100:.0f}%"
        else:
            score_str = "n/a"  # e.g. a metric that produced no numeric score
        passed_icon = "✅" if r.get("passed", True) else "❌"
        table.add_row(str(i), query, score_str, passed_icon)

    console.print(table)
    console.print()

    if passed:
        console.print(f"[green]✅ Score {score_pct:.1f}% meets threshold {min_score}[/green]")
    else:
        console.print(f"[red]❌ Score {score_pct:.1f}% below threshold {min_score}[/red]")
    console.print()
@@ -0,0 +1,366 @@
1
+ Metadata-Version: 2.4
2
+ Name: aievaluator
3
+ Version: 1.0.1
4
+ Summary: AI Evaluator CLI — evaluate your LLM agents from the command line
5
+ Author-email: AI Evaluator <support@aievaluator.dev>
6
+ License: MIT
7
+ Project-URL: Homepage, https://aievaluator.dev
8
+ Project-URL: Repository, https://github.com/aievaluator-dev/aievaluator-cli
9
+ Project-URL: Issues, https://github.com/aievaluator-dev/aievaluator-cli/issues
10
+ Keywords: ai,evaluation,llm,agent,testing,ci-cd
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Operating System :: OS Independent
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Software Development :: Testing
20
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
21
+ Requires-Python: >=3.10
22
+ Description-Content-Type: text/markdown
23
+ Requires-Dist: click>=8.1
24
+ Requires-Dist: httpx>=0.27
25
+ Requires-Dist: rich>=13.0
26
+ Provides-Extra: dev
27
+ Requires-Dist: pytest>=8.0; extra == "dev"
28
+ Requires-Dist: pytest-asyncio>=0.24; extra == "dev"
29
+ Requires-Dist: pytest-httpx>=0.30; extra == "dev"
30
+
31
+ # AI Evaluator CLI — Python
32
+
33
+ [![PyPI](https://img.shields.io/pypi/v/aievaluator)](https://pypi.org/project/aievaluator/)
34
+ [![Python](https://img.shields.io/pypi/pyversions/aievaluator)](https://pypi.org/project/aievaluator/)
35
+
36
+ Evaluate your LLM agents from the terminal. No browser. No dashboard.
37
+
38
+ ```bash
39
+ pip install aievaluator
40
+ ```
41
+
42
+ ---
43
+
44
+ ## 🧭 Tutorial — From Zero to CI/CD
45
+
46
+ Every step builds on the previous one. Start wherever makes sense for you.
47
+
48
+ ---
49
+
50
+ ### Level 0 — Try it without installing anything
51
+
52
+ ```bash
53
+ curl -s -X POST https://api.aievaluator.dev/api/v1/playground/evaluate \
54
+ -H "Content-Type: application/json" \
55
+ -d '{"queries":["What is 2+2?"],"metrics":["faithfulness"]}' | jq .
56
+ ```
57
+
58
+ 5 free per day. No key. No install. Good enough to decide if it's useful.
59
+
60
+ ---
61
+
62
+ ### Level 1 — Install and evaluate a single prompt
63
+
64
+ ```bash
65
+ pip install aievaluator
66
+
67
+ # Ask a question, tell it what you expect
68
+ aievaluator quick "What is the capital of France?" --expected "Paris"
69
+ ```
70
+
71
+ You'll see a table with the score. The `--expected` flag is optional — without it, the judge evaluates
72
+ the response on its own merits.
73
+
74
+ ```
75
+ ⚠️ Playground mode — 4/5 remaining
76
+
77
+ AI Evaluator — Results
78
+ Overall Score: 95.0% ✅ above threshold (0%)
79
+ Total rows: 1
80
+ Failed: 0
81
+
82
+ ┌────┬────────────────────────────────────┬──────────┬──────┐
83
+ │ #  │ Query                              │    Score │ Pass │
84
+ ├────┼────────────────────────────────────┼──────────┼──────┤
85
+ │ 1  │ What is the capital of France?     │      95% │  ✅  │
86
+ └────┴────────────────────────────────────┴──────────┴──────┘
87
+ ```
88
+
89
+ ---
90
+
91
+ ### Level 2 — Sign up and scaffold a project
92
+
93
+ Playground is great for trying, but you'll want more than 5 evals/day.
94
+
95
+ ```bash
96
+ # Get your API key at https://aievaluator.dev/settings
97
+ aievaluator login
98
+
99
+ # Check your account
100
+ aievaluator whoami
101
+ ```
102
+
103
+ Now scaffold your project:
104
+
105
+ ```bash
106
+ aievaluator init
107
+ ```
108
+
109
+ This creates:
110
+ - `aievaluator.config.json` — project-local config
111
+ - `evals/smoke-test.json` — sample dataset with 3 queries
112
+ - Updates `.gitignore`
113
+
114
+ Open `evals/smoke-test.json` and replace the sample queries with your own:
115
+
116
+ ```json
117
+ [
118
+ {"input": "What are your business hours?", "expected_output": "Mon-Fri 9am-6pm"},
119
+ {"input": "How do I cancel my order?", "expected_output": "Go to My Orders → Cancel"},
120
+ {"input": "Do you ship internationally?", "expected_output": "Yes, via DHL Express"}
121
+ ]
122
+ ```
123
+
124
+ Test it against the built-in agent:
125
+
126
+ ```bash
127
+ aievaluator quick --dataset ./evals/smoke-test.json
128
+ ```
129
+
130
+ ---
131
+
132
+ ### Level 3 — Evaluate your own agent
133
+
134
+ Point the CLI at your agent's endpoint:
135
+
136
+ ```bash
137
+ aievaluator eval \
138
+ --agent https://chatbot-staging.acme.com/api/chat \
139
+ --dataset ./evals/smoke-test.json \
140
+ --metrics faithfulness,g_eval
141
+ ```
142
+
143
+ The CLI calls your agent with each query, then an LLM judge scores the responses.
144
+
145
+ ---
146
+
147
+ ### Level 4 — Add quality gates
148
+
149
+ Not all metrics are equally important. Set different thresholds per metric:
150
+
151
+ ```bash
152
+ aievaluator eval \
153
+ --agent https://chatbot-staging.acme.com/api/chat \
154
+ --dataset ./evals/smoke-test.json \
155
+ --thresholds faithfulness:0.90,g_eval:0.75
156
+ ```
157
+
158
+ - `faithfulness` must be ≥ 90% (hallucination = instant fail)
159
+ - `g_eval` must be ≥ 75% (general quality)
160
+
161
+ If any metric fails to meet its threshold, that row is marked ❌.
162
+
163
+ **Or set one bar for everything:**
164
+
165
+ ```bash
166
+ aievaluator eval \
167
+ --agent https://chatbot-staging.acme.com/api/chat \
168
+ --dataset ./evals/smoke-test.json \
169
+ --min-score 0.80
170
+ ```
171
+
172
+ This works on `quick` too:
173
+
174
+ ```bash
175
+ aievaluator quick "test prompt" --min-score 0.80
176
+ # Exit code 1 if any metric drops below 0.80
177
+ ```
178
+
179
+ ---
180
+
181
+ ### Level 5 — Create your own evaluation criteria
182
+
183
+ Sometimes the built-in metrics aren't enough. Define a custom evaluator inline:
184
+
185
+ ```bash
186
+ aievaluator eval \
187
+ --agent https://chatbot-staging.acme.com/api/chat \
188
+ --dataset ./evals/smoke-test.json \
189
+ --metrics politeness,g_eval \
190
+ --custom '{"name":"politeness","prompt":"Is the response polite and professional? Answer YES or NO and explain.","threshold":0.85}'
191
+ ```
192
+
193
+ The custom evaluator `politeness` is defined in the request, referenced in `--metrics` by name,
194
+ and evaluated alongside `g_eval`. No dashboard needed.
195
+
196
+ **Custom evaluator with per-metric threshold override:**
197
+
198
+ ```bash
199
+ aievaluator eval \
200
+ --agent $URL --dataset ./tests.json \
201
+ --metrics politeness,g_eval \
202
+ --custom '{"name":"politeness","prompt":"Is the tone friendly?","threshold":0.7}' \
203
+ --thresholds politeness:0.90,g_eval:0.80
204
+ ```
205
+
206
+ The `--thresholds` flag overrides whatever was set in `--custom`. The engine uses the
207
+ per-evaluation value.
208
+
209
+ ---
210
+
211
+ ### Level 6 — CI/CD pipeline
212
+
213
+ Add this to your GitHub Actions, GitLab CI, or Jenkins:
214
+
215
+ ```bash
216
+ aievaluator eval \
217
+ --agent $STAGING_AGENT \
218
+ --dataset ./evals/regression.json \
219
+ --thresholds faithfulness:0.90,g_eval:0.75 \
220
+ --min-score 0.80 \
221
+ --ci \
222
+ --format junit > report.xml
223
+ ```
224
+
225
+ | Flag | What it does |
226
+ |---|---|
227
+ | `--ci` | No colors, no prompts — clean output for logs |
228
+ | `--format junit` | JUnit XML that CI systems understand natively |
229
+ | `--min-score 0.80` | Overall score must be ≥ 80% |
230
+ | `--thresholds` | Per-metric quality bars |
231
+
232
+ Exit code 1 = pipeline fails = deploy blocked.
233
+
234
+ **Environment variables for CI:**
235
+
236
+ ```bash
237
+ export AIEVALUATOR_API_KEY="sk-..." # No hardcoded keys in YAML
238
+ export AIEVALUATOR_ENGINE_URL="https://api.aievaluator.dev"
239
+ ```
240
+
241
+ ---
242
+
243
+ ## 📋 Complete Command Reference
244
+
245
+ ### `aievaluator login`
246
+
247
+ ```bash
248
+ aievaluator login # Interactive prompt
249
+ aievaluator login --api-key sk-xxx # Non-interactive (CI)
250
+ aievaluator login --engine-url https://custom.engine.com
251
+ ```
252
+
253
+ ### `aievaluator whoami`
254
+
255
+ ```bash
256
+ aievaluator whoami
257
+ # Tenant: acme-corp
258
+ # Tier: pro
259
+ # Evals: 42/5000 this cycle
260
+ # Tokens: ↓124,800 · ↑89,200 this cycle
261
+ ```
262
+
263
+ ### `aievaluator quick`
264
+
265
+ ```bash
266
+ # Single query
267
+ aievaluator quick "What is 2+2?" --expected "4"
268
+
269
+ # Per-metric thresholds
270
+ aievaluator quick "test" --metrics faithfulness:0.90,g_eval:0.75
271
+
272
+ # General threshold
273
+ aievaluator quick "test" --min-score 0.80
274
+
275
+ # From dataset (JSON or JSONL)
276
+ aievaluator quick --dataset ./tests.json
277
+ aievaluator quick --dataset ./tests.jsonl
278
+
279
+ # Custom judge model
280
+ aievaluator quick "test" --judge deepseek
281
+ ```
282
+
283
+ ### `aievaluator eval`
284
+
285
+ ```bash
286
+ # Basic
287
+ aievaluator eval --agent $URL --dataset ./tests.json
288
+
289
+ # With quality gates
290
+ aievaluator eval --agent $URL --dataset ./tests.json \
291
+ --thresholds faithfulness:0.90,g_eval:0.75 --min-score 0.80
292
+
293
+ # Inline rows
294
+ aievaluator eval --agent $URL \
295
+ --rows '[{"input":"Hi","expected_output":"Hello"}]'
296
+
297
+ # Custom evaluator inline
298
+ aievaluator eval --agent $URL --dataset ./tests.json \
299
+ --metrics my-eval --custom '{"name":"my-eval","prompt":"...","threshold":0.8}'
300
+
301
+ # CI mode
302
+ aievaluator eval --agent $URL --dataset ./tests.json --ci --format junit
303
+
304
+ # Different agent format
305
+ aievaluator eval --agent $URL --dataset ./tests.json --agent-format claude
306
+ ```
307
+
308
+ ### `aievaluator config`
309
+
310
+ ```bash
311
+ aievaluator config show
312
+ aievaluator config set default-metrics "faithfulness,g_eval"
313
+ aievaluator config set default-min-score 0.80
314
+ aievaluator config unset default-min-score
315
+ ```
316
+
317
+ ### `aievaluator init`
318
+
319
+ ```bash
320
+ aievaluator init
321
+ # Creates aievaluator.config.json + evals/smoke-test.json + updates .gitignore
322
+ ```
323
+
324
+ ---
325
+
326
+ ## 📊 Output Formats
327
+
328
+ ### Table (default)
329
+
330
+ Human-readable table with scores, pass/fail icons, and token counts.
331
+
332
+ ### JSON (`--format json`)
333
+
334
+ ```bash
335
+ aievaluator eval ... --format json | jq '.overall_score'
336
+ ```
337
+
338
+ Clean JSON on stdout. All logs/warnings go to stderr.
339
+
340
+ ### JUnit XML (`--format junit`)
341
+
342
+ ```bash
343
+ aievaluator eval ... --format junit > report.xml
344
+ ```
345
+
346
+ Native CI integration. `<testcase>` per query, `<failure>` for queries below threshold.
347
+
348
+ ---
349
+
350
+ ## 🤖 VS Code Extension
351
+
352
+ Prefer staying in your editor? Install the [VS Code extension](https://marketplace.visualstudio.com/items?itemName=aievaluator.aievaluator).
353
+
354
+ - Select text → right-click → Evaluate
355
+ - Per-metric threshold editor with preset buttons
356
+ - Custom evaluator support via Command Palette
357
+ - Sidebar with evaluation history
358
+ - Dataset file evaluation (JSON + JSONL)
359
+
360
+ [Full VS Code tutorial →](../vscode/README.md)
361
+
362
+ ---
363
+
364
+ ## Requirements
365
+
366
+ - Python 3.10+
@@ -0,0 +1,14 @@
1
+ aievaluator/__init__.py,sha256=Qx3qP1jslumH1wgtyTm2pGCVIu0-RZRbeWzFn3GzSIU,98
2
+ aievaluator/cli.py,sha256=o7gC1yfIuqUf0gYVJSrf2ap1oBE2UmbF8Xx3r5RRLUc,21339
3
+ aievaluator/config.py,sha256=5hm0wwEebZhrsbzV-nM0YpsyqpgV9ILe5c3DlYAE_r0,3755
4
+ aievaluator/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ aievaluator/api/client.py,sha256=dG-uH_TiL-mSv5PKDmQN5kEscdnyn-f5ZZeiRjYhgAQ,6423
6
+ aievaluator/formatters/__init__.py,sha256=mLZD18yR0ioyxPDwnkzVy8wsG9vULw-BqNsQUfJPxpA,201
7
+ aievaluator/formatters/json.py,sha256=MQ681J8CVtRzldfmWw6MlwbU7jqIKrDIXJk2gHm8F2I,910
8
+ aievaluator/formatters/junit.py,sha256=iwDbnYdDdr87UWtPGLUArQMYyBb6zfuQ_OM1pgccBB0,1552
9
+ aievaluator/formatters/table.py,sha256=OXou3cuev2PRZSQcC4MTU0l8qllY2j-kTzOvsGKEpkY,2197
10
+ aievaluator-1.0.1.dist-info/METADATA,sha256=OsVHIgncs6cxs3WT7iV3tXvX88YWxxSNEbd77tuPfsE,9885
11
+ aievaluator-1.0.1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
12
+ aievaluator-1.0.1.dist-info/entry_points.txt,sha256=9h3s8XUIa2RZ9m-2Bh3e78elTimvVglfU7LvA9gxY4k,53
13
+ aievaluator-1.0.1.dist-info/top_level.txt,sha256=IVDZHg6brLn7wBoRdLH_mbjntqKoEppiF_dZ_VVLL2E,12
14
+ aievaluator-1.0.1.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ aievaluator = aievaluator.cli:main
@@ -0,0 +1 @@
1
+ aievaluator