aievaluator 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aievaluator/__init__.py +3 -0
- aievaluator/api/__init__.py +0 -0
- aievaluator/api/client.py +178 -0
- aievaluator/cli.py +532 -0
- aievaluator/config.py +115 -0
- aievaluator/formatters/__init__.py +7 -0
- aievaluator/formatters/json.py +28 -0
- aievaluator/formatters/junit.py +46 -0
- aievaluator/formatters/table.py +53 -0
- aievaluator-1.0.1.dist-info/METADATA +366 -0
- aievaluator-1.0.1.dist-info/RECORD +14 -0
- aievaluator-1.0.1.dist-info/WHEEL +5 -0
- aievaluator-1.0.1.dist-info/entry_points.txt +2 -0
- aievaluator-1.0.1.dist-info/top_level.txt +1 -0
aievaluator/__init__.py
ADDED
|
File without changes
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
"""HTTP client for AI Evaluator Engine API."""
|
|
2
|
+
|
|
3
|
+
import json as json_mod
|
|
4
|
+
from typing import Any, Optional
|
|
5
|
+
|
|
6
|
+
import httpx
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class APIError(Exception):
    """Error raised for a failed call to the AI Evaluator API.

    Attributes:
        status_code: HTTP status code of the failing response. The client in
            this module passes 0 when no response was received at all
            (connection failure or timeout).
        message: Short human-readable summary; also the exception text.
        detail: Optional extra payload (decoded JSON body or raw text).
    """

    def __init__(self, status_code: int, message: str, detail: Any = None):
        super().__init__(message)
        self.status_code = status_code
        self.message = message
        self.detail = detail
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class APIClient:
|
|
20
|
+
"""Thin HTTP wrapper around the AI Evaluator Engine API."""
|
|
21
|
+
|
|
22
|
+
def __init__(self, engine_url: str, api_key: Optional[str] = None, timeout: int = 300):
|
|
23
|
+
self.engine_url = engine_url.rstrip("/")
|
|
24
|
+
self.api_key = api_key
|
|
25
|
+
self.timeout = timeout
|
|
26
|
+
|
|
27
|
+
def _headers(self) -> dict:
|
|
28
|
+
h = {"Content-Type": "application/json"}
|
|
29
|
+
if self.api_key:
|
|
30
|
+
h["X-API-Key"] = self.api_key
|
|
31
|
+
return h
|
|
32
|
+
|
|
33
|
+
async def _request(
|
|
34
|
+
self,
|
|
35
|
+
method: str,
|
|
36
|
+
path: str,
|
|
37
|
+
json_data: Optional[dict] = None,
|
|
38
|
+
data: Optional[dict] = None,
|
|
39
|
+
files: Optional[dict] = None,
|
|
40
|
+
) -> dict:
|
|
41
|
+
"""Make an HTTP request to the engine. Raises APIError on failure."""
|
|
42
|
+
url = f"{self.engine_url}{path}"
|
|
43
|
+
async with httpx.AsyncClient(timeout=self.timeout) as client:
|
|
44
|
+
try:
|
|
45
|
+
if files:
|
|
46
|
+
resp = await client.request(
|
|
47
|
+
method, url, data=data, files=files, headers={"X-API-Key": self.api_key} if self.api_key else {},
|
|
48
|
+
)
|
|
49
|
+
else:
|
|
50
|
+
resp = await client.request(
|
|
51
|
+
method, url, json=json_data, headers=self._headers(),
|
|
52
|
+
)
|
|
53
|
+
except httpx.ConnectError:
|
|
54
|
+
raise APIError(0, f"Cannot connect to {self.engine_url}")
|
|
55
|
+
except httpx.TimeoutException:
|
|
56
|
+
raise APIError(0, f"Request timed out after {self.timeout}s")
|
|
57
|
+
|
|
58
|
+
if resp.status_code >= 400:
|
|
59
|
+
detail = None
|
|
60
|
+
try:
|
|
61
|
+
detail = resp.json()
|
|
62
|
+
except Exception:
|
|
63
|
+
detail = resp.text
|
|
64
|
+
raise APIError(resp.status_code, f"Engine returned HTTP {resp.status_code}", detail)
|
|
65
|
+
|
|
66
|
+
return resp.json()
|
|
67
|
+
|
|
68
|
+
async def health(self) -> dict:
|
|
69
|
+
"""GET /health"""
|
|
70
|
+
return await self._request("GET", "/health")
|
|
71
|
+
|
|
72
|
+
async def get_usage(self) -> dict:
|
|
73
|
+
"""GET /api/v1/tenants/me/usage"""
|
|
74
|
+
return await self._request("GET", "/api/v1/tenants/me/usage")
|
|
75
|
+
|
|
76
|
+
async def evaluate_sync(
|
|
77
|
+
self,
|
|
78
|
+
rows: list[dict],
|
|
79
|
+
agent_url: str,
|
|
80
|
+
agent_format: str = "openai",
|
|
81
|
+
metrics: Optional[list[str]] = None,
|
|
82
|
+
judge_model: Optional[str] = None,
|
|
83
|
+
name: Optional[str] = None,
|
|
84
|
+
custom_evaluators: list[dict] | None = None,
|
|
85
|
+
thresholds: Optional[dict[str, float]] = None,
|
|
86
|
+
) -> dict:
|
|
87
|
+
"""POST /api/v1/evaluations/sync"""
|
|
88
|
+
agent_json = {"url": agent_url, "format": agent_format}
|
|
89
|
+
body = {
|
|
90
|
+
"rows": rows,
|
|
91
|
+
"agent": agent_json,
|
|
92
|
+
"metrics": metrics or ["faithfulness", "g_eval"],
|
|
93
|
+
"custom_evaluators": custom_evaluators or [],
|
|
94
|
+
}
|
|
95
|
+
if name:
|
|
96
|
+
body["name"] = name
|
|
97
|
+
if judge_model:
|
|
98
|
+
body["judge_model"] = judge_model
|
|
99
|
+
if thresholds:
|
|
100
|
+
body["thresholds"] = thresholds
|
|
101
|
+
|
|
102
|
+
return await self._request("POST", "/api/v1/evaluations/sync", json_data=body)
|
|
103
|
+
|
|
104
|
+
async def evaluate_upload(
|
|
105
|
+
self,
|
|
106
|
+
file_path: str,
|
|
107
|
+
agent_url: str,
|
|
108
|
+
agent_format: str = "openai",
|
|
109
|
+
metrics: Optional[str] = None,
|
|
110
|
+
) -> dict:
|
|
111
|
+
"""POST /api/v1/evaluations/sync/upload (multipart form upload)."""
|
|
112
|
+
import os
|
|
113
|
+
|
|
114
|
+
url = f"{self.engine_url}/api/v1/evaluations/sync/upload"
|
|
115
|
+
async with httpx.AsyncClient(timeout=self.timeout) as client:
|
|
116
|
+
try:
|
|
117
|
+
with open(file_path, "rb") as f:
|
|
118
|
+
files = {"file": (os.path.basename(file_path), f, "application/json")}
|
|
119
|
+
data = {
|
|
120
|
+
"agent_endpoint": agent_url,
|
|
121
|
+
"agent_format": agent_format,
|
|
122
|
+
"metrics": metrics or "faithfulness,g_eval",
|
|
123
|
+
}
|
|
124
|
+
resp = await client.post(
|
|
125
|
+
url,
|
|
126
|
+
data=data,
|
|
127
|
+
files=files,
|
|
128
|
+
headers={"X-API-Key": self.api_key} if self.api_key else {},
|
|
129
|
+
)
|
|
130
|
+
except httpx.ConnectError:
|
|
131
|
+
raise APIError(0, f"Cannot connect to {self.engine_url}")
|
|
132
|
+
except httpx.TimeoutException:
|
|
133
|
+
raise APIError(0, f"Request timed out after {self.timeout}s")
|
|
134
|
+
|
|
135
|
+
if resp.status_code >= 400:
|
|
136
|
+
detail = None
|
|
137
|
+
try:
|
|
138
|
+
detail = resp.json()
|
|
139
|
+
except Exception:
|
|
140
|
+
detail = resp.text
|
|
141
|
+
raise APIError(resp.status_code, f"Engine returned HTTP {resp.status_code}", detail)
|
|
142
|
+
|
|
143
|
+
return resp.json()
|
|
144
|
+
|
|
145
|
+
async def playground_evaluate(
|
|
146
|
+
self,
|
|
147
|
+
queries: Optional[list[str]] = None,
|
|
148
|
+
rows: Optional[list[dict]] = None,
|
|
149
|
+
agent_endpoint: Optional[str] = None,
|
|
150
|
+
agent_config: Optional[dict] = None,
|
|
151
|
+
metrics: Optional[list] = None,
|
|
152
|
+
judge: Optional[str] = None,
|
|
153
|
+
) -> dict:
|
|
154
|
+
"""POST /api/v1/playground/evaluate (no auth required).
|
|
155
|
+
|
|
156
|
+
metrics accepts strings or dicts with thresholds:
|
|
157
|
+
["g_eval"] or [{"name": "g_eval", "threshold": 0.9}]
|
|
158
|
+
"""
|
|
159
|
+
body: dict = {"metrics": metrics or ["faithfulness", "g_eval"]}
|
|
160
|
+
if queries:
|
|
161
|
+
body["queries"] = queries
|
|
162
|
+
if rows:
|
|
163
|
+
body["rows"] = rows
|
|
164
|
+
if agent_config:
|
|
165
|
+
body["agent"] = agent_config
|
|
166
|
+
if agent_endpoint:
|
|
167
|
+
body["agent_endpoint"] = agent_endpoint
|
|
168
|
+
if judge:
|
|
169
|
+
body["judge"] = judge
|
|
170
|
+
return await self._request("POST", "/api/v1/playground/evaluate", json_data=body)
|
|
171
|
+
|
|
172
|
+
async def playground_status(self) -> dict:
|
|
173
|
+
"""GET /api/v1/playground/status (no auth required)."""
|
|
174
|
+
async with httpx.AsyncClient(timeout=10) as client:
|
|
175
|
+
resp = await client.get(f"{self.engine_url}/api/v1/playground/status")
|
|
176
|
+
if resp.status_code >= 400:
|
|
177
|
+
return {"used": 0, "limit": 5, "remaining": 5, "resets_at": "midnight UTC"}
|
|
178
|
+
return resp.json()
|
aievaluator/cli.py
ADDED
|
@@ -0,0 +1,532 @@
|
|
|
1
|
+
"""AI Evaluator CLI — main entry point.
|
|
2
|
+
|
|
3
|
+
Commands:
|
|
4
|
+
aievaluator login Authenticate with AI Evaluator
|
|
5
|
+
aievaluator whoami Show current tenant info
|
|
6
|
+
aievaluator quick Quick eval via playground (no API key)
|
|
7
|
+
aievaluator eval Full evaluation against an agent
|
|
8
|
+
aievaluator config Manage CLI configuration
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import asyncio
|
|
12
|
+
import json as json_mod
|
|
13
|
+
import os
|
|
14
|
+
import sys
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from typing import Optional
|
|
17
|
+
|
|
18
|
+
import click
|
|
19
|
+
|
|
20
|
+
from . import __version__
|
|
21
|
+
from .api.client import APIClient, APIError
|
|
22
|
+
from .config import (
|
|
23
|
+
resolve_api_key,
|
|
24
|
+
resolve_engine_url,
|
|
25
|
+
resolve_default_metrics,
|
|
26
|
+
resolve_default_min_score,
|
|
27
|
+
save_config,
|
|
28
|
+
load_config,
|
|
29
|
+
get_all_config,
|
|
30
|
+
)
|
|
31
|
+
from .formatters import format_table, format_json_output, format_junit
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _parse_dataset_file(file_path: str) -> list[dict]:
|
|
35
|
+
"""Parse a dataset file (JSON or JSONL) into a list of rows."""
|
|
36
|
+
with open(file_path, "r", encoding="utf-8") as f:
|
|
37
|
+
raw = f.read()
|
|
38
|
+
|
|
39
|
+
if file_path.endswith(".jsonl"):
|
|
40
|
+
rows = []
|
|
41
|
+
for line in raw.strip().split("\n"):
|
|
42
|
+
line = line.strip()
|
|
43
|
+
if line:
|
|
44
|
+
rows.append(json_mod.loads(line))
|
|
45
|
+
return rows
|
|
46
|
+
else:
|
|
47
|
+
data = json_mod.loads(raw)
|
|
48
|
+
return data if isinstance(data, list) else [data]
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _run_async(coro):
|
|
52
|
+
"""Helper to run async coroutines from Click commands."""
|
|
53
|
+
return asyncio.run(coro)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _parse_quick_metrics(metrics_str: str | None, default_threshold: float | None = None) -> list | None:
|
|
57
|
+
"""Parse --metrics for quick command.
|
|
58
|
+
|
|
59
|
+
CU1: "faithfulness:0.90,g_eval:0.75" โ [{"name":"faithfulness","threshold":0.9}, ...]
|
|
60
|
+
CU2: "faithfulness,g_eval" with default_threshold=0.8 โ [{"name":"faithfulness","threshold":0.8}, ...]
|
|
61
|
+
Simple: "faithfulness,g_eval" โ ["faithfulness", "g_eval"]
|
|
62
|
+
"""
|
|
63
|
+
if not metrics_str:
|
|
64
|
+
return None
|
|
65
|
+
result = []
|
|
66
|
+
for item in metrics_str.split(","):
|
|
67
|
+
item = item.strip()
|
|
68
|
+
if ":" in item:
|
|
69
|
+
name, val = item.split(":", 1)
|
|
70
|
+
result.append({"name": name.strip(), "threshold": float(val.strip())})
|
|
71
|
+
elif default_threshold is not None:
|
|
72
|
+
result.append({"name": item, "threshold": default_threshold})
|
|
73
|
+
else:
|
|
74
|
+
result.append(item)
|
|
75
|
+
return result
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
# ───────────────────────────────────────────────────────────────────
# CLI Group
# ───────────────────────────────────────────────────────────────────
|
|
81
|
+
|
|
82
|
+
# Root Click group; the subcommands in this module attach to it through
# @main.command() / @main.group(). The docstring doubles as the --help text.
@click.group()
@click.version_option(version=__version__, prog_name="AI Evaluator CLI")
def main():
    """AI Evaluator CLI — evaluate your LLM agents from the command line.

    \b
    Quick start:
      aievaluator quick "What is 2+2?" --expected "4"
      aievaluator login
      aievaluator eval --agent https://my-agent.com/chat --dataset ./tests.json
    """
    pass
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
# ───────────────────────────────────────────────────────────────────
# login
# ───────────────────────────────────────────────────────────────────
|
|
99
|
+
|
|
100
|
+
# `login`: validates the key by fetching usage, then stores the key and the
# resolved engine URL through load_config()/save_config().
# Exit code 2: empty key, or the usage call raised APIError.
@main.command()
@click.option("--api-key", help="API key (non-interactive mode)", default=None)
@click.option("--engine-url", help="Engine URL", default=None)
def login(api_key: Optional[str], engine_url: Optional[str]):
    """Authenticate with AI Evaluator.

    Saves your API key to ~/.config/aievaluator/config.json.
    Get your key at https://aievaluator.dev/settings
    """
    if not api_key:
        click.echo()
        click.echo("Enter your AI Evaluator API key:")
        click.echo("(Get one at https://aievaluator.dev/settings)")
        # NOTE(review): the key is echoed while typing (hide_input=False);
        # confirm that this is intended for a secret.
        api_key = click.prompt("API key", hide_input=False)

    # Strip in both modes so a stray space or newline from a shell variable
    # does not end up in the stored key.
    api_key = api_key.strip()
    if not api_key:
        click.echo("❌ API key cannot be empty.", err=True)
        sys.exit(2)

    resolved_url = resolve_engine_url(engine_url)
    client = APIClient(resolved_url, api_key)

    async def _login():
        # Nothing is saved unless the engine accepts the key.
        try:
            usage = await client.get_usage()
        except APIError as e:
            click.echo(f"❌ Invalid API key or engine unreachable: {e.message}", err=True)
            sys.exit(2)

        # Save to global config
        config = load_config()
        config["api_key"] = api_key
        config["engine_url"] = resolved_url
        save_config(config)

        tenant_name = usage.get("tenant_name", "Unknown")
        tier = usage.get("tier", "unknown")
        evals_used = usage.get("evaluations_this_cycle", 0)
        evals_limit = usage.get("evaluations_limit", "∞")

        click.echo()
        click.echo(f"✅ Logged in as {tenant_name} ({tier})")
        click.echo(f"   Evals: {evals_used}/{evals_limit} this cycle")
        click.echo("   Config saved to ~/.config/aievaluator/config.json")

    _run_async(_login())
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
# ───────────────────────────────────────────────────────────────────
# whoami
# ───────────────────────────────────────────────────────────────────
|
|
151
|
+
|
|
152
|
+
# `whoami`: prints tenant name, tier, evaluation count and token usage taken
# from the usage endpoint. Exit code 2: no API key resolved, or APIError.
@main.command()
@click.option("--api-key", help="API key (overrides config)", default=None)
def whoami(api_key: Optional[str]):
    """Show current tenant info and usage."""
    key = resolve_api_key(api_key)
    if not key:
        click.echo("❌ Not logged in. Run: aievaluator login", err=True)
        sys.exit(2)

    engine_url = resolve_engine_url()
    client = APIClient(engine_url, key)

    async def _whoami():
        try:
            usage = await client.get_usage()
        except APIError as e:
            click.echo(f"❌ {e.message}", err=True)
            sys.exit(2)

        tenant_name = usage.get("tenant_name", "Unknown")
        tier = usage.get("tier", "unknown")
        evals_used = usage.get("evaluations_this_cycle", 0)
        evals_limit = usage.get("evaluations_limit", "∞")
        tokens_in = usage.get("input_tokens_this_cycle", 0)
        tokens_out = usage.get("output_tokens_this_cycle", 0)

        click.echo()
        click.echo(f"Tenant: {tenant_name}")
        click.echo(f"Tier: {tier}")
        click.echo(f"Evals: {evals_used}/{evals_limit} this cycle")
        # NOTE(review): the two arrow glyphs were restored from mis-decoded
        # bytes; confirm against the published package.
        click.echo(f"Tokens: ↑{tokens_in:,} · ↓{tokens_out:,} this cycle")

    _run_async(_whoami())
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
# ───────────────────────────────────────────────────────────────────
# quick
# ───────────────────────────────────────────────────────────────────
|
|
190
|
+
|
|
191
|
+
# `quick`: evaluates one query or a dataset through the playground endpoint.
# Exit codes: 2 for usage errors, an unreadable dataset, an exhausted
# playground quota or an APIError; with --min-score, 0 when every result is
# marked passed and 1 otherwise.
@main.command()
@click.argument("query", required=False)
@click.option("--dataset", "dataset_file", help="JSON dataset file", type=click.Path(exists=True), default=None)
@click.option("--agent", "agent_url", help="Agent endpoint URL (default: internal chat agent)", default="/chat")
@click.option("--expected", help="Expected output for query", default=None)
@click.option("--metrics", help="Metrics: faithfulness,g_eval or faithfulness:0.90,g_eval:0.75", default=None)
@click.option("--min-score", help="Apply threshold to all metrics and enforce exit code", type=float, default=None)
@click.option("--judge", help="LLM judge model", default=None)
@click.option("--engine-url", help="Engine URL", default=None)
def quick(query, dataset_file, agent_url, expected, metrics, min_score, judge, engine_url):
    """Quick evaluation via playground (no API key required).

    \b
    Examples:
      aievaluator quick "What is 2+2?" --expected "4"
      aievaluator quick --dataset ./smoke-tests.json --agent https://my-agent.com/chat
    """
    if not query and not dataset_file:
        click.echo("❌ Provide a query or --dataset", err=True)
        sys.exit(2)
    if query and dataset_file:
        click.echo("❌ Use query OR --dataset, not both", err=True)
        sys.exit(2)

    resolved_url = resolve_engine_url(engine_url)
    client = APIClient(resolved_url)

    # Parse metrics: CU1 (metric:threshold), CU2 (--min-score applies to all).
    # --min-score promises a threshold on every metric, so when --metrics is
    # omitted the threshold is attached to the same two metrics the client
    # would otherwise send by default.
    metrics_spec = metrics
    if not metrics_spec and min_score is not None:
        metrics_spec = "faithfulness,g_eval"
    try:
        metrics_list = _parse_quick_metrics(metrics_spec, min_score)
    except ValueError as exc:
        click.echo(f"❌ Invalid --metrics: {exc}", err=True)
        sys.exit(2)

    async def _quick():
        # Check playground status first. Deliberately best-effort: any
        # failure falls back to an optimistic default quota.
        try:
            status = await client.playground_status()
        except Exception:
            status = {"used": 0, "limit": 5, "remaining": 5, "resets_at": "midnight UTC"}

        remaining = status.get("remaining", 5)
        limit = status.get("limit", 5)
        click.echo(f"⚠️ Playground mode — {remaining}/{limit} remaining (resets at {status.get('resets_at', 'midnight UTC')})")
        click.echo()

        if remaining <= 0:
            click.echo("❌ Playground limit reached. Run `aievaluator login` for 100 free evals/month.", err=True)
            sys.exit(2)

        if query:
            rows = [{"input": query}]
            if expected:
                rows[0]["expected_output"] = expected
        else:
            # Report an unreadable or malformed dataset as a normal CLI error.
            try:
                rows = _parse_dataset_file(dataset_file)
            except (OSError, ValueError) as exc:
                click.echo(f"❌ Cannot read dataset: {exc}", err=True)
                sys.exit(2)

        try:
            result = await client.playground_evaluate(
                rows=rows,
                agent_endpoint=agent_url,
                metrics=metrics_list,
                judge=judge,
            )
        except APIError as e:
            click.echo(f"❌ {e.message}", err=True)
            if e.detail:
                click.echo(json_mod.dumps(e.detail, indent=2), err=True)
            sys.exit(2)

        # A result without a "passed" field counts as passed.
        overall_passed = all(r.get("passed", True) for r in result.get("results", []))
        format_table(result, min_score or 0.0, resolved_url)

        # CU2: exit code based on --min-score
        if min_score is not None:
            sys.exit(0 if overall_passed else 1)

    _run_async(_quick())
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
# ───────────────────────────────────────────────────────────────────
# eval
# ───────────────────────────────────────────────────────────────────
|
|
270
|
+
|
|
271
|
+
# `eval`: full evaluation against an agent (API key required).
# The command name is given explicitly: Click derives "eval" from `eval_cmd`
# only in newer releases, older ones would register "eval-cmd".
# Exit codes: 2 for usage/input errors, 1 when overall_score < min_score;
# API failures exit through _handle_api_error().
@main.command("eval")
@click.option("--agent", required=True, help="Agent endpoint URL")
@click.option("--dataset", "dataset_file", help="JSON dataset file", type=click.Path(exists=True), default=None)
@click.option("--rows", help="Inline JSON array of test cases", default=None)
@click.option("--metrics", help="Metrics (comma-separated)", default=None)
@click.option("--agent-format", help="Agent API format", type=click.Choice(["openai", "claude", "custom"]), default="openai")
@click.option("--min-score", help="Minimum overall score threshold (0-1)", type=float, default=None)
@click.option("--thresholds", "thresholds_str", help="Per-metric thresholds: faithfulness:0.90,g_eval:0.75", default=None)
@click.option("--custom", "custom_str", help="Inline custom evaluator: {\"name\":\"polite\",\"prompt\":\"Check...\",\"threshold\":0.8}", default=None)
@click.option("--format", "output_format", help="Output format", type=click.Choice(["table", "json", "junit"]), default="table")
@click.option("--ci", is_flag=True, help="CI mode (no colors, no prompts)")
@click.option("--timeout", help="Timeout in seconds", type=int, default=300)
@click.option("--judge-model", help="LLM judge model", default=None)
@click.option("--name", "eval_name", help="Human-readable name for this evaluation", default=None)
@click.option("--api-key", help="API key (overrides config)", default=None)
@click.option("--engine-url", help="Engine URL", default=None)
def eval_cmd(agent, dataset_file, rows, metrics, agent_format, min_score, thresholds_str, custom_str, output_format, ci, timeout, judge_model, eval_name, api_key, engine_url):
    """Evaluate an AI agent against a dataset.

    \b
    Examples:
      aievaluator eval --agent https://my-agent.com/chat --dataset ./tests.json
      aievaluator eval --agent https://my-agent.com/chat --rows '[{"input":"Hi","expected_output":"Hello"}]'
      aievaluator eval --agent $AGENT_URL --dataset ./evals.json --ci --format junit
    """
    # NOTE(review): `ci` is accepted but not used anywhere in this function.

    # Validate data source
    if not dataset_file and not rows:
        click.echo("❌ Provide --dataset or --rows", err=True)
        sys.exit(2)
    if dataset_file and rows:
        click.echo("❌ Use --dataset OR --rows, not both", err=True)
        sys.exit(2)

    key = resolve_api_key(api_key)
    if not key:
        click.echo("❌ API key required. Run: aievaluator login", err=True)
        sys.exit(2)

    resolved_url = resolve_engine_url(engine_url)
    client = APIClient(resolved_url, key, timeout=timeout)

    # Resolve metrics: flag first, configured default otherwise. Both sources
    # are stripped, and blank items are dropped.
    metrics_source = metrics or resolve_default_metrics()
    metrics_list = [m.strip() for m in metrics_source.split(",") if m.strip()]

    # Resolve min_score
    if min_score is None:
        min_score = resolve_default_min_score()

    # Parse per-metric thresholds: "faithfulness:0.90,g_eval:0.75" -> {"faithfulness": 0.90, "g_eval": 0.75}
    thresholds_dict = {}
    if thresholds_str:
        for pair in thresholds_str.split(","):
            pair = pair.strip()
            if not pair:
                continue  # tolerate a trailing comma
            metric_name, sep, val = pair.partition(":")
            if not sep or not metric_name.strip():
                # A pair without "metric:" must not be dropped silently: the
                # run would then be gated without the threshold the user set.
                click.echo(f"❌ Invalid threshold (expected metric:value): {pair}", err=True)
                sys.exit(2)
            try:
                thresholds_dict[metric_name.strip()] = float(val.strip())
            except ValueError:
                click.echo(f"❌ Invalid threshold value in: {pair}", err=True)
                sys.exit(2)

    # CU3: parse inline custom evaluator (a single object is wrapped in a list)
    custom_evaluators = None
    if custom_str:
        try:
            custom_evaluators = json_mod.loads(custom_str)
        except json_mod.JSONDecodeError:
            click.echo("❌ Invalid JSON in --custom", err=True)
            sys.exit(2)
        if isinstance(custom_evaluators, dict):
            custom_evaluators = [custom_evaluators]

    async def _eval():
        if dataset_file:
            # OSError covers a missing or unreadable file, ValueError covers
            # invalid JSON and undecodable bytes.
            try:
                rows_data = _parse_dataset_file(dataset_file)
            except (OSError, ValueError) as e:
                click.echo(f"❌ Cannot read dataset: {e}", err=True)
                sys.exit(2)
        else:
            try:
                rows_data = json_mod.loads(rows)
            except json_mod.JSONDecodeError as e:
                click.echo(f"❌ Invalid JSON in --rows: {e}", err=True)
                sys.exit(2)
            if not isinstance(rows_data, list):
                rows_data = [rows_data]

        try:
            result = await client.evaluate_sync(
                rows=rows_data,
                agent_url=agent,
                agent_format=agent_format,
                metrics=metrics_list,
                judge_model=judge_model,
                name=eval_name,
                thresholds=thresholds_dict if thresholds_dict else None,
                custom_evaluators=custom_evaluators,
            )
        except APIError as e:
            # Prints the error and exits; execution does not continue here.
            _handle_api_error(e)

        # Format output
        if output_format == "json":
            click.echo(format_json_output(result, min_score))
        elif output_format == "junit":
            click.echo(format_junit(result, min_score))
        else:
            format_table(result, min_score, resolved_url)

        # Exit code: a missing or null score counts as 0.
        overall_score = result.get("overall_score") or 0
        if overall_score < min_score:
            sys.exit(1)

    _run_async(_eval())
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
def _handle_api_error(e: APIError):
    """Print API error and exit.

    The message goes to stderr. A dict or list detail is pretty-printed as
    JSON; any other truthy detail is printed as text cut to 500 characters.
    Exits with status 3 when ``e.status_code`` is 0 and with 2 otherwise.
    """
    click.echo(f"❌ {e.message}", err=True)
    if e.detail:
        if isinstance(e.detail, (dict, list)):
            click.echo(json_mod.dumps(e.detail, indent=2), err=True)
        else:
            click.echo(str(e.detail)[:500], err=True)
    sys.exit(3 if e.status_code == 0 else 2)
|
|
403
|
+
|
|
404
|
+
|
|
405
|
+
# ───────────────────────────────────────────────────────────────────
# config
# ───────────────────────────────────────────────────────────────────
|
|
408
|
+
|
|
409
|
+
# Sub-group that hosts the `config show` / `config set` / `config unset` commands.
@main.group()
def config():
    """Manage CLI configuration."""
    pass
|
|
413
|
+
|
|
414
|
+
|
|
415
|
+
# `config show`: prints whatever get_all_config() returns as indented JSON,
# or a hint to log in when that result is empty.
@config.command("show")
def config_show():
    """Show current configuration."""
    cfg = get_all_config()
    if not cfg:
        click.echo("No configuration found. Run: aievaluator login")
        return
    click.echo(json_mod.dumps(cfg, indent=2))
|
|
423
|
+
|
|
424
|
+
|
|
425
|
+
# `config set`: stores one setting through load_config()/save_config().
# Exit code 2: unknown key, or default-min-score is not a number in 0-1.
@config.command("set")
@click.argument("key")
@click.argument("value")
def config_set(key: str, value: str):
    """Set a configuration value.

    \b
    Keys: engine-url, default-metrics, default-min-score
    """
    valid_keys = {"engine-url", "default-metrics", "default-min-score"}
    if key not in valid_keys:
        # sorted() keeps the listing stable between runs.
        click.echo(f"❌ Invalid key: {key}. Valid keys: {', '.join(sorted(valid_keys))}", err=True)
        sys.exit(2)

    # Keys are typed with hyphens on the command line, but the config files
    # written elsewhere in this module (login, init) use underscores, so the
    # value is stored under the underscored name.
    stored_key = key.replace("-", "_")

    cfg = load_config()
    if key == "default-min-score":
        try:
            score = float(value)
        except ValueError:
            click.echo("❌ default-min-score must be a number (0-1)", err=True)
            sys.exit(2)
        if not 0.0 <= score <= 1.0:  # also rejects NaN
            click.echo("❌ default-min-score must be a number (0-1)", err=True)
            sys.exit(2)
        cfg[stored_key] = score
    else:
        cfg[stored_key] = value

    # Remove a hyphenated entry left behind by the previous behaviour.
    cfg.pop(key, None)
    save_config(cfg)
    click.echo(f"✅ {key} = {value}")
|
|
450
|
+
|
|
451
|
+
|
|
452
|
+
# `config unset`: removes one setting. Both the key as typed and its
# underscored form are removed, because the config files written elsewhere
# in this module use underscored names while the CLI documents hyphens.
@config.command("unset")
@click.argument("key")
def config_unset(key: str):
    """Remove a configuration value."""
    cfg = load_config()
    stored_names = [name for name in {key, key.replace("-", "_")} if name in cfg]
    if not stored_names:
        click.echo(f"{key} was not set")
        return

    for name in stored_names:
        del cfg[name]
    save_config(cfg)
    click.echo(f"✅ {key} removed")
|
|
463
|
+
|
|
464
|
+
|
|
465
|
+
# ───────────────────────────────────────────────────────────────────
# init
# ───────────────────────────────────────────────────────────────────
|
|
468
|
+
|
|
469
|
+
_SMOKE_TEST_DATASET = [
|
|
470
|
+
{"input": "What is 2+2?", "expected_output": "4"},
|
|
471
|
+
{"input": "What is the capital of France?", "expected_output": "Paris"},
|
|
472
|
+
{"input": "Say hello in Spanish", "expected_output": "Hola"},
|
|
473
|
+
]
|
|
474
|
+
|
|
475
|
+
|
|
476
|
+
# `init`: scaffolds project files in the current working directory.
# Existing files are never overwritten.
@main.command()
def init():
    """Initialize a new AI Evaluator project in the current directory.

    Creates:
    - aievaluator.config.json (project-local config)
    - evals/smoke-test.json (example dataset)
    - Updates .gitignore
    """
    cwd = Path.cwd()

    # 1. Create aievaluator.config.json
    config_path = cwd / "aievaluator.config.json"
    if config_path.exists():
        click.echo("⏭️ aievaluator.config.json already exists, skipping")
    else:
        config_path.write_text(json_mod.dumps({
            "engine_url": "https://api.aievaluator.dev",
            "default_metrics": "faithfulness,g_eval",
            "default_min_score": 0.80,
        }, indent=2) + "\n")
        click.echo("✅ Created aievaluator.config.json")

    # 2. Create evals/ directory + smoke-test.json
    evals_dir = cwd / "evals"
    evals_dir.mkdir(exist_ok=True)
    smoke_path = evals_dir / "smoke-test.json"
    if smoke_path.exists():
        click.echo("⏭️ evals/smoke-test.json already exists, skipping")
    else:
        smoke_path.write_text(json_mod.dumps(_SMOKE_TEST_DATASET, indent=2) + "\n")
        click.echo("✅ Created evals/smoke-test.json (3 example queries)")

    # 3. Update .gitignore
    gitignore_path = cwd / ".gitignore"
    existing = gitignore_path.read_text() if gitignore_path.exists() else ""
    entry = "aievaluator.config.json"
    # Compare stripped lines so CRLF endings or trailing spaces do not hide
    # an existing entry and cause a duplicate on every run.
    already_listed = any(line.strip() == entry for line in existing.splitlines())
    if not already_listed:
        with open(gitignore_path, "a") as f:
            # Start on a fresh line when the file does not end with one.
            if existing and not existing.endswith("\n"):
                f.write("\n")
            f.write(f"{entry}\n")
        click.echo(f"✅ Added {entry} to .gitignore")

    click.echo()
    click.echo("Next steps:")
    click.echo(" aievaluator quick --dataset ./evals/smoke-test.json")
    click.echo(" aievaluator login (for 100 free evals/month)")
    click.echo()
|
|
525
|
+
|
|
526
|
+
|
|
527
|
+
# ───────────────────────────────────────────────────────────────────
# Entry point
# ───────────────────────────────────────────────────────────────────
|
|
530
|
+
|
|
531
|
+
# Run the CLI when this module is executed as a script.
if __name__ == "__main__":
    main()
|
aievaluator/config.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
"""Config manager for AI Evaluator CLI.
|
|
2
|
+
|
|
3
|
+
Handles API key resolution with priority:
|
|
4
|
+
1. --api-key flag
|
|
5
|
+
2. AIEVALUATOR_API_KEY env var
|
|
6
|
+
3. ./aievaluator.config.json (project-local)
|
|
7
|
+
4. ~/.config/aievaluator/config.json (global)
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
import os
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Optional
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _global_config_path() -> Path:
|
|
17
|
+
"""Returns the global config path, platform-aware."""
|
|
18
|
+
if os.name == "nt":
|
|
19
|
+
base = Path(os.environ.get("APPDATA", Path.home() / "AppData" / "Roaming"))
|
|
20
|
+
elif os.environ.get("XDG_CONFIG_HOME"):
|
|
21
|
+
base = Path(os.environ["XDG_CONFIG_HOME"])
|
|
22
|
+
else:
|
|
23
|
+
base = Path.home() / ".config"
|
|
24
|
+
return base / "aievaluator" / "config.json"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _load_json(path: Path) -> dict:
|
|
28
|
+
"""Load a JSON file, returning {} if not found or invalid."""
|
|
29
|
+
try:
|
|
30
|
+
with open(path) as f:
|
|
31
|
+
return json.load(f)
|
|
32
|
+
except (FileNotFoundError, json.JSONDecodeError):
|
|
33
|
+
return {}
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _save_json(path: Path, data: dict) -> None:
|
|
37
|
+
"""Save data as JSON, creating parent dirs."""
|
|
38
|
+
path.parent.mkdir(parents=True, exist_ok=True)
|
|
39
|
+
with open(path, "w") as f:
|
|
40
|
+
json.dump(data, f, indent=2)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def resolve_api_key(flag_value: Optional[str] = None) -> Optional[str]:
    """Find the API key, checking sources from highest to lowest priority.

    Order: the explicit ``flag_value``, the ``AIEVALUATOR_API_KEY``
    environment variable, the project-local ``aievaluator.config.json``,
    and finally the global config file. Empty values at the first three
    levels are skipped.

    Returns:
        The first key found, or whatever the global config holds under
        ``api_key`` (``None`` when that entry is absent).
    """
    if flag_value:
        return flag_value

    from_env = os.environ.get("AIEVALUATOR_API_KEY")
    if from_env:
        return from_env

    project_key = _load_json(Path("aievaluator.config.json")).get("api_key")
    if project_key:
        return project_key

    return _load_json(_global_config_path()).get("api_key")
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def resolve_engine_url(flag_value: Optional[str] = None) -> str:
    """Find the engine base URL, checking sources in priority order.

    Order: the explicit ``flag_value``, the ``AIEVALUATOR_ENGINE_URL``
    environment variable, the project-local ``aievaluator.config.json``,
    the global config file, then the built-in default
    ``https://api.aievaluator.dev``. Empty values are skipped.

    Returns:
        The selected URL with trailing slashes removed.
    """
    fallback = "https://api.aievaluator.dev"

    # The environment is only consulted when no flag value was supplied.
    explicit = flag_value or os.environ.get("AIEVALUATOR_ENGINE_URL")
    if explicit:
        return explicit.rstrip("/")

    project_url = _load_json(Path("aievaluator.config.json")).get("engine_url")
    if project_url:
        return project_url.rstrip("/")

    user_url = _load_json(_global_config_path()).get("engine_url")
    if user_url:
        return user_url.rstrip("/")

    return fallback
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def resolve_default_metrics() -> str:
    """Resolve the default metrics string from config.

    The project-local ``aievaluator.config.json`` takes precedence over the
    global config file. A missing, null or empty ``default_metrics`` entry
    at either level is skipped.

    Returns:
        The configured value, or ``"faithfulness,g_eval"`` when neither
        config provides one.
    """
    default = "faithfulness,g_eval"
    local = _load_json(Path("aievaluator.config.json"))
    if local.get("default_metrics"):
        return local["default_metrics"]
    global_cfg = _load_json(_global_config_path())
    # `or default` (rather than .get(key, default)) so a null/empty global
    # entry falls back to the default, matching the project-local check.
    return global_cfg.get("default_metrics") or default
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def resolve_default_min_score() -> float:
    """Resolve the default minimum score from config.

    A ``default_min_score`` key present in the project-local
    ``aievaluator.config.json`` wins; otherwise the global config is
    consulted, with ``0.0`` used when the key is absent there too. The
    stored value is converted with ``float()``.
    """
    key = "default_min_score"
    project_cfg = _load_json(Path("aievaluator.config.json"))
    if key in project_cfg:
        return float(project_cfg[key])
    user_cfg = _load_json(_global_config_path())
    configured = user_cfg.get(key, 0.0)
    return float(configured)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def save_config(data: dict, global_: bool = True) -> None:
    """Write *data* out as the config file.

    The global config path is used when ``global_`` is true; otherwise the
    file is ``aievaluator.config.json``, a relative path.
    """
    if global_:
        destination = _global_config_path()
    else:
        destination = Path("aievaluator.config.json")
    _save_json(destination, data)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def load_config(global_: bool = True) -> dict:
    """Read one config file and return its contents as a dict.

    The global config path is read when ``global_`` is true; otherwise the
    file is ``aievaluator.config.json``, a relative path.
    """
    if global_:
        source = _global_config_path()
    else:
        source = Path("aievaluator.config.json")
    return _load_json(source)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def get_all_config() -> dict:
    """Return the effective config: global values overlaid by project-local ones.

    Keys present in both files take their value from
    ``aievaluator.config.json``.
    """
    combined = dict(_load_json(_global_config_path()))
    combined.update(_load_json(Path("aievaluator.config.json")))
    return combined
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""JSON formatter."""
|
|
2
|
+
|
|
3
|
+
import json as json_mod
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def format_json_output(data: dict, min_score: float = 0.0) -> str:
    """Serialize an evaluation response into the CLI's JSON report.

    Args:
        data: Evaluation payload; missing keys fall back to empty/zero
            values (``total_rows`` falls back to the number of results).
        min_score: Threshold the overall score is compared against.

    Returns:
        A 2-space-indented JSON string. ``passed`` is true when the overall
        score is at least ``min_score``; ``failed_queries`` counts results
        whose ``passed`` field is falsy.
    """
    rows = data.get("results", [])
    score = data.get("overall_score", 0)
    failed_count = len([row for row in rows if not row.get("passed", True)])

    report = {
        "evaluation_id": data.get("evaluation_id", ""),
        "overall_score": score,
        "passed": score >= min_score,
        "min_score": min_score,
        "total_rows": data.get("total_rows", len(rows)),
        "failed_queries": failed_count,
        "input_tokens": data.get("input_tokens", 0),
        "output_tokens": data.get("output_tokens", 0),
        "results": rows,
    }
    return json_mod.dumps(report, indent=2)
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""JUnit XML formatter for CI/CD integration."""
|
|
2
|
+
|
|
3
|
+
import xml.etree.ElementTree as ET
|
|
4
|
+
from xml.dom import minidom
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# Code points that XML 1.0 does not allow in character data. They are removed
# before serializing, because the pretty-printing step re-parses the document
# and would otherwise reject it.
_XML_ILLEGAL_CHARS = dict.fromkeys(
    [c for c in range(0x20) if c not in (0x09, 0x0A, 0x0D)]
    + list(range(0xD800, 0xE000))
    + [0xFFFE, 0xFFFF]
)


def _xml_safe(value) -> str:
    """Return *value* as text with XML-illegal characters removed (falsy -> "")."""
    return str(value or "").translate(_XML_ILLEGAL_CHARS)


def _format_score(value) -> str:
    """Render a score with two decimals, or as plain text if it is not numeric."""
    try:
        return f"{value:.2f}"
    except (TypeError, ValueError):
        return str(value)


def format_junit(data: dict, min_score: float = 0.0) -> str:
    """Return evaluation results as a JUnit XML string.

    One ``<testcase>`` is emitted per entry in ``data["results"]``; entries
    whose ``passed`` field is falsy also get a ``<failure>`` child carrying
    the query, expected output, agent response and per-metric scores.

    Args:
        data: Evaluation payload. A missing or null ``results`` is treated
            as an empty list.
        min_score: Threshold quoted in each failure message.

    Returns:
        Pretty-printed XML with a ``<testsuite>`` root element.
    """
    results = data.get("results") or []
    total = len(results)
    failures = sum(1 for r in results if not r.get("passed", True))

    testsuite = ET.Element("testsuite", {
        "name": "AI Evaluator",
        "tests": str(total),
        "failures": str(failures),
        "errors": "0",
        "time": "0",
    })

    for index, r in enumerate(results, start=1):
        # Sanitize before truncating so the 80-character budget is spent on
        # characters that actually reach the report.
        query = _xml_safe(r.get("query"))[:80]
        testcase = ET.SubElement(testsuite, "testcase", {
            "classname": "AI Evaluator",
            "name": f"Query {index}: {query}",
            "time": "0",
        })

        if r.get("passed", True):
            continue

        scores = r.get("scores") or {}
        scores_str = ", ".join(
            f"{_xml_safe(k)}: {_format_score(v)}" for k, v in scores.items()
        )
        expected = _xml_safe(r.get("expected_output"))
        got = _xml_safe(r.get("agent_response"))

        failure_text = (
            f"Query: {query}\n"
            f"Expected: {expected}\n"
            f"Got: {got}\n"
            f"Scores: {{{scores_str}}}"
        )
        failure = ET.SubElement(testcase, "failure", {
            "message": f"Score below threshold {min_score}",
        })
        failure.text = failure_text

    xml_str = ET.tostring(testsuite, encoding="unicode")
    return minidom.parseString(xml_str).toprettyxml(indent=" ")
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""Table formatter using Rich."""
|
|
2
|
+
|
|
3
|
+
from rich.console import Console
|
|
4
|
+
from rich.table import Table
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def format_table(data: dict, min_score: float, engine_url: str) -> None:
    """Print evaluation results as a Rich summary plus a per-query table.

    Args:
        data: Evaluation payload. Missing keys fall back to empty/zero
            values; a null ``results`` is treated as an empty list.
        min_score: Threshold (as a fraction) the overall score is compared
            against.
        engine_url: Base URL used to build the report link, which is shown
            only when the payload carries an ``evaluation_id``.

    The Score column shows the first metric in each row's ``scores`` mapping.
    """
    # Imported here because escaping is only needed by this function; Rich
    # is already a dependency of this module.
    from rich.markup import escape

    console = Console()
    results = data.get("results") or []
    overall_score = data.get("overall_score", 0)
    total_rows = data.get("total_rows", len(results))
    failed = sum(1 for r in results if not r.get("passed", True))
    input_tokens = data.get("input_tokens", 0)
    output_tokens = data.get("output_tokens", 0)
    eval_id = data.get("evaluation_id", "")

    score_pct = overall_score * 100
    passed = overall_score >= min_score
    icon = "✅" if passed else "❌"

    console.print()
    console.print(" [bold]AI Evaluator — Results[/bold]")
    console.print(f" Overall Score: [bold]{score_pct:.1f}%[/bold] {icon} {'above' if passed else 'below'} threshold ({min_score*100:.0f}%)")
    console.print(f" Total rows: {total_rows}")
    console.print(f" Failed: {failed}")
    console.print(f" Tokens: ↑{input_tokens:,} · ↓{output_tokens:,}")
    if eval_id:
        report_url = f"{engine_url}/evaluations/{eval_id}/report"
        console.print(f" Dashboard: [link={report_url}]{report_url}[/link]")
    console.print()

    table = Table(show_header=True, header_style="bold")
    table.add_column("#", style="dim", width=4)
    table.add_column("Query", max_width=50)
    table.add_column("Score", justify="right", width=8)
    table.add_column("Pass", justify="center", width=6)

    for row_number, r in enumerate(results, start=1):
        # Truncate first, then escape: query text is user data and must not
        # be interpreted as Rich markup (a stray "[/x]" would raise).
        query = escape(str(r.get("query") or "")[:50])
        scores = r.get("scores") or {}
        first_score = next(iter(scores.values()), 0)
        if isinstance(first_score, (int, float)):
            score_str = f"{first_score * 100:.0f}%"
        else:
            # A metric without a numeric score is shown as a placeholder
            # instead of aborting the whole report.
            score_str = "-"
        passed_icon = "✅" if r.get("passed", True) else "❌"
        table.add_row(str(row_number), query, score_str, passed_icon)

    console.print(table)
    console.print()

    if passed:
        console.print(f"[green]✅ Score {score_pct:.1f}% meets threshold {min_score}[/green]")
    else:
        console.print(f"[red]❌ Score {score_pct:.1f}% below threshold {min_score}[/red]")
    console.print()
|
|
@@ -0,0 +1,366 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: aievaluator
|
|
3
|
+
Version: 1.0.1
|
|
4
|
+
Summary: AI Evaluator CLI — evaluate your LLM agents from the command line
|
|
5
|
+
Author-email: AI Evaluator <support@aievaluator.dev>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://aievaluator.dev
|
|
8
|
+
Project-URL: Repository, https://github.com/aievaluator-dev/aievaluator-cli
|
|
9
|
+
Project-URL: Issues, https://github.com/aievaluator-dev/aievaluator-cli/issues
|
|
10
|
+
Keywords: ai,evaluation,llm,agent,testing,ci-cd
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Software Development :: Testing
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
Requires-Dist: click>=8.1
|
|
24
|
+
Requires-Dist: httpx>=0.27
|
|
25
|
+
Requires-Dist: rich>=13.0
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
28
|
+
Requires-Dist: pytest-asyncio>=0.24; extra == "dev"
|
|
29
|
+
Requires-Dist: pytest-httpx>=0.30; extra == "dev"
|
|
30
|
+
|
|
31
|
+
# AI Evaluator CLI — Python
|
|
32
|
+
|
|
33
|
+
[](https://pypi.org/project/aievaluator/)
|
|
34
|
+
[](https://pypi.org/project/aievaluator/)
|
|
35
|
+
|
|
36
|
+
Evaluate your LLM agents from the terminal. No browser. No dashboard.
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
pip install aievaluator
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
---
|
|
43
|
+
|
|
44
|
+
## 🧭 Tutorial — From Zero to CI/CD
|
|
45
|
+
|
|
46
|
+
Every step builds on the previous one. Start wherever makes sense for you.
|
|
47
|
+
|
|
48
|
+
---
|
|
49
|
+
|
|
50
|
+
### Level 0 — Try it without installing anything
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
curl -s -X POST https://api.aievaluator.dev/api/v1/playground/evaluate \
|
|
54
|
+
-H "Content-Type: application/json" \
|
|
55
|
+
-d '{"queries":["What is 2+2?"],"metrics":["faithfulness"]}' | jq .
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
5 free per day. No key. No install. Good enough to decide if it's useful.
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
### Level 1 — Install and evaluate a single prompt
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
pip install aievaluator
|
|
66
|
+
|
|
67
|
+
# Ask a question, tell it what you expect
|
|
68
|
+
aievaluator quick "What is the capital of France?" --expected "Paris"
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
You'll see a table with the score. The `--expected` is optional — without it, the judge evaluates
|
|
72
|
+
the response on its own merits.
|
|
73
|
+
|
|
74
|
+
```
|
|
75
|
+
⚠️ Playground mode — 4/5 remaining
|
|
76
|
+
|
|
77
|
+
AI Evaluator — Results
|
|
78
|
+
Overall Score: 95.0% ✅ above threshold (0%)
|
|
79
|
+
Total rows: 1
|
|
80
|
+
Failed: 0
|
|
81
|
+
|
|
82
|
+
┌────┬────────────────────────────────────┬──────────┬──────┐
|
|
83
|
+
│ #  │ Query                              │    Score │ Pass │
|
|
84
|
+
├────┼────────────────────────────────────┼──────────┼──────┤
|
|
85
|
+
│ 1  │ What is the capital of France?     │      95% │  ✅  │
|
|
86
|
+
└────┴────────────────────────────────────┴──────────┴──────┘
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
---
|
|
90
|
+
|
|
91
|
+
### Level 2 — Sign up and scaffold a project
|
|
92
|
+
|
|
93
|
+
Playground is great for trying, but you'll want more than 5 evals/day.
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
# Get your API key at https://aievaluator.dev/settings
|
|
97
|
+
aievaluator login
|
|
98
|
+
|
|
99
|
+
# Check your account
|
|
100
|
+
aievaluator whoami
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
Now scaffold your project:
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
aievaluator init
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
This creates:
|
|
110
|
+
- `aievaluator.config.json` — project-local config
|
|
111
|
+
- `evals/smoke-test.json` — sample dataset with 3 queries
|
|
112
|
+
- Updates `.gitignore`
|
|
113
|
+
|
|
114
|
+
Open `evals/smoke-test.json` and replace the sample queries with your own:
|
|
115
|
+
|
|
116
|
+
```json
|
|
117
|
+
[
|
|
118
|
+
{"input": "What are your business hours?", "expected_output": "Mon-Fri 9am-6pm"},
|
|
119
|
+
{"input": "How do I cancel my order?", "expected_output": "Go to My Orders → Cancel"},
|
|
120
|
+
{"input": "Do you ship internationally?", "expected_output": "Yes, via DHL Express"}
|
|
121
|
+
]
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
Test it against the built-in agent:
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
aievaluator quick --dataset ./evals/smoke-test.json
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
---
|
|
131
|
+
|
|
132
|
+
### Level 3 — Evaluate your own agent
|
|
133
|
+
|
|
134
|
+
Point the CLI at your agent's endpoint:
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
aievaluator eval \
|
|
138
|
+
--agent https://chatbot-staging.acme.com/api/chat \
|
|
139
|
+
--dataset ./evals/smoke-test.json \
|
|
140
|
+
--metrics faithfulness,g_eval
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
The CLI calls your agent with each query, then an LLM judge scores the responses.
|
|
144
|
+
|
|
145
|
+
---
|
|
146
|
+
|
|
147
|
+
### Level 4 — Add quality gates
|
|
148
|
+
|
|
149
|
+
Not all metrics are equally important. Set different thresholds per metric:
|
|
150
|
+
|
|
151
|
+
```bash
|
|
152
|
+
aievaluator eval \
|
|
153
|
+
--agent https://chatbot-staging.acme.com/api/chat \
|
|
154
|
+
--dataset ./evals/smoke-test.json \
|
|
155
|
+
--thresholds faithfulness:0.90,g_eval:0.75
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
- `faithfulness` must be ≥ 90% (hallucination = instant fail)
|
|
159
|
+
- `g_eval` must be ≥ 75% (general quality)
|
|
160
|
+
|
|
161
|
+
If any metric fails to meet its threshold, that row is marked ❌.
|
|
162
|
+
|
|
163
|
+
**Or set one bar for everything:**
|
|
164
|
+
|
|
165
|
+
```bash
|
|
166
|
+
aievaluator eval \
|
|
167
|
+
--agent https://chatbot-staging.acme.com/api/chat \
|
|
168
|
+
--dataset ./evals/smoke-test.json \
|
|
169
|
+
--min-score 0.80
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
This works on `quick` too:
|
|
173
|
+
|
|
174
|
+
```bash
|
|
175
|
+
aievaluator quick "test prompt" --min-score 0.80
|
|
176
|
+
# Exit code 1 if any metric drops below 0.80
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
---
|
|
180
|
+
|
|
181
|
+
### Level 5 — Create your own evaluation criteria
|
|
182
|
+
|
|
183
|
+
Sometimes the built-in metrics aren't enough. Define a custom evaluator inline:
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
aievaluator eval \
|
|
187
|
+
--agent https://chatbot-staging.acme.com/api/chat \
|
|
188
|
+
--dataset ./evals/smoke-test.json \
|
|
189
|
+
--metrics politeness,g_eval \
|
|
190
|
+
--custom '{"name":"politeness","prompt":"Is the response polite and professional? Answer YES or NO and explain.","threshold":0.85}'
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
The custom evaluator `politeness` is defined in the request, referenced in `--metrics` by name,
|
|
194
|
+
and evaluated alongside `g_eval`. No dashboard needed.
|
|
195
|
+
|
|
196
|
+
**Custom evaluator with per-metric threshold override:**
|
|
197
|
+
|
|
198
|
+
```bash
|
|
199
|
+
aievaluator eval \
|
|
200
|
+
--agent $URL --dataset ./tests.json \
|
|
201
|
+
--metrics politeness,g_eval \
|
|
202
|
+
--custom '{"name":"politeness","prompt":"Is the tone friendly?","threshold":0.7}' \
|
|
203
|
+
--thresholds politeness:0.90,g_eval:0.80
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
The `--thresholds` flag overrides whatever was set in `--custom`. The engine uses the
|
|
207
|
+
per-evaluation value.
|
|
208
|
+
|
|
209
|
+
---
|
|
210
|
+
|
|
211
|
+
### Level 6 — CI/CD pipeline
|
|
212
|
+
|
|
213
|
+
Add this to your GitHub Actions, GitLab CI, or Jenkins:
|
|
214
|
+
|
|
215
|
+
```bash
|
|
216
|
+
aievaluator eval \
|
|
217
|
+
--agent $STAGING_AGENT \
|
|
218
|
+
--dataset ./evals/regression.json \
|
|
219
|
+
--thresholds faithfulness:0.90,g_eval:0.75 \
|
|
220
|
+
--min-score 0.80 \
|
|
221
|
+
--ci \
|
|
222
|
+
--format junit > report.xml
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
| Flag | What it does |
|
|
226
|
+
|---|---|
|
|
227
|
+
| `--ci` | No colors, no prompts — clean output for logs |
|
|
228
|
+
| `--format junit` | JUnit XML that CI systems understand natively |
|
|
229
|
+
| `--min-score 0.80` | Overall score must be ≥ 80% |
|
|
230
|
+
| `--thresholds` | Per-metric quality bars |
|
|
231
|
+
|
|
232
|
+
Exit code 1 = pipeline fails = deploy blocked.
|
|
233
|
+
|
|
234
|
+
**Environment variables for CI:**
|
|
235
|
+
|
|
236
|
+
```bash
|
|
237
|
+
export AIEVALUATOR_API_KEY="sk-..." # No hardcoded keys in YAML
|
|
238
|
+
export AIEVALUATOR_ENGINE_URL="https://api.aievaluator.dev"
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
---
|
|
242
|
+
|
|
243
|
+
## ๐ Complete Command Reference
|
|
244
|
+
|
|
245
|
+
### `aievaluator login`
|
|
246
|
+
|
|
247
|
+
```bash
|
|
248
|
+
aievaluator login # Interactive prompt
|
|
249
|
+
aievaluator login --api-key sk-xxx # Non-interactive (CI)
|
|
250
|
+
aievaluator login --engine-url https://custom.engine.com
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
### `aievaluator whoami`
|
|
254
|
+
|
|
255
|
+
```bash
|
|
256
|
+
aievaluator whoami
|
|
257
|
+
# Tenant: acme-corp
|
|
258
|
+
# Tier: pro
|
|
259
|
+
# Evals: 42/5000 this cycle
|
|
260
|
+
# Tokens: ↑124,800 · ↓89,200 this cycle
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
### `aievaluator quick`
|
|
264
|
+
|
|
265
|
+
```bash
|
|
266
|
+
# Single query
|
|
267
|
+
aievaluator quick "What is 2+2?" --expected "4"
|
|
268
|
+
|
|
269
|
+
# Per-metric thresholds
|
|
270
|
+
aievaluator quick "test" --metrics faithfulness:0.90,g_eval:0.75
|
|
271
|
+
|
|
272
|
+
# General threshold
|
|
273
|
+
aievaluator quick "test" --min-score 0.80
|
|
274
|
+
|
|
275
|
+
# From dataset (JSON or JSONL)
|
|
276
|
+
aievaluator quick --dataset ./tests.json
|
|
277
|
+
aievaluator quick --dataset ./tests.jsonl
|
|
278
|
+
|
|
279
|
+
# Custom judge model
|
|
280
|
+
aievaluator quick "test" --judge deepseek
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
### `aievaluator eval`
|
|
284
|
+
|
|
285
|
+
```bash
|
|
286
|
+
# Basic
|
|
287
|
+
aievaluator eval --agent $URL --dataset ./tests.json
|
|
288
|
+
|
|
289
|
+
# With quality gates
|
|
290
|
+
aievaluator eval --agent $URL --dataset ./tests.json \
|
|
291
|
+
--thresholds faithfulness:0.90,g_eval:0.75 --min-score 0.80
|
|
292
|
+
|
|
293
|
+
# Inline rows
|
|
294
|
+
aievaluator eval --agent $URL \
|
|
295
|
+
--rows '[{"input":"Hi","expected_output":"Hello"}]'
|
|
296
|
+
|
|
297
|
+
# Custom evaluator inline
|
|
298
|
+
aievaluator eval --agent $URL --dataset ./tests.json \
|
|
299
|
+
--metrics my-eval --custom '{"name":"my-eval","prompt":"...","threshold":0.8}'
|
|
300
|
+
|
|
301
|
+
# CI mode
|
|
302
|
+
aievaluator eval --agent $URL --dataset ./tests.json --ci --format junit
|
|
303
|
+
|
|
304
|
+
# Different agent format
|
|
305
|
+
aievaluator eval --agent $URL --dataset ./tests.json --agent-format claude
|
|
306
|
+
```
|
|
307
|
+
|
|
308
|
+
### `aievaluator config`
|
|
309
|
+
|
|
310
|
+
```bash
|
|
311
|
+
aievaluator config show
|
|
312
|
+
aievaluator config set default-metrics "faithfulness,g_eval"
|
|
313
|
+
aievaluator config set default-min-score 0.80
|
|
314
|
+
aievaluator config unset default-min-score
|
|
315
|
+
```
|
|
316
|
+
|
|
317
|
+
### `aievaluator init`
|
|
318
|
+
|
|
319
|
+
```bash
|
|
320
|
+
aievaluator init
|
|
321
|
+
# Creates aievaluator.config.json + evals/smoke-test.json + updates .gitignore
|
|
322
|
+
```
|
|
323
|
+
|
|
324
|
+
---
|
|
325
|
+
|
|
326
|
+
## ๐ Output Formats
|
|
327
|
+
|
|
328
|
+
### Table (default)
|
|
329
|
+
|
|
330
|
+
Human-readable table with scores, pass/fail icons, and token counts.
|
|
331
|
+
|
|
332
|
+
### JSON (`--format json`)
|
|
333
|
+
|
|
334
|
+
```bash
|
|
335
|
+
aievaluator eval ... --format json | jq '.overall_score'
|
|
336
|
+
```
|
|
337
|
+
|
|
338
|
+
Clean JSON on stdout. All logs/warnings go to stderr.
|
|
339
|
+
|
|
340
|
+
### JUnit XML (`--format junit`)
|
|
341
|
+
|
|
342
|
+
```bash
|
|
343
|
+
aievaluator eval ... --format junit > report.xml
|
|
344
|
+
```
|
|
345
|
+
|
|
346
|
+
Native CI integration. `<testcase>` per query, `<failure>` for queries below threshold.
|
|
347
|
+
|
|
348
|
+
---
|
|
349
|
+
|
|
350
|
+
## ๐ค VS Code Extension
|
|
351
|
+
|
|
352
|
+
Prefer staying in your editor? Install the [VS Code extension](https://marketplace.visualstudio.com/items?itemName=aievaluator.aievaluator).
|
|
353
|
+
|
|
354
|
+
- Select text → right-click → Evaluate
|
|
355
|
+
- Per-metric threshold editor with preset buttons
|
|
356
|
+
- Custom evaluator support via Command Palette
|
|
357
|
+
- Sidebar with evaluation history
|
|
358
|
+
- Dataset file evaluation (JSON + JSONL)
|
|
359
|
+
|
|
360
|
+
[Full VS Code tutorial →](../vscode/README.md)
|
|
361
|
+
|
|
362
|
+
---
|
|
363
|
+
|
|
364
|
+
## Requirements
|
|
365
|
+
|
|
366
|
+
- Python 3.10+
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
aievaluator/__init__.py,sha256=Qx3qP1jslumH1wgtyTm2pGCVIu0-RZRbeWzFn3GzSIU,98
|
|
2
|
+
aievaluator/cli.py,sha256=o7gC1yfIuqUf0gYVJSrf2ap1oBE2UmbF8Xx3r5RRLUc,21339
|
|
3
|
+
aievaluator/config.py,sha256=5hm0wwEebZhrsbzV-nM0YpsyqpgV9ILe5c3DlYAE_r0,3755
|
|
4
|
+
aievaluator/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
|
+
aievaluator/api/client.py,sha256=dG-uH_TiL-mSv5PKDmQN5kEscdnyn-f5ZZeiRjYhgAQ,6423
|
|
6
|
+
aievaluator/formatters/__init__.py,sha256=mLZD18yR0ioyxPDwnkzVy8wsG9vULw-BqNsQUfJPxpA,201
|
|
7
|
+
aievaluator/formatters/json.py,sha256=MQ681J8CVtRzldfmWw6MlwbU7jqIKrDIXJk2gHm8F2I,910
|
|
8
|
+
aievaluator/formatters/junit.py,sha256=iwDbnYdDdr87UWtPGLUArQMYyBb6zfuQ_OM1pgccBB0,1552
|
|
9
|
+
aievaluator/formatters/table.py,sha256=OXou3cuev2PRZSQcC4MTU0l8qllY2j-kTzOvsGKEpkY,2197
|
|
10
|
+
aievaluator-1.0.1.dist-info/METADATA,sha256=OsVHIgncs6cxs3WT7iV3tXvX88YWxxSNEbd77tuPfsE,9885
|
|
11
|
+
aievaluator-1.0.1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
12
|
+
aievaluator-1.0.1.dist-info/entry_points.txt,sha256=9h3s8XUIa2RZ9m-2Bh3e78elTimvVglfU7LvA9gxY4k,53
|
|
13
|
+
aievaluator-1.0.1.dist-info/top_level.txt,sha256=IVDZHg6brLn7wBoRdLH_mbjntqKoEppiF_dZ_VVLL2E,12
|
|
14
|
+
aievaluator-1.0.1.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
aievaluator
|