evalgrid 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
evalgrid/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """evalgrid — open-source AI agent evaluation framework."""
2
+
3
+ __version__ = "0.1.0"
evalgrid/__main__.py ADDED
@@ -0,0 +1,4 @@
1
+ from evalgrid.cli.main import cli
2
+
3
+ if __name__ == "__main__":
4
+ cli()
File without changes
evalgrid/api/app.py ADDED
@@ -0,0 +1,442 @@
1
+ """FastAPI application — eval runner + results API."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import logging
7
+ import os
8
+ import re
9
+ import uuid
10
+ from contextlib import asynccontextmanager
11
+ from datetime import datetime, timezone
12
+ from pathlib import Path
13
+ from typing import Any
14
+
15
+ import yaml
16
+ from fastapi import BackgroundTasks, Depends, FastAPI, HTTPException, Request
17
+ from fastapi.middleware.cors import CORSMiddleware
18
+ from fastapi.responses import JSONResponse
19
+ from fastapi.staticfiles import StaticFiles
20
+ from pydantic import BaseModel, Field
21
+ from sqlalchemy import select
22
+ from sqlalchemy.ext.asyncio import AsyncSession
23
+
24
+ from evalgrid.core import EvalConfig, ScenarioLoader
25
+ from evalgrid.core.models import PassFail
26
+ from evalgrid.core.runner import ScenarioRunner
27
+
28
+ from .database import RunRecord, ScenarioResultRecord, get_db, init_db
29
+
30
+ logger = logging.getLogger("evalgrid")
31
+
32
+ SCENARIOS_BASE_DIR = Path.home() / ".evalgrid" / "scenarios"
33
+
34
+ _API_KEY_RE = re.compile(
35
+ r"(?:sk-ant-|sk-|pypi-)[A-Za-z0-9\-_]{6,}"
36
+ r"|AIza[A-Za-z0-9\-_]{30,}"
37
+ )
38
+
39
+
40
+ def _sanitize_error(msg: str) -> str:
41
+ return _API_KEY_RE.sub("[REDACTED]", msg)
42
+
43
+
44
+ def _iso(dt: datetime | None) -> str | None:
45
+ """Return ISO-8601 string with explicit UTC offset so browsers parse it correctly."""
46
+ if dt is None:
47
+ return None
48
+ if dt.tzinfo is None:
49
+ dt = dt.replace(tzinfo=timezone.utc)
50
+ return dt.isoformat()
51
+
52
+
53
@asynccontextmanager
async def lifespan(app: FastAPI):  # noqa: ARG001
    """Application lifespan hook: prepare the database, then close out orphaned runs.

    Every run row that still has no ``finished_at`` at startup is stamped as
    finished with a zero pass rate and a "Server restarted" error entry.
    The cleanup is best-effort: any failure is logged and startup continues.
    """
    await init_db()

    from .database import SessionLocal

    try:
        async with SessionLocal() as session:
            unfinished = select(RunRecord).where(RunRecord.finished_at.is_(None))
            orphaned = (await session.execute(unfinished)).scalars().all()
            for run in orphaned:
                run.finished_at = datetime.now(timezone.utc)
                # Keep whatever counters were already stored; only fill in missing ones.
                run.total = run.total or 0
                run.passed = run.passed or 0
                run.pass_rate = 0.0
                run.results = [{"error": "Server restarted"}]
                logger.warning("Marked stale run %s as failed (server restart)", run.run_id)
            if orphaned:
                await session.commit()
    except Exception as exc:
        logger.warning("Stale run cleanup skipped: %s", exc)

    yield
78
+
79
+
80
+ app = FastAPI(title="evalgrid API", version="0.1.0", lifespan=lifespan)
81
+
82
+
83
@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception) -> JSONResponse:
    """Turn any unhandled exception into a JSON 500 response.

    The exception text is passed through ``_sanitize_error`` once and that
    redacted form is used for BOTH the log record and the response body, so
    key-like tokens embedded in an error message reach neither the client
    nor the server logs.
    """
    sanitized = _sanitize_error(str(exc))
    logger.error("Unhandled exception on %s: %s", request.url.path, sanitized)
    return JSONResponse(status_code=500, content={"detail": f"Internal server error: {sanitized}"})
88
+
89
+
90
# CORS: browser access is limited to localhost / 127.0.0.1 on ports 5173 and 8000.
app.add_middleware(
    CORSMiddleware,
    allow_origins=[
        f"http://{host}:{port}"
        for host in ("localhost", "127.0.0.1")
        for port in (5173, 8000)
    ],
    allow_methods=["*"],
    allow_headers=["*"],
)
101
+
102
+
103
+ # ── Schema ────────────────────────────────────────────────────────────────────
104
+
105
class RunRequest(BaseModel):
    # Request body for POST /api/runs.

    # Directory to load scenarios from, resolved relative to SCENARIOS_BASE_DIR.
    scenarios_dir: str = ""
    # When non-empty, keep only scenarios carrying at least one of these tags.
    tags: list[str] = []
    # When non-empty, keep only scenarios whose id is listed here.
    scenario_ids: list[str] = []
    # Run with the mock scorer and a canned agent output instead of a real provider.
    mock: bool = False
    # Overrides the configured judge model when set.
    judge_model: str | None = None
111
+
112
+
113
class RunSummary(BaseModel):
    # Summary of one run as returned by the /api/runs endpoints.

    run_id: str
    # ISO-8601 timestamps; finished_at stays None while the run is in progress.
    started_at: str
    finished_at: str | None = None
    pass_rate: float
    passed: int
    total: int
    # "running" or "completed", as set by the route handlers.
    status: str
    scenario_names: list[str] = []
    # Scenarios the loader skipped for this run, and how many there were.
    skipped_count: int = 0
    skipped_scenarios: list[dict[str, str]] = []
124
+
125
+
126
class ScenarioCreate(BaseModel):
    # Request body for POST /api/scenarios; the text fields are length-capped.

    name: str = Field(..., max_length=200)
    description: str = Field(default="", max_length=1000)
    agent_type: str = "generic"
    prompt: str = Field(..., max_length=16000)
    test_input: str = Field(..., max_length=8000)
    # When empty, the create endpoint substitutes a single "complete_task" action.
    expected_actions: list[dict[str, Any]] = []
    tags: list[str] = []
    # Target directory, resolved relative to SCENARIOS_BASE_DIR.
    scenarios_dir: str = ""
135
+
136
+
137
def _validate_path(scenarios_dir: str) -> Path:
    """Resolve *scenarios_dir* under ``SCENARIOS_BASE_DIR`` and return the result.

    Raises:
        HTTPException: 400 when the input contains ``..`` or the resolved
            path is not located inside the base directory.
    """

    def _outside() -> HTTPException:
        return HTTPException(status_code=400, detail="Path outside allowed scenarios directory")

    root = SCENARIOS_BASE_DIR.resolve()
    # Cheap textual pre-check before the authoritative resolved-path check below.
    if ".." in scenarios_dir:
        raise _outside()
    candidate = (root / scenarios_dir).resolve()
    if candidate.is_relative_to(root):
        return candidate
    raise _outside()
147
+
148
+
149
+ # ── Routes ────────────────────────────────────────────────────────────────────
150
+
151
@app.get("/health")
async def health() -> dict[str, str]:
    # Liveness probe: always answers with a constant payload.
    payload = {"status": "ok"}
    return payload
154
+
155
+
156
@app.get("/api/config-status")
async def config_status() -> dict[str, Any]:
    # Reports only WHETHER each provider key is set in the environment, never its value.
    has_anthropic, has_openai = (
        bool(os.environ.get(name)) for name in ("ANTHROPIC_API_KEY", "OPENAI_API_KEY")
    )
    return {
        "anthropic_key_set": has_anthropic,
        "openai_key_set": has_openai,
        "any_key_set": has_anthropic or has_openai,
        "mock_available": True,
    }
166
+
167
+
168
@app.post("/api/runs", response_model=RunSummary)
async def create_run(
    req: RunRequest,
    background: BackgroundTasks,
    db: AsyncSession = Depends(get_db),
) -> RunSummary:
    # Registers a new run row, schedules its execution as a background task and
    # immediately returns a "running" summary.

    # Called for its check only: raises 400 when the directory escapes the base.
    _validate_path(req.scenarios_dir)

    provider_keys = ("ANTHROPIC_API_KEY", "OPENAI_API_KEY")
    if not (req.mock or any(os.environ.get(name) for name in provider_keys)):
        raise HTTPException(status_code=400, detail="No provider API key found in environment (set ANTHROPIC_API_KEY or OPENAI_API_KEY)")

    new_run_id = str(uuid.uuid4())
    record = RunRecord(run_id=new_run_id, started_at=datetime.now(timezone.utc), results=[])
    db.add(record)
    await db.commit()

    background.add_task(_execute_run, new_run_id, req)

    summary = RunSummary(
        run_id=new_run_id,
        started_at=_iso(record.started_at),
        pass_rate=0.0,
        passed=0,
        total=0,
        status="running",
    )
    return summary
198
+
199
+
200
@app.get("/api/runs", response_model=list[RunSummary])
async def list_runs(db: AsyncSession = Depends(get_db)) -> list[RunSummary]:
    # Summaries of the 100 most recently started runs, newest first.
    latest_runs = select(RunRecord).order_by(RunRecord.started_at.desc()).limit(100)
    records = (await db.execute(latest_runs)).scalars().all()

    # One extra query collects the scenario names of all listed runs at once.
    names_query = select(
        ScenarioResultRecord.run_id, ScenarioResultRecord.scenario_name
    ).where(ScenarioResultRecord.run_id.in_([rec.run_id for rec in records]))
    names_by_run: dict[str, list[str]] = {}
    for owner_id, name in await db.execute(names_query):
        names_by_run.setdefault(owner_id, []).append(name)

    summaries = []
    for rec in records:
        skipped = rec.skipped_scenarios or []
        summaries.append(
            RunSummary(
                run_id=rec.run_id,
                started_at=_iso(rec.started_at),
                finished_at=_iso(rec.finished_at),
                pass_rate=rec.pass_rate or 0.0,
                passed=rec.passed or 0,
                total=rec.total or 0,
                status="completed" if rec.finished_at else "running",
                scenario_names=names_by_run.get(rec.run_id, []),
                skipped_count=len(skipped),
                skipped_scenarios=skipped,
            )
        )
    return summaries
229
+
230
+
231
@app.get("/api/runs/{run_id}")
async def get_run(run_id: str, db: AsyncSession = Depends(get_db)) -> dict[str, Any]:
    # Full detail for one run: its summary fields plus every stored scenario result.
    record = (
        await db.execute(select(RunRecord).where(RunRecord.run_id == run_id))
    ).scalar_one_or_none()
    if not record:
        raise HTTPException(status_code=404, detail="Run not found")

    stored_results = (
        await db.execute(
            select(ScenarioResultRecord).where(ScenarioResultRecord.run_id == run_id)
        )
    ).scalars().all()

    # Columns copied verbatim from each result row, in output order.
    result_fields = (
        "scenario_id",
        "scenario_name",
        "agent_type",
        "status",
        "overall_score",
        "pass_rate",
        "reasoning",
        "error",
        "steps",
    )
    return {
        "run_id": record.run_id,
        "started_at": _iso(record.started_at),
        "finished_at": _iso(record.finished_at),
        "pass_rate": record.pass_rate,
        "passed": record.passed,
        "total": record.total,
        "status": "completed" if record.finished_at else "running",
        "scenario_results": [
            {field: getattr(item, field) for field in result_fields}
            for item in stored_results
        ],
    }
266
+
267
+
268
@app.post("/api/scenarios")
async def create_scenario(
    req: ScenarioCreate,
) -> dict[str, Any]:
    # Writes a new scenario as "<id>.yml" inside the validated scenarios directory.
    #
    # The id is a slug of the name: lower-cased, with every run of characters
    # outside [a-z0-9] collapsed to "-" and leading/trailing "-" removed.
    #
    # Responses: 400 when the name yields an empty slug or the directory is
    # outside the allowed base; 409 when a scenario file with that id exists.
    scenario_id = re.sub(r"[^a-z0-9]+", "-", req.name.lower()).strip("-")
    if not scenario_id:
        # e.g. "!!!" or a purely non-ASCII name — would otherwise create a file named ".yml".
        raise HTTPException(
            status_code=400,
            detail="Scenario name must contain at least one ASCII letter or digit",
        )

    actions = req.expected_actions or [
        {"action": "complete_task", "required": True, "description": "Agent completes the task correctly"}
    ]

    scenario_data = {
        "id": scenario_id,
        "name": req.name,
        "description": req.description,
        "agent_type": req.agent_type,
        "prompt": req.prompt,
        "test_input": req.test_input,
        "expected_actions": actions,
        "tags": req.tags,
    }

    scen_dir = _validate_path(req.scenarios_dir)
    scen_dir.mkdir(parents=True, exist_ok=True)

    path = scen_dir / f"{scenario_id}.yml"
    try:
        # Mode "x" creates the file atomically and fails if it already exists, so
        # two concurrent requests cannot both pass an exists() check and overwrite
        # each other. UTF-8 is explicit because allow_unicode=True emits raw
        # non-ASCII characters that a locale-dependent default encoding may reject.
        with open(path, "x", encoding="utf-8") as f:
            yaml.dump(scenario_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
    except FileExistsError:
        raise HTTPException(
            status_code=409, detail=f"Scenario with id '{scenario_id}' already exists"
        ) from None

    return {
        "id": scenario_id,
        "name": req.name,
        "description": req.description,
        "agent_type": req.agent_type,
        "tags": req.tags,
        "step_count": len(actions),
    }
306
+
307
+
308
@app.get("/api/scenarios")
async def list_scenarios(scenarios_dir: str = "") -> list[dict[str, Any]]:
    # Lists the scenarios found recursively under the validated directory.
    # Files whose name starts with "_" are ignored; files that fail to load are
    # logged and left out of the listing.
    root = _validate_path(scenarios_dir)
    if not root.exists():
        return []

    loader = ScenarioLoader()
    listing = []
    # All *.yml files first (sorted), then all *.yaml files (sorted).
    candidates = [*sorted(root.rglob("*.yml")), *sorted(root.rglob("*.yaml"))]
    for file in candidates:
        if file.name.startswith("_"):
            continue
        try:
            scenario = loader.load_file(file)
            parts = file.relative_to(root).parts
            entry = {
                "id": scenario.id,
                "name": scenario.name,
                "description": scenario.description,
                "agent_type": scenario.agent_type.value,
                "tags": scenario.tags,
                "step_count": len(scenario.expected_actions),
                # First sub-directory names the suite; files at the top level are "default".
                "suite": parts[0] if len(parts) > 1 else "default",
            }
        except Exception as e:
            logger.warning("Failed to load scenario %s: %s", file, e)
        else:
            listing.append(entry)
    return listing
334
+
335
+
336
+ # ── Background task ────────────────────────────────────────────────────────────
337
+
338
async def _mark_run_failed(run_id: str, error_message: str) -> None:
    """Close the run *run_id* as failed, recording *error_message* in its results.

    Counters are reset to zero and ``finished_at`` is set to the current UTC
    time. An unknown *run_id* is silently ignored.
    """
    from .database import SessionLocal

    async with SessionLocal() as session:
        lookup = await session.execute(select(RunRecord).where(RunRecord.run_id == run_id))
        run = lookup.scalar_one_or_none()
        if not run:
            return
        run.finished_at = datetime.now(timezone.utc)
        run.total = 0
        run.passed = 0
        run.pass_rate = 0.0
        run.results = [{"error": error_message}]
        await session.commit()
350
+
351
+
352
async def _execute_run(run_id: str, req: RunRequest) -> None:
    """Background task: run the selected scenarios and persist the outcome.

    Steps:
      1. Load scenarios from the requested directory and record any the
         loader skipped on the run row.
      2. Filter by ``req.tags`` (any match) and ``req.scenario_ids``.
      3. Run each scenario in the default executor, storing one
         ``ScenarioResultRecord`` per scenario as soon as it finishes.
      4. Write the final counters and ``finished_at`` on the run row.

    The whole procedure is capped at 1800 seconds. On timeout or any other
    exception the run is closed via ``_mark_run_failed``; nothing is raised
    to the caller.
    """
    from .database import SessionLocal

    config = EvalConfig()
    if req.judge_model:
        config.judge.model = req.judge_model
    loader = ScenarioLoader()
    scen_dir = (SCENARIOS_BASE_DIR.resolve() / req.scenarios_dir).resolve()

    if not scen_dir.exists():
        await _mark_run_failed(run_id, f"Scenarios directory not found: {scen_dir}")
        return

    async def _run_with_timeout() -> None:
        scenarios = loader.load_dir(scen_dir)

        if loader.skipped:
            async with SessionLocal() as db:
                row = await db.execute(select(RunRecord).where(RunRecord.run_id == run_id))
                record = row.scalar_one_or_none()
                if record:
                    record.skipped_scenarios = loader.skipped
                    await db.commit()

        if req.tags:
            scenarios = [s for s in scenarios if any(t in s.tags for t in req.tags)]
        if req.scenario_ids:
            wanted_ids = set(req.scenario_ids)  # built once for O(1) membership tests
            scenarios = [s for s in scenarios if s.id in wanted_ids]

        if req.mock:
            from evalgrid.core.scorer import MockScorer
            runner = ScenarioRunner(
                config,
                scorer=MockScorer(),
                agent_fn=lambda s: ("[mock agent output]", 0),
            )
        else:
            runner = ScenarioRunner(config)

        loop = asyncio.get_running_loop()
        results = []
        for scenario in scenarios:
            # run_scenario is synchronous; the executor keeps the event loop responsive.
            result = await loop.run_in_executor(None, runner.run_scenario, scenario)
            results.append(result)

            # Persist each result immediately so partial progress is visible.
            async with SessionLocal() as db:
                sr = ScenarioResultRecord(
                    run_id=run_id,
                    scenario_id=result.scenario_id,
                    scenario_name=result.scenario_name,
                    agent_type=result.agent_type.value,
                    status=result.status.value,
                    overall_score=result.overall_score,
                    pass_rate=result.pass_rate,
                    reasoning=result.reasoning,
                    error=result.error,
                    steps=[s.model_dump(mode="json") for s in result.steps],
                    started_at=result.started_at,
                    finished_at=result.finished_at,
                )
                db.add(sr)
                await db.commit()

        passed = sum(1 for r in results if r.status == PassFail.PASS)
        total = len(results)

        async with SessionLocal() as db:
            row = await db.execute(select(RunRecord).where(RunRecord.run_id == run_id))
            record = row.scalar_one_or_none()
            if record:
                record.finished_at = datetime.now(timezone.utc)
                record.passed = passed
                record.total = total
                record.pass_rate = passed / total if total else 0.0
                await db.commit()

    try:
        await asyncio.wait_for(_run_with_timeout(), timeout=1800)
    except asyncio.TimeoutError:
        # asyncio.TimeoutError is only an alias of the builtin TimeoutError from
        # Python 3.11 on; catching the asyncio name works on every version.
        logger.error("Run %s timed out after 1800s", run_id)
        await _mark_run_failed(run_id, "Run timed out (30 minute limit exceeded)")
    except Exception as exc:
        message = _sanitize_error(str(exc))
        # Log the redacted message so the failure is visible server-side too.
        logger.error("Run %s failed: %s", run_id, message)
        await _mark_run_failed(run_id, message)
435
+
436
+
437
# Serve React dashboard — evalgrid/static/ (wheel) or dashboard/dist/ (dev).
# The first candidate directory that exists is mounted at "/"; if neither
# exists, nothing is mounted.
_here = Path(__file__)
for _dashboard in (
    _here.parent.parent / "static",
    _here.parent.parent.parent / "dashboard" / "dist",
):
    if _dashboard.exists():
        app.mount("/", StaticFiles(directory=_dashboard, html=True), name="static")
        break
@@ -0,0 +1,78 @@
1
+ """Database setup — SQLite (dev) / Postgres (prod)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from pathlib import Path
7
+
8
+ logger = logging.getLogger("evalgrid")
9
+
10
+ from sqlalchemy import JSON, Column, DateTime, Float, ForeignKey, Integer, String, Text, func, text
11
+ from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
12
+ from sqlalchemy.orm import DeclarativeBase
13
+
14
+ from evalgrid.core.paths import default_database_url
15
+
16
+ DATABASE_URL = default_database_url()
17
+
18
+ engine = create_async_engine(DATABASE_URL, echo=False)
19
+ SessionLocal = async_sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)
20
+
21
+
22
class Base(DeclarativeBase):
    """Declarative base shared by the ORM models defined in this module."""
24
+
25
+
26
class RunRecord(Base):
    """One evaluation run (table ``runs``)."""

    __tablename__ = "runs"

    # Surrogate key; ``run_id`` is the unique public identifier.
    id = Column(Integer, primary_key=True, autoincrement=True)
    run_id = Column(String, unique=True, index=True)

    # ``finished_at`` stays NULL until the run has been closed.
    started_at = Column(DateTime, default=func.now())
    finished_at = Column(DateTime, nullable=True)

    # Aggregate outcome counters.
    pass_rate = Column(Float, default=0.0)
    passed = Column(Integer, default=0)
    total = Column(Integer, default=0)

    # JSON payloads.
    results = Column(JSON, default=list)
    skipped_scenarios = Column(JSON, nullable=True)
38
+
39
+
40
class ScenarioResultRecord(Base):
    """Result of one scenario within a run (table ``scenario_results``)."""

    __tablename__ = "scenario_results"

    id = Column(Integer, primary_key=True, autoincrement=True)
    # Links the result to its owning row in ``runs`` via the public run id.
    run_id = Column(String, ForeignKey("runs.run_id"), index=True)

    # Scenario identity.
    scenario_id = Column(String, index=True)
    scenario_name = Column(String)
    agent_type = Column(String)

    # Outcome.
    status = Column(String)
    overall_score = Column(Float)
    pass_rate = Column(Float)
    reasoning = Column(Text, default="")
    error = Column(Text, nullable=True)
    steps = Column(JSON, default=list)

    # Timing.
    started_at = Column(DateTime)
    finished_at = Column(DateTime, nullable=True)
56
+
57
+
58
def _add_skipped_scenarios_column(sync_conn) -> None:
    """Add ``runs.skipped_scenarios`` when an existing table does not have it yet."""
    # Local import: ``inspect`` is not part of this module's top-level imports.
    from sqlalchemy import inspect

    existing = {column["name"] for column in inspect(sync_conn).get_columns("runs")}
    if "skipped_scenarios" not in existing:
        sync_conn.execute(text("ALTER TABLE runs ADD COLUMN skipped_scenarios JSON DEFAULT '[]'"))


async def init_db() -> None:
    """Initialize the database, creating parent directories as needed.

    For file-based SQLite URLs the parent directory of the database file is
    created first. All tables are then created, and finally the
    ``skipped_scenarios`` column is added to ``runs`` if it is missing.

    The column migration runs in its own transaction and only when the column
    is actually absent. A failed ``ALTER TABLE`` therefore cannot poison the
    table-creation transaction, and the check does not depend on a
    backend-specific "duplicate column" error message. The migration remains
    best-effort: a failure is logged, not raised.
    """
    url_str = str(engine.url)
    if url_str.startswith("sqlite"):
        db_path_str = url_str.split(":///", 1)[-1]
        if db_path_str and db_path_str != ":memory:":
            db_path = Path(db_path_str).expanduser()
            db_path.parent.mkdir(parents=True, exist_ok=True)

    async with engine.begin() as conn:
        await conn.run_sync(Base.metadata.create_all)

    try:
        async with engine.begin() as conn:
            await conn.run_sync(_add_skipped_scenarios_column)
    except Exception as e:
        logger.warning("Migration skipped_scenarios failed: %s", e)
74
+
75
+
76
async def get_db():
    """Yield a session from ``SessionLocal`` and close it when the consumer is done."""
    session = SessionLocal()
    async with session:
        yield session
@@ -0,0 +1,3 @@
1
+ from evalgrid.cli.main import cli as main
2
+
3
+ __all__ = ["main"]
File without changes
@@ -0,0 +1,139 @@
1
+ """evalgrid init — scaffold .evalgrid/ in the current project."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import shutil
6
+ from pathlib import Path
7
+
8
+ import click
9
+ from rich.console import Console
10
+ from rich.panel import Panel
11
+
12
+ console = Console(legacy_windows=False)
13
+
14
+ CONFIG_TEMPLATE = """\
15
+ # evalgrid configuration
16
+ model: claude-sonnet-4-6
17
+ max_concurrent: 5
18
+ timeout_seconds: 120
19
+ retry_on_error: true
20
+ max_retries: 2
21
+ output_dir: .evalgrid/results
22
+ scenarios_dir: .evalgrid/scenarios
23
+ # Database path: ~/.evalgrid/results.db (override via DATABASE_URL env var)
24
+
25
+ # Choose your judge model (used to score agent responses)
26
+ judge:
27
+ # Anthropic (default)
28
+ provider: anthropic
29
+ model: claude-sonnet-4-6
30
+ api_key_env: ANTHROPIC_API_KEY
31
+
32
+ # OpenAI alternative:
33
+ # provider: openai
34
+ # model: gpt-4o
35
+ # api_key_env: OPENAI_API_KEY
36
+
37
+ # Google alternative:
38
+ # provider: google
39
+ # model: gemini/gemini-1.5-pro
40
+ # api_key_env: GEMINI_API_KEY
41
+
42
+ # Local Ollama (no API key needed):
43
+ # provider: ollama
44
+ # model: ollama/llama3.1:70b
45
+ # api_key_env: ""
46
+
47
+ # OpenRouter (200+ models, one key):
48
+ # provider: openrouter
49
+ # model: openrouter/anthropic/claude-sonnet-4
50
+ # api_key_env: OPENROUTER_API_KEY
51
+ """
52
+
53
+ EXAMPLE_SCENARIO = """\
54
+ id: example-hello-world
55
+ name: Hello World
56
+ description: Verify the agent responds politely to a greeting
57
+ agent_type: generic
58
+ prompt: |
59
+ You are a helpful AI assistant. Respond to user messages in a friendly,
60
+ concise way. Always greet the user back and offer to help.
61
+ test_input: "Hello! Can you help me today?"
62
+ expected_actions:
63
+ - action: greet_user
64
+ required: true
65
+ description: Agent greets the user back
66
+ - action: offer_assistance
67
+ required: true
68
+ description: Agent offers to help
69
+ - action: ask_followup
70
+ required: false
71
+ description: Agent asks what the user needs help with
72
+ success_criteria: >
73
+ The agent should acknowledge the greeting warmly and clearly offer assistance.
74
+ The tone should be friendly and professional.
75
+ tags:
76
+ - greeting
77
+ - basic
78
+ timeout_seconds: 30
79
+ """
80
+
81
+ GITIGNORE_ADDITION = """
82
+ # evalgrid
83
+ .evalgrid/results/
84
+ .evalgrid/results.db
85
+ """
86
+
87
+
88
# `evalgrid init`: creates .evalgrid/{scenarios,results}, writes config.yml,
# optionally writes the example scenario, and appends ignore rules to an
# existing .gitignore.
#
# --with-examples/--no-examples is a proper boolean flag pair: the previous
# `is_flag=True, default=True` form could never be switched off.
@click.command()
@click.option("--dir", "target_dir", default=".", help="Project directory to initialize")
@click.option(
    "--with-examples/--no-examples",
    default=True,
    help="Copy example scenarios",
)
def init(target_dir: str, with_examples: bool) -> None:
    """Scaffold a .evalgrid/ folder in your project."""
    root = Path(target_dir).resolve()
    evalgrid_dir = root / ".evalgrid"

    if evalgrid_dir.exists():
        console.print("[yellow]! .evalgrid/ already exists.[/yellow]")
        if not click.confirm("Reinitialize?", default=False):
            return

    # Create directory structure
    scenarios_path = evalgrid_dir / "scenarios"
    scenarios_path.mkdir(parents=True, exist_ok=True)
    (evalgrid_dir / "results").mkdir(parents=True, exist_ok=True)

    # Write config (explicit UTF-8 so the result does not depend on the locale)
    config_path = evalgrid_dir / "config.yml"
    config_path.write_text(CONFIG_TEMPLATE, encoding="utf-8")

    # Write example scenario
    if with_examples:
        example_path = scenarios_path / "example-hello-world.yml"
        example_path.write_text(EXAMPLE_SCENARIO, encoding="utf-8")

    # Append to .gitignore if it exists. The file is only read to check for the
    # marker and is then opened in append mode, so its existing bytes are never
    # re-encoded or rewritten.
    gitignore = root / ".gitignore"
    if gitignore.exists():
        content = gitignore.read_text(encoding="utf-8", errors="replace")
        if ".evalgrid/results" not in content:
            with gitignore.open("a", encoding="utf-8") as fh:
                fh.write(GITIGNORE_ADDITION)

    console.print(
        Panel.fit(
            "[bold green]evalgrid initialized![/bold green]\n\n"
            f"[dim]Config:[/dim] [cyan]{config_path.relative_to(root)}[/cyan]\n"
            f"[dim]Scenarios:[/dim] [cyan]{scenarios_path.relative_to(root)}[/cyan]\n\n"
            "[bold]Next steps:[/bold]\n\n"
            " 1. Try it now (no API key required):\n"
            " [cyan]evalgrid run --mock[/cyan]\n\n"
            " 2. To run with real AI scoring, set your API key:\n"
            " [cyan]Linux/macOS:[/cyan] export ANTHROPIC_API_KEY=sk-ant-...\n"
            " [cyan]Windows:[/cyan] $env:ANTHROPIC_API_KEY = \"sk-ant-...\"\n\n"
            " 3. Review the example scenario:\n"
            " [cyan].evalgrid/scenarios/example-hello-world.yml[/cyan]\n\n"
            " 4. Open the dashboard:\n"
            " [cyan]evalgrid server[/cyan] -> http://localhost:8000",
            title="evalgrid",
            border_style="green",
        )
    )