pytest-llm-sushit 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. pytest_llm_sushit-0.1.0/.env.example +12 -0
  2. pytest_llm_sushit-0.1.0/.gitignore +57 -0
  3. pytest_llm_sushit-0.1.0/Dockerfile +14 -0
  4. pytest_llm_sushit-0.1.0/PKG-INFO +30 -0
  5. pytest_llm_sushit-0.1.0/README.md +122 -0
  6. pytest_llm_sushit-0.1.0/app/__init__.py +0 -0
  7. pytest_llm_sushit-0.1.0/app/api/__init__.py +0 -0
  8. pytest_llm_sushit-0.1.0/app/api/dashboard.py +70 -0
  9. pytest_llm_sushit-0.1.0/app/api/eval.py +142 -0
  10. pytest_llm_sushit-0.1.0/app/api/trace.py +71 -0
  11. pytest_llm_sushit-0.1.0/app/config.py +20 -0
  12. pytest_llm_sushit-0.1.0/app/database.py +26 -0
  13. pytest_llm_sushit-0.1.0/app/main.py +116 -0
  14. pytest_llm_sushit-0.1.0/app/models/__init__.py +0 -0
  15. pytest_llm_sushit-0.1.0/app/models/eval.py +78 -0
  16. pytest_llm_sushit-0.1.0/app/models/trace.py +56 -0
  17. pytest_llm_sushit-0.1.0/app/services/__init__.py +0 -0
  18. pytest_llm_sushit-0.1.0/app/services/eval_runner.py +165 -0
  19. pytest_llm_sushit-0.1.0/app/services/providers/__init__.py +0 -0
  20. pytest_llm_sushit-0.1.0/app/services/providers/base.py +50 -0
  21. pytest_llm_sushit-0.1.0/app/services/providers/gemini.py +45 -0
  22. pytest_llm_sushit-0.1.0/app/services/providers/groq.py +46 -0
  23. pytest_llm_sushit-0.1.0/app/services/providers/mistral.py +41 -0
  24. pytest_llm_sushit-0.1.0/app/services/providers/openrouter.py +55 -0
  25. pytest_llm_sushit-0.1.0/app/services/providers/registry.py +82 -0
  26. pytest_llm_sushit-0.1.0/app/services/scoring.py +56 -0
  27. pytest_llm_sushit-0.1.0/app/services/trace_collector.py +109 -0
  28. pytest_llm_sushit-0.1.0/app/templates/404.html +9 -0
  29. pytest_llm_sushit-0.1.0/app/templates/base.html +71 -0
  30. pytest_llm_sushit-0.1.0/app/templates/dashboard.html +72 -0
  31. pytest_llm_sushit-0.1.0/app/templates/eval_datasets.html +34 -0
  32. pytest_llm_sushit-0.1.0/app/templates/eval_detail.html +29 -0
  33. pytest_llm_sushit-0.1.0/app/templates/eval_run_detail.html +48 -0
  34. pytest_llm_sushit-0.1.0/app/templates/eval_runs.html +36 -0
  35. pytest_llm_sushit-0.1.0/app/templates/trace_detail.html +54 -0
  36. pytest_llm_sushit-0.1.0/app/templates/traces.html +30 -0
  37. pytest_llm_sushit-0.1.0/docker-compose.yml +10 -0
  38. pytest_llm_sushit-0.1.0/examples/conftest.py +25 -0
  39. pytest_llm_sushit-0.1.0/examples/test_example.py +101 -0
  40. pytest_llm_sushit-0.1.0/pyproject.toml +48 -0
  41. pytest_llm_sushit-0.1.0/requirements.txt +12 -0
  42. pytest_llm_sushit-0.1.0/scripts/seed_data.py +131 -0
  43. pytest_llm_sushit-0.1.0/src/pytest_llm/__init__.py +30 -0
  44. pytest_llm_sushit-0.1.0/src/pytest_llm/assertions.py +241 -0
  45. pytest_llm_sushit-0.1.0/src/pytest_llm/config.py +38 -0
  46. pytest_llm_sushit-0.1.0/src/pytest_llm/judge.py +197 -0
  47. pytest_llm_sushit-0.1.0/src/pytest_llm/plugin.py +79 -0
  48. pytest_llm_sushit-0.1.0/src/pytest_llm/reporter.py +82 -0
  49. pytest_llm_sushit-0.1.0/tests/__init__.py +0 -0
  50. pytest_llm_sushit-0.1.0/tests/conftest.py +47 -0
  51. pytest_llm_sushit-0.1.0/tests/test_assertions.py +136 -0
  52. pytest_llm_sushit-0.1.0/tests/test_judge.py +142 -0
@@ -0,0 +1,12 @@
1
+ # LLM Providers
2
+ GROQ_API_KEY=gsk_your_key_here
3
+ MISTRAL_API_KEY=your_key_here
4
+ GEMINI_API_KEY=your_key_here
5
+ OPENROUTER_API_KEY=sk-or-your_key_here
6
+
7
+ # Database
8
+ DATABASE_URL=sqlite+aiosqlite:///./data/reliability.db
9
+
10
+ # App
11
+ APP_ENV=development
12
+ SECRET_KEY=change-me-in-production
@@ -0,0 +1,57 @@
1
+
2
+
3
+ # Environment
4
+ .env
5
+ data/*.db
6
+
7
+ # Python
8
+
9
+ __pycache__/
10
+ *.py[cod]
11
+ *$py.class
12
+ *.egg-info/
13
+ dist/
14
+ build/
15
+ .eggs/
16
+
17
+ venv/
18
+ .venv/
19
+
20
+
21
+ # Virtual environments
22
+ venv/
23
+ .venv/
24
+
25
+ # IDE
26
+ .vscode/
27
+ .idea/
28
+ *.swp
29
+ *.swo
30
+
31
+ # OS
32
+ .DS_Store
33
+ Thumbs.db
34
+ # Database
35
+ *.db
36
+ *.sqlite
37
+ *.sqlite3
38
+
39
+ # Logs
40
+ *.log
41
+
42
+ # Test/cache
43
+
44
+ .pytest_cache/
45
+ .mypy_cache/
46
+ .coverage
47
+ htmlcov/
48
+
49
+
50
+
51
+ # Node
52
+ node_modules/
53
+
54
+ # Environment variants
55
+ .env.*
56
+ !.env.example
57
+
@@ -0,0 +1,14 @@
1
+ FROM python:3.11-slim
2
+
3
+ WORKDIR /app
4
+
5
+ COPY requirements.txt .
6
+ RUN pip install --no-cache-dir -r requirements.txt
7
+
8
+ COPY . .
9
+
10
+ RUN mkdir -p data
11
+
12
+ EXPOSE 8000
13
+
14
+ CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
@@ -0,0 +1,30 @@
1
+ Metadata-Version: 2.4
2
+ Name: pytest-llm-sushit
3
+ Version: 0.1.0
4
+ Summary: LLM-powered semantic assertions for pytest
5
+ Project-URL: Homepage, https://github.com/Sushit-prog/pytest-llm
6
+ Project-URL: Repository, https://github.com/Sushit-prog/pytest-llm
7
+ Project-URL: Issues, https://github.com/Sushit-prog/pytest-llm/issues
8
+ Author: Sushit
9
+ License: MIT
10
+ Keywords: ai,assertions,langchain,langgraph,llm,ml,pytest,testing
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Framework :: Pytest
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Topic :: Software Development :: Testing
17
+ Requires-Python: >=3.9
18
+ Requires-Dist: anthropic>=0.20
19
+ Requires-Dist: openai>=1.0
20
+ Requires-Dist: pydantic>=2.0
21
+ Requires-Dist: pytest>=7.0
22
+ Requires-Dist: rich>=13.0
23
+ Requires-Dist: sentence-transformers>=2.2
24
+ Provides-Extra: dev
25
+ Requires-Dist: pytest-cov; extra == 'dev'
26
+ Requires-Dist: pytest>=7.0; extra == 'dev'
27
+ Provides-Extra: groq
28
+ Requires-Dist: groq>=0.4; extra == 'groq'
29
+ Provides-Extra: ollama
30
+ Requires-Dist: ollama>=0.1; extra == 'ollama'
@@ -0,0 +1,122 @@
1
+ # pytest-llm
2
+
3
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9%2B-blue)](https://www.python.org/downloads/)
4
+ [![PyPI version](https://img.shields.io/pypi/v/pytest-llm-sushit)](https://pypi.org/project/pytest-llm-sushit/)
5
+ [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](https://opensource.org/licenses/MIT)
6
+ [![Tests](https://img.shields.io/badge/tests-passing-brightgreen)](#)
7
+
8
+ LLM-powered semantic assertions for pytest.
9
+
10
+ ## Why?
11
+
12
+ Testing LLM outputs with string matching is brittle. `pytest-llm` adds semantic
13
+ assertions powered by LLM judges and local embeddings — check faithfulness, tone,
14
+ safety, hallucinations, and more with a single function call.
15
+
16
+ ## Quick start
17
+
18
+ ```bash
19
+ pip install pytest-llm-sushit
20
+ ```
21
+
22
+ ```python
23
+ from pytest_llm import assert_faithful, assert_tone, assert_safe
24
+
25
+ def test_llm_output():
26
+ output = "Python was created by Guido van Rossum in 1991."
27
+ source = "Guido van Rossum created Python, released in 1991."
28
+
29
+ assert_faithful(output, source) # factual accuracy
30
+ assert_tone(output, "professional") # tone check
31
+ assert_safe(output) # safety check
32
+ ```
33
+
34
+ ## How it works
35
+
36
+ ```
37
+ Your pytest test
38
+
39
+
40
+ pytest-llm assertion (assert_faithful, assert_regression...)
41
+
42
+ ├── Local path: sentence-transformers (no API call)
43
+ │ cosine similarity → pass/fail
44
+
45
+ └── LLM Judge path: your chosen provider
46
+ OpenAI / Anthropic / Groq / Ollama
47
+ JSON response → score + reason → pass/fail
48
+ ```
49
+
50
+ ## Assertions
51
+
52
+ | Assertion | What it checks | Uses API? |
53
+ |-----------|---------------|-----------|
54
+ | `assert_faithful` | Every factual claim in output is supported by source | Yes |
55
+ | `assert_no_hallucination` | Output contains no invented facts not in source | Yes |
56
+ | `assert_tone` | Output matches an expected tone (freeform string) | Yes |
57
+ | `assert_semantic_similarity` | Cosine similarity between output and expected text | No |
58
+ | `assert_contains_claim` | Output semantically contains a given claim | Yes |
59
+ | `assert_safe` | Output contains no harmful or offensive content | Yes |
60
+ | `assert_language` | Output is written in the expected language | Yes |
61
+ | `assert_regression` | Output is not worse than a baseline (similarity + quality) | Yes |
62
+
63
+ ## Configuration
64
+
65
+ ### Environment variables
66
+
67
+ ```bash
68
+ export LLM_JUDGE_PROVIDER=openai # or anthropic, groq, ollama
69
+ export LLM_JUDGE_MODEL=gpt-4o-mini # optional, defaults to provider best
70
+ export OPENAI_API_KEY=sk-... # set for your chosen provider
71
+ ```
72
+
73
+ ### conftest.py
74
+
75
+ ```python
76
+ from pytest_llm import pytest_configure_judge
77
+
78
+ pytest_configure_judge(provider="anthropic", model="claude-haiku-4-5-20251001")
79
+ ```
80
+
81
+ ### CLI options
82
+
83
+ ```bash
84
+ pytest --llm-judge-provider=anthropic --llm-judge-model=claude-haiku-4-5-20251001
85
+ pytest --llm-report # print Rich summary table after tests
86
+ ```
87
+
88
+ ## CI/CD with GitHub Actions
89
+
90
+ ```yaml
91
+ name: Tests
92
+ on: [push, pull_request]
93
+ jobs:
94
+ test:
95
+ runs-on: ubuntu-latest
96
+ steps:
97
+ - uses: actions/checkout@v4
98
+ - uses: actions/setup-python@v5
99
+ with:
100
+ python-version: "3.12"
101
+ - name: Install dependencies
102
+ run: pip install -e ".[dev]"
103
+ - name: Run tests
104
+ run: pytest tests/ -v --tb=short
105
+ ```
106
+
107
+ ## Provider support
108
+
109
+ | Provider | Default Model | Env var for API key |
110
+ |----------|---------------|---------------------|
111
+ | OpenAI | `gpt-4o-mini` | `OPENAI_API_KEY` |
112
+ | Anthropic | `claude-haiku-4-5-20251001` | `ANTHROPIC_API_KEY` |
113
+ | Groq | `llama-3.3-70b-versatile` | `GROQ_API_KEY` |
114
+ | Ollama | `llama3` | (local, no key needed) |
115
+
116
+ ## Works with langgraph-replay
117
+
118
+ `pytest-llm` integrates with [langgraph-replay](https://github.com/Sushit-prog/langgraph-replay) for tracing and replaying LangGraph agent sessions during evaluation.
119
+
120
+ ```bash
121
+ langgraph-replay blame session_abc --eval
122
+ ```
File without changes
File without changes
@@ -0,0 +1,70 @@
1
+ from fastapi import APIRouter
2
+ from sqlmodel import Session, select, func
3
+ from app.database import get_engine
4
+ from app.models.eval import EvalRun, EvalResult
5
+ from app.models.trace import Trace, ProviderUsage
6
+
7
# Router for the dashboard JSON endpoints; every route declared on it is
# served under the /api/v1/dashboard prefix and tagged "dashboard".
router = APIRouter(prefix="/api/v1/dashboard", tags=["dashboard"])
8
+
9
+
10
@router.get("/summary")
def get_summary():
    """Return headline totals for the dashboard.

    Counts eval runs, traces and eval results, plus how many results have the
    status "pass" and how many have the status "fail". ``pass_rate`` is
    passed / total results, formatted as a percentage with one decimal place,
    or "N/A" when there are no results at all.
    """
    # Base COUNT over eval results; the pass/fail variants only add a WHERE.
    result_count = select(func.count(EvalResult.id))

    with Session(get_engine()) as db:
        run_total = db.exec(select(func.count(EvalRun.id))).one()
        trace_total = db.exec(select(func.count(Trace.id))).one()
        result_total = db.exec(result_count).one()
        pass_total = db.exec(result_count.where(EvalResult.status == "pass")).one()
        fail_total = db.exec(result_count.where(EvalResult.status == "fail")).one()

    if result_total > 0:
        rate = f"{(pass_total / result_total * 100):.1f}%"
    else:
        rate = "N/A"

    return {
        "total_eval_runs": run_total,
        "total_traces": trace_total,
        "total_eval_results": result_total,
        "passed": pass_total,
        "failed": fail_total,
        "pass_rate": rate,
    }
27
+
28
+
29
@router.get("/providers")
def get_provider_stats():
    """Aggregate eval-run usage per "provider/model" pair.

    Returns:
        A list with one dict per provider/model combination containing the
        number of runs (``calls``), summed tokens, summed estimated cost
        (rounded to 4 decimals) and the average per-case latency in
        milliseconds (rounded to 1 decimal, 0 when no cases were recorded).
    """
    with Session(get_engine()) as session:
        runs = list(session.exec(select(EvalRun)).all())

    stats = {}
    for run in runs:
        bucket = stats.setdefault(
            f"{run.provider}/{run.model}",
            {
                "calls": 0,
                "total_tokens": 0,
                "total_cost": 0.0,
                "total_latency": 0.0,
                "total_cases": 0,
            },
        )
        bucket["calls"] += 1
        bucket["total_tokens"] += run.total_tokens
        bucket["total_cost"] += run.estimated_cost
        # Each run stores an average; multiplying by its case count recovers
        # the run's total latency so runs of different sizes are weighted
        # correctly.
        bucket["total_latency"] += run.avg_latency_ms * run.total_cases
        bucket["total_cases"] += run.total_cases

    return [
        {
            "provider_model": key,
            "calls": agg["calls"],
            "total_tokens": agg["total_tokens"],
            "total_cost": round(agg["total_cost"], 4),
            # Divide by the number of cases (the weights used above), not by
            # the number of runs, so the result really is a per-case average.
            "avg_latency_ms": (
                round(agg["total_latency"] / agg["total_cases"], 1)
                if agg["total_cases"] > 0
                else 0
            ),
        }
        for key, agg in stats.items()
    ]
54
+
55
+
56
@router.get("/failures")
def get_failure_summary():
    """Return the ten most frequent failure reasons.

    Considers eval results whose status is "fail" or "error". Results are
    grouped by their error message; results without one are grouped under
    "wrong_output". The list is ordered by descending count.
    """
    failing = select(EvalResult).where(EvalResult.status.in_(["fail", "error"]))
    with Session(get_engine()) as db:
        rows = list(db.exec(failing).all())

    tally = {}
    for row in rows:
        label = row.error_message or "wrong_output"
        tally[label] = tally.get(label, 0) + 1

    ranked = sorted(tally.items(), key=lambda item: -item[1])
    top_ten = ranked[:10]
    return [{"type": label, "count": count} for label, count in top_ten]
@@ -0,0 +1,142 @@
1
+ from fastapi import APIRouter, HTTPException
2
+ from pydantic import BaseModel
3
+ from typing import Optional
4
+ from app.services.eval_runner import EvalRunner
5
+
6
# Router for the dataset and run endpoints, served under /api/v1/eval.
router = APIRouter(prefix="/api/v1/eval", tags=["eval"])
# Single EvalRunner created at import time and shared by every handler in
# this module.
runner = EvalRunner()
8
+
9
+
10
# Request body accepted by POST /datasets.
class DatasetCreate(BaseModel):
    # Dataset name (required).
    name: str
    # Optional free-text description; omitted means None.
    description: Optional[str] = None
13
+
14
+
15
# One test case as submitted by a client inside a DatasetImport payload.
class TestCaseCreate(BaseModel):
    # Input text for the case (required).
    input: str
    # Expected output for the case (required).
    expected: str
    # Optional category label.
    category: Optional[str] = None
    # Difficulty label; defaults to "medium" when not supplied.
    difficulty: str = "medium"
20
+
21
+
22
# Request body accepted by POST /datasets/{dataset_id}/cases.
class DatasetImport(BaseModel):
    # Test cases to add to the dataset.
    cases: list[TestCaseCreate]
24
+
25
+
26
# Request body accepted by POST /runs.
class RunCreate(BaseModel):
    # Id of the dataset to evaluate.
    dataset_id: int
    # Provider name, forwarded to the runner as provider_name.
    provider: str
    # Model identifier, forwarded to the runner unchanged.
    model: str
    # Optional prompt template, forwarded to the runner; None when omitted.
    prompt_template: Optional[str] = None
31
+
32
+
33
@router.post("/datasets")
def create_dataset(body: DatasetCreate):
    """Create a dataset through the shared runner and return its id and name."""
    created = runner.create_dataset(name=body.name, description=body.description)
    response = {"id": created.id, "name": created.name}
    return response
37
+
38
+
39
@router.get("/datasets")
def list_datasets():
    """List every dataset known to the runner.

    Each entry carries the id, name, description and the creation timestamp
    in ISO 8601 form.
    """
    listing = []
    for entry in runner.list_datasets():
        listing.append(
            {
                "id": entry.id,
                "name": entry.name,
                "description": entry.description,
                "created_at": entry.created_at.isoformat(),
            }
        )
    return listing
43
+
44
+
45
@router.get("/datasets/{dataset_id}")
def get_dataset(dataset_id: int):
    """Return one dataset together with its test cases.

    Raises:
        HTTPException: 404 when the runner returns no dataset for the id.
    """
    found = runner.get_dataset(dataset_id)
    if not found:
        raise HTTPException(status_code=404, detail="Dataset not found")

    case_rows = []
    for case in runner.get_test_cases(dataset_id):
        case_rows.append(
            {
                "id": case.id,
                "input": case.input_text,
                "expected": case.expected_output,
                "category": case.category,
                "difficulty": case.difficulty,
            }
        )

    return {
        "id": found.id,
        "name": found.name,
        "description": found.description,
        "cases": case_rows,
    }
57
+
58
+
59
@router.post("/datasets/{dataset_id}/cases")
def add_test_cases(dataset_id: int, body: DatasetImport):
    """Add the submitted test cases to an existing dataset.

    Returns:
        ``{"added": n}`` where ``n`` is the value returned by the runner.

    Raises:
        HTTPException: 404 when the runner returns no dataset for the id.
    """
    if not runner.get_dataset(dataset_id):
        raise HTTPException(status_code=404, detail="Dataset not found")

    # The runner receives plain dicts rather than pydantic models.
    payload = []
    for case in body.cases:
        payload.append(case.model_dump())

    added = runner.add_test_cases(dataset_id, payload)
    return {"added": added}
67
+
68
+
69
@router.post("/runs")
async def create_and_run_eval(body: RunCreate):
    """Create an eval run, execute it, and return its summary.

    The run is created from the request body, then awaited to completion
    before responding. ``pass_rate`` is a percentage string with one decimal
    place, or "N/A" when the run has no cases.
    """
    pending = runner.create_run(
        dataset_id=body.dataset_id,
        provider_name=body.provider,
        model=body.model,
        prompt_template=body.prompt_template,
    )
    finished = await runner.execute_run(pending.id)

    if finished.total_cases > 0:
        rate = f"{(finished.passed_cases / finished.total_cases * 100):.1f}%"
    else:
        rate = "N/A"

    summary = {
        "id": finished.id,
        "status": finished.status,
        "total_cases": finished.total_cases,
        "passed": finished.passed_cases,
        "failed": finished.failed_cases,
        "pass_rate": rate,
        "avg_latency_ms": round(finished.avg_latency_ms, 1),
        "total_tokens": finished.total_tokens,
        "estimated_cost": round(finished.estimated_cost, 4),
    }
    return summary
89
+
90
+
91
@router.get("/runs")
def list_runs(dataset_id: Optional[int] = None):
    """List eval runs, passing the optional ``dataset_id`` filter to the runner.

    Latency is rounded to 1 decimal and cost to 4. Timestamps are ISO 8601
    strings; ``completed_at`` is None while a run has no completion time.
    """

    def describe(run):
        # Flatten one run record into the JSON shape exposed by the API.
        finished_at = None
        if run.completed_at:
            finished_at = run.completed_at.isoformat()
        return {
            "id": run.id,
            "dataset_id": run.dataset_id,
            "provider": run.provider,
            "model": run.model,
            "status": run.status,
            "total_cases": run.total_cases,
            "passed": run.passed_cases,
            "failed": run.failed_cases,
            "avg_latency_ms": round(run.avg_latency_ms, 1),
            "estimated_cost": round(run.estimated_cost, 4),
            "created_at": run.created_at.isoformat(),
            "completed_at": finished_at,
        }

    return [describe(run) for run in runner.list_runs(dataset_id=dataset_id)]
111
+
112
+
113
@router.get("/runs/{run_id}")
def get_run(run_id: int):
    """Return one eval run with its per-case results.

    Raises:
        HTTPException: 404 when the runner returns no run for the id.
    """
    found = runner.get_run(run_id)
    if not found:
        raise HTTPException(status_code=404, detail="Run not found")

    result_rows = []
    for item in runner.get_run_results(run_id):
        result_rows.append(
            {
                "id": item.id,
                "test_case_id": item.test_case_id,
                "status": item.status,
                "actual_output": item.actual_output,
                "score": item.score,
                "latency_ms": item.latency_ms,
                "error_message": item.error_message,
            }
        )

    return {
        "id": found.id,
        "provider": found.provider,
        "model": found.model,
        "status": found.status,
        "total_cases": found.total_cases,
        "passed": found.passed_cases,
        "failed": found.failed_cases,
        "avg_latency_ms": round(found.avg_latency_ms, 1),
        "total_tokens": found.total_tokens,
        "estimated_cost": round(found.estimated_cost, 4),
        "results": result_rows,
    }
@@ -0,0 +1,71 @@
1
+ from fastapi import APIRouter, HTTPException
2
+ from sqlmodel import Session, select
3
+ from app.database import get_engine
4
+ from app.models.trace import Trace, Span
5
+
6
# Router for the trace JSON endpoints; every route declared on it is served
# under the /api/v1/traces prefix and tagged "traces".
router = APIRouter(prefix="/api/v1/traces", tags=["traces"])
7
+
8
+
9
@router.get("")
def list_traces(limit: int = 50):
    """Return the most recently created traces, newest first.

    Args:
        limit: Maximum number of traces to return (default 50).
    """
    newest_first = select(Trace).order_by(Trace.created_at.desc()).limit(limit)
    with Session(get_engine()) as db:
        rows = list(db.exec(newest_first).all())

    listing = []
    for row in rows:
        listing.append(
            {
                "trace_id": row.trace_id,
                "name": row.name,
                "status": row.status,
                "total_latency_ms": round(row.total_latency_ms, 1),
                "total_tokens": row.total_tokens,
                "created_at": row.created_at.isoformat(),
            }
        )
    return listing
24
+
25
+
26
@router.get("/{trace_id}")
def get_trace(trace_id: str):
    """Return one trace with its spans ordered by creation time.

    Span input and output text are truncated to the first 500 characters.

    Raises:
        HTTPException: 404 when no trace with ``trace_id`` exists.
    """
    with Session(get_engine()) as session:
        trace = session.exec(select(Trace).where(Trace.trace_id == trace_id)).first()
        if not trace:
            raise HTTPException(status_code=404, detail="Trace not found")
        spans = list(
            session.exec(
                select(Span).where(Span.trace_id == trace_id).order_by(Span.created_at)
            ).all()
        )

    return {
        "trace_id": trace.trace_id,
        "name": trace.name,
        "status": trace.status,
        "total_latency_ms": round(trace.total_latency_ms, 1),
        "total_tokens": trace.total_tokens,
        "created_at": trace.created_at.isoformat(),
        "spans": [
            {
                "span_id": s.span_id,
                "parent_span_id": s.parent_span_id,
                "name": s.name,
                "provider": s.provider,
                "model": s.model,
                "status": s.status,
                # Compare against None explicitly: a measured latency of 0.0
                # is a real value and must not be reported as missing.
                "latency_ms": round(s.latency_ms, 1) if s.latency_ms is not None else None,
                "tokens_in": s.tokens_in,
                "tokens_out": s.tokens_out,
                "input_text": s.input_text[:500] if s.input_text else None,
                "output_text": s.output_text[:500] if s.output_text else None,
                "error_message": s.error_message,
            }
            for s in spans
        ],
    }
59
+
60
+
61
@router.delete("/{trace_id}")
def delete_trace(trace_id: str):
    """Delete a trace and every span recorded under its ``trace_id``.

    Returns:
        ``{"deleted": True}`` once the rows have been removed.

    Raises:
        HTTPException: 404 when neither a trace nor any span exists for
            ``trace_id``, matching the behaviour of ``get_trace``.
    """
    with Session(get_engine()) as session:
        spans = session.exec(select(Span).where(Span.trace_id == trace_id)).all()
        trace = session.exec(select(Trace).where(Trace.trace_id == trace_id)).first()

        # Nothing matched: report that instead of claiming a deletion happened.
        if not trace and not spans:
            raise HTTPException(status_code=404, detail="Trace not found")

        # Spans without a parent trace are still cleaned up, as before.
        for span in spans:
            session.delete(span)
        if trace:
            session.delete(trace)
        session.commit()
    return {"deleted": True}
@@ -0,0 +1,20 @@
1
+ from pydantic_settings import BaseSettings
2
+ from functools import lru_cache
3
+
4
+
5
class Settings(BaseSettings):
    """Application settings, read from the environment and an optional ``.env`` file."""

    # Database connection URL.
    database_url: str = "sqlite+aiosqlite:///./data/reliability.db"
    # Deployment environment name.
    app_env: str = "development"
    # Placeholder default; must be overridden outside development.
    secret_key: str = "change-me-in-production"

    # Provider API keys; an empty string means "not configured".
    groq_api_key: str = ""
    mistral_api_key: str = ""
    gemini_api_key: str = ""
    openrouter_api_key: str = ""

    # extra="ignore": a shared .env commonly holds variables this model does
    # not declare; without this, pydantic-settings rejects such keys and
    # settings construction fails.
    model_config = {"env_file": ".env", "env_file_encoding": "utf-8", "extra": "ignore"}
16
+
17
+
18
@lru_cache()
def get_settings() -> Settings:
    """Return the application settings, built on first use and cached afterwards."""
    loaded = Settings()
    return loaded
@@ -0,0 +1,26 @@
1
+ from sqlmodel import SQLModel, create_engine, Session
2
+ from app.config import get_settings
3
+ import os
4
+
5
# Engine shared by the whole process; created on the first get_engine() call.
engine = None


def get_engine():
    """Return the shared engine, creating it on first use.

    The configured URL may name the ``sqlite+aiosqlite`` driver; it is
    rewritten to plain ``sqlite`` before the engine is created. For
    file-backed SQLite URLs the parent directory of the database file is
    created if it is missing. Other URLs (a different backend, or an
    in-memory database) are passed through without touching the filesystem.
    """
    global engine
    if engine is None:
        settings = get_settings()
        sync_url = settings.database_url.replace("sqlite+aiosqlite", "sqlite")

        file_prefix = "sqlite:///"
        if sync_url.startswith(file_prefix):
            # Drop any "?param=..." suffix so it is not mistaken for part of
            # the file path.
            db_path = sync_url[len(file_prefix):].split("?", 1)[0]
            parent = os.path.dirname(db_path)
            # An empty parent means the current directory (or ":memory:"),
            # so there is nothing to create.
            if parent:
                os.makedirs(parent, exist_ok=True)

        engine = create_engine(sync_url)
    return engine
16
+
17
+
18
def init_db():
    """Create all tables registered on ``SQLModel.metadata`` using the shared engine."""
    SQLModel.metadata.create_all(get_engine())
21
+
22
+
23
def get_session():
    """Yield a session bound to the shared engine.

    The session is closed when the generator is resumed or closed.
    """
    with Session(get_engine()) as db:
        yield db
@@ -0,0 +1,116 @@
1
+ from fastapi import FastAPI, Request
2
+ from fastapi.templating import Jinja2Templates
3
+ from sqlmodel import Session, select
4
+ from app.database import init_db, get_engine
5
+ from app.models.eval import EvalDataset, EvalRun, EvalResult, TestCase
6
+ from app.models.trace import Trace, Span
7
+ from app.api import eval as eval_api
8
+ from app.api import trace as trace_api
9
+ from app.api import dashboard as dashboard_api
10
+ from app.services.providers.registry import list_providers
11
+
12
# FastAPI application object for the platform.
app = FastAPI(title="AI Reliability Platform", version="0.1.0")

# HTML templates are loaded from app/templates.
# NOTE(review): the path is relative, so it presumably resolves against the
# process working directory — confirm the app is always started from the
# project root.
templates = Jinja2Templates(directory="app/templates")

# Register the JSON API routers for eval, trace and dashboard endpoints.
app.include_router(eval_api.router)
app.include_router(trace_api.router)
app.include_router(dashboard_api.router)
19
+
20
+
21
@app.on_event("startup")
def on_startup():
    """Initialise the database when the application starts."""
    init_db()
24
+
25
+
26
@app.get("/health")
def health():
    """Liveness probe: always returns ``{"status": "ok"}``."""
    return {"status": "ok"}
29
+
30
+
31
@app.get("/")
def dashboard(request: Request):
    """Render the HTML dashboard.

    Shows the number of eval runs, traces and eval results, the number of
    passed results with the resulting pass rate ("N/A" when there are no
    results), and the five most recent runs and traces.
    """
    # Imported locally because this module's top-level sqlmodel import only
    # brings in Session and select.
    from sqlmodel import func

    with Session(get_engine()) as session:
        # Let the database count and pick the newest rows instead of loading
        # every row into memory on each page view.
        total_runs = session.exec(select(func.count(EvalRun.id))).one()
        total_traces = session.exec(select(func.count(Trace.id))).one()
        total_results = session.exec(select(func.count(EvalResult.id))).one()
        passed = session.exec(
            select(func.count(EvalResult.id)).where(EvalResult.status == "pass")
        ).one()
        recent_runs = list(
            session.exec(select(EvalRun).order_by(EvalRun.created_at.desc()).limit(5)).all()
        )
        recent_traces = list(
            session.exec(select(Trace).order_by(Trace.created_at.desc()).limit(5)).all()
        )

    pass_rate = f"{(passed / total_results * 100):.1f}%" if total_results > 0 else "N/A"

    return templates.TemplateResponse("dashboard.html", context={
        "request": request,
        "total_runs": total_runs,
        "total_traces": total_traces,
        "total_results": total_results,
        "passed": passed,
        "pass_rate": pass_rate,
        "recent_runs": recent_runs,
        "recent_traces": recent_traces,
    })
53
+
54
+
55
@app.get("/eval/datasets")
def eval_datasets_page(request: Request):
    """Render the HTML page listing every eval dataset."""
    with Session(get_engine()) as db:
        all_datasets = list(db.exec(select(EvalDataset)).all())

    page_context = {"request": request, "datasets": all_datasets}
    return templates.TemplateResponse("eval_datasets.html", context=page_context)
60
+
61
+
62
@app.get("/eval/datasets/{dataset_id}")
def eval_dataset_detail_page(request: Request, dataset_id: int):
    """Render the HTML detail page for one dataset and its test cases.

    Responds with the 404 template (status 404) when the dataset is missing.
    """
    cases_query = select(TestCase).where(TestCase.dataset_id == dataset_id)
    with Session(get_engine()) as db:
        dataset = db.get(EvalDataset, dataset_id)
        cases = list(db.exec(cases_query).all())

    if dataset:
        page_context = {"request": request, "dataset": dataset, "cases": cases}
        return templates.TemplateResponse("eval_detail.html", context=page_context)
    return templates.TemplateResponse("404.html", context={"request": request}, status_code=404)
70
+
71
+
72
@app.get("/eval/runs")
def eval_runs_page(request: Request):
    """Render the HTML page listing eval runs, newest first."""
    newest_first = select(EvalRun).order_by(EvalRun.created_at.desc())
    with Session(get_engine()) as db:
        all_runs = list(db.exec(newest_first).all())

    page_context = {"request": request, "runs": all_runs}
    return templates.TemplateResponse("eval_runs.html", context=page_context)
77
+
78
+
79
@app.get("/eval/runs/{run_id}")
def eval_run_detail_page(request: Request, run_id: int):
    """Render the HTML detail page for one eval run.

    The template receives the run, its results, and the run's dataset test
    cases keyed by case id. Responds with the 404 template (status 404) when
    the run is missing.
    """
    with Session(get_engine()) as db:
        run = db.get(EvalRun, run_id)
        results = list(db.exec(select(EvalResult).where(EvalResult.run_id == run_id)).all())

        # Test cases can only be looked up once the run's dataset is known.
        cases = {}
        if run:
            dataset_cases = db.exec(select(TestCase).where(TestCase.dataset_id == run.dataset_id)).all()
            for case in dataset_cases:
                cases[case.id] = case

    if not run:
        return templates.TemplateResponse("404.html", context={"request": request}, status_code=404)

    page_context = {"request": request, "run": run, "results": results, "cases": cases}
    return templates.TemplateResponse("eval_run_detail.html", context=page_context)
93
+
94
+
95
@app.get("/traces")
def traces_page(request: Request):
    """Render the HTML page listing traces, newest first."""
    newest_first = select(Trace).order_by(Trace.created_at.desc())
    with Session(get_engine()) as db:
        all_traces = list(db.exec(newest_first).all())

    page_context = {"request": request, "traces": all_traces}
    return templates.TemplateResponse("traces.html", context=page_context)
100
+
101
+
102
@app.get("/traces/{trace_id}")
def trace_detail_page(request: Request, trace_id: str):
    """Render the HTML detail page for one trace and its spans.

    Spans are ordered by creation time. Responds with the 404 template
    (status 404) when the trace is missing.
    """
    with Session(get_engine()) as db:
        trace = db.exec(select(Trace).where(Trace.trace_id == trace_id)).first()
        if trace:
            spans_query = select(Span).where(Span.trace_id == trace_id).order_by(Span.created_at)
            spans = list(db.exec(spans_query).all())
        else:
            spans = []

    if not trace:
        return templates.TemplateResponse("404.html", context={"request": request}, status_code=404)

    page_context = {"request": request, "trace": trace, "spans": spans}
    return templates.TemplateResponse("trace_detail.html", context=page_context)
112
+
113
+
114
@app.get("/api/v1/providers")
def get_providers():
    """Return whatever the provider registry's ``list_providers()`` reports."""
    return list_providers()