llm-eval-harness 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llm_eval_harness-0.1.0/.env.example +1 -0
- llm_eval_harness-0.1.0/.github/ISSUE_TEMPLATE/bug_report.md +27 -0
- llm_eval_harness-0.1.0/.github/ISSUE_TEMPLATE/feature_request.md +20 -0
- llm_eval_harness-0.1.0/.github/workflows/ci.yml +31 -0
- llm_eval_harness-0.1.0/.gitignore +21 -0
- llm_eval_harness-0.1.0/AGENTS.md +109 -0
- llm_eval_harness-0.1.0/CHANGELOG.md +17 -0
- llm_eval_harness-0.1.0/GENERATION_PROMPT.md +34 -0
- llm_eval_harness-0.1.0/LICENSE +21 -0
- llm_eval_harness-0.1.0/PKG-INFO +79 -0
- llm_eval_harness-0.1.0/README.md +48 -0
- llm_eval_harness-0.1.0/pyproject.toml +50 -0
- llm_eval_harness-0.1.0/pyproject.toml.bak +66 -0
- llm_eval_harness-0.1.0/run_droid.sh +17 -0
- llm_eval_harness-0.1.0/src/__init__.py +3 -0
- llm_eval_harness-0.1.0/src/cli.py +303 -0
- llm_eval_harness-0.1.0/src/db.py +506 -0
- llm_eval_harness-0.1.0/src/evaluator.py +357 -0
- llm_eval_harness-0.1.0/src/ingest.py +220 -0
- llm_eval_harness-0.1.0/src/judges.py +100 -0
- llm_eval_harness-0.1.0/src/models.py +165 -0
- llm_eval_harness-0.1.0/src/reporter.py +162 -0
- llm_eval_harness-0.1.0/tests/__init__.py +1 -0
- llm_eval_harness-0.1.0/tests/conftest.py +82 -0
- llm_eval_harness-0.1.0/tests/test_cli.py +168 -0
- llm_eval_harness-0.1.0/tests/test_db.py +193 -0
- llm_eval_harness-0.1.0/tests/test_evaluator.py +226 -0
- llm_eval_harness-0.1.0/tests/test_ingest.py +162 -0
- llm_eval_harness-0.1.0/tests/test_judges.py +103 -0
- llm_eval_harness-0.1.0/tests/test_models.py +161 -0
- llm_eval_harness-0.1.0/tests/test_reporter.py +114 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
OPENRIXER_API_KEY=sk-or-replace-me
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: Bug report
|
|
3
|
+
about: Report a bug in eval-harness
|
|
4
|
+
title: '[BUG] '
|
|
5
|
+
labels: bug
|
|
6
|
+
assignees: ''
|
|
7
|
+
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
**Describe the bug**
|
|
11
|
+
A clear description of what the bug is.
|
|
12
|
+
|
|
13
|
+
**To Reproduce**
|
|
14
|
+
Steps to reproduce:
|
|
15
|
+
1. Run `eval-harness run ...`
|
|
16
|
+
2. See error
|
|
17
|
+
|
|
18
|
+
**Eval Run ID** (if applicable)
|
|
19
|
+
<!-- Run ID from the database or output -->
|
|
20
|
+
|
|
21
|
+
**Environment**
|
|
22
|
+
- eval-harness version:
|
|
23
|
+
- Python version:
|
|
24
|
+
- OS:
|
|
25
|
+
|
|
26
|
+
**Additional context**
|
|
27
|
+
Add any other context here.
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: Feature request
|
|
3
|
+
about: Suggest a new feature
|
|
4
|
+
title: '[FEAT] '
|
|
5
|
+
labels: enhancement
|
|
6
|
+
assignees: ''
|
|
7
|
+
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
**Problem**
|
|
11
|
+
What problem are you trying to solve?
|
|
12
|
+
|
|
13
|
+
**Solution**
|
|
14
|
+
What do you want to happen?
|
|
15
|
+
|
|
16
|
+
**Alternatives**
|
|
17
|
+
Any alternative solutions you've considered?
|
|
18
|
+
|
|
19
|
+
**Additional context**
|
|
20
|
+
Add any other context here.
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
strategy:
|
|
13
|
+
matrix:
|
|
14
|
+
python-version: ["3.11", "3.12"]
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
18
|
+
uses: actions/setup-python@v5
|
|
19
|
+
with:
|
|
20
|
+
python-version: ${{ matrix.python-version }}
|
|
21
|
+
- name: Install
|
|
22
|
+
run: |
|
|
23
|
+
python -m pip install --upgrade pip
|
|
24
|
+
python -m pip install -e ".[dev]"
|
|
25
|
+
- name: Lint
|
|
26
|
+
run: |
|
|
27
|
+
ruff check src tests
|
|
28
|
+
ruff format --check src tests
|
|
29
|
+
- name: Test
|
|
30
|
+
run: |
|
|
31
|
+
pytest tests/ -v --cov=src --cov-report=term-missing
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
__pycache__/
|
|
2
|
+
*.pyc
|
|
3
|
+
*.pyo
|
|
4
|
+
*.egg-info/
|
|
5
|
+
build/
|
|
6
|
+
dist/
|
|
7
|
+
.venv/
|
|
8
|
+
venv/
|
|
9
|
+
.env
|
|
10
|
+
.coverage
|
|
11
|
+
coverage.xml
|
|
12
|
+
htmlcov/
|
|
13
|
+
.pytest_cache/
|
|
14
|
+
.ruff_cache/
|
|
15
|
+
.mypy_cache/
|
|
16
|
+
*.db
|
|
17
|
+
*.db-journal
|
|
18
|
+
*.db-wal
|
|
19
|
+
*.db-shm
|
|
20
|
+
.eval-harness/
|
|
21
|
+
/push.sh
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
# AGENTS.md — Eval Harness Project
|
|
2
|
+
|
|
3
|
+
## Project
|
|
4
|
+
Python CLI tool that evaluates LLM outputs from production logs against a dual-dimension rubric (faithfulness + task completion).
|
|
5
|
+
|
|
6
|
+
## Tech Stack
|
|
7
|
+
- Python 3.11+
|
|
8
|
+
- hatchling build backend (pyproject.toml)
|
|
9
|
+
- Typer + Rich (CLI + terminal output)
|
|
10
|
+
- Pydantic v2 (data validation)
|
|
11
|
+
- sqlite3 stdlib (sync, WAL mode)
|
|
12
|
+
- httpx (async HTTP for judge API calls)
|
|
13
|
+
- tiktoken (token estimation)
|
|
14
|
+
- pytest + pytest-asyncio + pytest-httpx + pytest-cov + ruff
|
|
15
|
+
|
|
16
|
+
## Project Structure
|
|
17
|
+
```
|
|
18
|
+
eval-harness/
|
|
19
|
+
├── src/
|
|
20
|
+
│ ├── __init__.py
|
|
21
|
+
│ ├── cli.py # Typer app: run, judges, report, export, cache
|
|
22
|
+
│ ├── models.py # Pydantic: EvalRecord, EvalResult, EvalRun, JudgeCacheEntry, RubricTemplate, EvalSummary
|
|
23
|
+
│ ├── db.py # SQLite: schema versioning, CRUD, migrations, export
|
|
24
|
+
│ ├── ingest.py # JSONL + CSV + stdin parser: --sample, --since, lenient parsing
|
|
25
|
+
│ ├── evaluator.py # LLM-as-judge: async batch, round-robin fallback, response caching
|
|
26
|
+
│ ├── reporter.py # Rich terminal tables, ASCII histogram, JSON/CSV export
|
|
27
|
+
│ └── judges.py # OpenRouter free model fetcher/cache
|
|
28
|
+
├── tests/
|
|
29
|
+
│ ├── conftest.py # Fixtures, mock judge responses, temp DB setup
|
|
30
|
+
│ ├── test_models.py
|
|
31
|
+
│ ├── test_db.py
|
|
32
|
+
│ ├── test_ingest.py # VCR cassettes for mock LLM responses
|
|
33
|
+
│ ├── test_evaluator.py # pytest-httpx for mock HTTP
|
|
34
|
+
│ ├── test_reporter.py
|
|
35
|
+
│ ├── test_cli.py # Typer CliRunner, end-to-end
|
|
36
|
+
│ └── test_judges.py
|
|
37
|
+
├── .github/workflows/ci.yml
|
|
38
|
+
├── pyproject.toml
|
|
39
|
+
├── README.md
|
|
40
|
+
├── .gitignore
|
|
41
|
+
├── .env.example
|
|
42
|
+
├── LICENSE (MIT)
|
|
43
|
+
└── CHANGELOG.md
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Database Schema (v1)
|
|
47
|
+
```sql
|
|
48
|
+
CREATE TABLE schema_version (version INTEGER PRIMARY KEY, applied_at TEXT NOT NULL);
|
|
49
|
+
|
|
50
|
+
CREATE TABLE eval_runs (
|
|
51
|
+
run_id TEXT PRIMARY KEY, created_at TEXT NOT NULL, config_json TEXT NOT NULL,
|
|
52
|
+
record_count INTEGER DEFAULT 0, rubric_id TEXT DEFAULT 'faithfulness-v1',
|
|
53
|
+
judge_model TEXT, status TEXT DEFAULT 'running', completed_at TEXT,
|
|
54
|
+
mean_score REAL, pass_rate REAL, eval_time_seconds REAL
|
|
55
|
+
);
|
|
56
|
+
|
|
57
|
+
CREATE TABLE eval_records (
|
|
58
|
+
record_id TEXT PRIMARY KEY, run_id TEXT NOT NULL REFERENCES eval_runs(run_id),
|
|
59
|
+
input_text TEXT NOT NULL, output_text TEXT NOT NULL, reference_text TEXT,
|
|
60
|
+
source_file TEXT, metadata_json TEXT, created_at TEXT NOT NULL
|
|
61
|
+
);
|
|
62
|
+
CREATE INDEX idx_records_run ON eval_records(run_id);
|
|
63
|
+
|
|
64
|
+
CREATE TABLE eval_results (
|
|
65
|
+
result_id TEXT PRIMARY KEY, record_id TEXT NOT NULL REFERENCES eval_records(record_id),
|
|
66
|
+
run_id TEXT NOT NULL REFERENCES eval_runs(run_id), rubric_id TEXT DEFAULT 'faithfulness-v1',
|
|
67
|
+
rubric_version TEXT DEFAULT '1.0', faithfulness REAL NOT NULL, task_completion REAL NOT NULL,
|
|
68
|
+
combined_score REAL NOT NULL, pass_fail TEXT NOT NULL, reasoning TEXT DEFAULT '',
|
|
69
|
+
faithfulness_reasoning TEXT DEFAULT '', task_completion_reasoning TEXT DEFAULT '',
|
|
70
|
+
judge_model TEXT NOT NULL, judge_fallbacks INTEGER DEFAULT 0, judge_tried TEXT DEFAULT '[]',
|
|
71
|
+
tokens_estimated INTEGER, evaluated_at TEXT NOT NULL, error TEXT
|
|
72
|
+
);
|
|
73
|
+
CREATE INDEX idx_results_run ON eval_results(run_id);
|
|
74
|
+
|
|
75
|
+
CREATE TABLE judge_cache (
|
|
76
|
+
cache_key TEXT PRIMARY KEY, model_id TEXT NOT NULL, rubric_version TEXT NOT NULL,
|
|
77
|
+
response_json TEXT NOT NULL, created_at TEXT NOT NULL, hits INTEGER DEFAULT 1
|
|
78
|
+
);
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## Input Schema (JSONL)
|
|
82
|
+
```json
|
|
83
|
+
{"input": "user prompt", "output": "model response", "reference": "optional ground truth"}
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
## Judge Output Schema
|
|
87
|
+
```json
|
|
88
|
+
{"faithfulness": 0.0-1.0, "task_completion": 0.0-1.0, "reasoning": "str", "faithfulness_reasoning": "str", "task_completion_reasoning": "str"}
|
|
89
|
+
```
|
|
90
|
+
Combined: 0.5 * faithfulness + 0.5 * task_completion. Pass/fail threshold: 0.7.
|
|
91
|
+
|
|
92
|
+
## CLI Commands
|
|
93
|
+
- `eval-harness run <file>` — primary: ingest + evaluate + report. Flags: --format jsonl|csv, --input-col, --output-col, --reference-col, --sample N, --since DATE, --limit N, --judge MODEL, --no-fallback, --max-fallbacks N, --pass-threshold FLOAT, --output json|table, --output-file PATH, --dry-run, --resume, --timeout SECONDS, --rpm-limit INT, --yes, --verbose, --quiet, --config PATH
|
|
94
|
+
- `eval-harness judges` — list free judge models. Flags: --refresh, --json
|
|
95
|
+
- `eval-harness report --run-id UUID` — show results. Flags: --output json|table|csv, --output-file PATH
|
|
96
|
+
- `eval-harness export --run-id UUID --format json|csv --output-file PATH`
|
|
97
|
+
- `eval-harness cache [--clear] [--stats]`
|
|
98
|
+
|
|
99
|
+
Exit codes: 0=all pass, 1=any failures, 2=evaluator error
|
|
100
|
+
|
|
101
|
+
## API Key
|
|
102
|
+
Read from OPENRIXER_API_KEY env var (not hardcoded).
|
|
103
|
+
|
|
104
|
+
## Rules
|
|
105
|
+
- TDD: write failing test BEFORE implementation
|
|
106
|
+
- Type hints everywhere
|
|
107
|
+
- Google-style docstrings
|
|
108
|
+
- ruff check + ruff format before every commit
|
|
109
|
+
- Commit after every task
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [0.1.0] - 2026-05-29
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
- Typer CLI: `run`, `judges`, `report`, `export`, `cache`
|
|
12
|
+
- Pydantic v2 models: `EvalRecord`, `EvalResult`, `EvalRun`, `JudgeCacheEntry`, `RubricTemplate`, `EvalSummary`
|
|
13
|
+
- SQLite persistence layer with WAL mode and schema migrations
|
|
14
|
+
- JSONL/CSV/stdin ingestion with sampling, since-date filtering, and lenient parsing
|
|
15
|
+
- Async LLM-as-judge evaluator with round-robin fallback and response caching
|
|
16
|
+
- Rich-based reporter with summary table, ASCII histogram, and JSON/CSV export
|
|
17
|
+
- OpenRouter free-model fetcher and on-disk cache
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
Generate the complete eval-harness Python CLI project. Follow the AGENTS.md spec in the current directory exactly.
|
|
2
|
+
|
|
3
|
+
Create ALL files listed in the project structure. Use TDD: write each test file before its corresponding source file.
|
|
4
|
+
|
|
5
|
+
Files to create:
|
|
6
|
+
1. pyproject.toml (hatchling build, all deps, ruff config, test config)
|
|
7
|
+
2. .github/workflows/ci.yml (ruff + pytest)
|
|
8
|
+
3. README.md (install, quickstart, CI/CD example)
|
|
9
|
+
4. .gitignore
|
|
10
|
+
5. .env.example
|
|
11
|
+
6. LICENSE (MIT)
|
|
12
|
+
7. CHANGELOG.md
|
|
13
|
+
8. src/__init__.py
|
|
14
|
+
9. src/models.py (Pydantic v2: EvalRecord, EvalResult, EvalRun, JudgeCacheEntry, RubricTemplate, BUILTIN_RUBRIC_V1, PassFail/RunStatus enums, EvalSummary)
|
|
15
|
+
10. src/db.py (SQLite WAL mode, schema versioning, full CRUD for all 5 tables, export JSON/CSV)
|
|
16
|
+
11. src/ingest.py (JSONL + CSV + stdin parser: --sample N, --since DATE, lenient parsing, no auto-detect)
|
|
17
|
+
12. src/evaluator.py (async batch eval, round-robin fallback, sqlite response cache, token tracking, markdown-wrapped JSON handling)
|
|
18
|
+
13. src/reporter.py (Rich terminal tables, ASCII histogram, JSON/CSV export, judge usage stats)
|
|
19
|
+
14. src/cli.py (Typer app: run/judges/report/export/cache commands, all flags from AGENTS.md, exit codes 0/1/2, --dry-run, --resume, progress bar for batches > 10)
|
|
20
|
+
15. src/judges.py (OpenRouter model fetch/cache, rank by context length, --refresh, --json)
|
|
21
|
+
16. tests/conftest.py (temp DB per test, sample records, mock judge responses)
|
|
22
|
+
17. tests/test_models.py, test_db.py, test_ingest.py, test_evaluator.py, test_reporter.py, test_cli.py, test_judges.py
|
|
23
|
+
|
|
24
|
+
Requirements:
|
|
25
|
+
- Python 3.11+, hatchling build backend
|
|
26
|
+
- Dependencies: typer>=0.12, pydantic>=2.0, rich>=13.0, httpx>=0.27, tiktoken>=0.7
|
|
27
|
+
- Dev deps: pytest>=8, pytest-httpx>=0.30, pytest-cov>=5, pytest-asyncio>=0.23, ruff>=0.5
|
|
28
|
+
- OPENRIXER_API_KEY env var (never hardcoded)
|
|
29
|
+
- TDD for every module: test file before source file, each public function tested
|
|
30
|
+
- Type hints everywhere (mypy-compatible)
|
|
31
|
+
- Google-style docstrings for all public functions
|
|
32
|
+
- The project must work: pip install -e . && eval-harness --help
|
|
33
|
+
- Database: sqlite3 stdlib (NOT aiosqlite), WAL mode, foreign keys, schema migration system
|
|
34
|
+
- Judge API: OpenRouter (openrouter.ai/api/v1/chat/completions), round-robin fallback through cached free models
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Eval Harness Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: llm-eval-harness
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: CLI tool that evaluates LLM outputs from production logs against a dual-dimension rubric.
|
|
5
|
+
Project-URL: Homepage, https://github.com/onicarps/eval-harness
|
|
6
|
+
Project-URL: Repository, https://github.com/onicarps/eval-harness
|
|
7
|
+
Project-URL: Issues, https://github.com/onicarps/eval-harness/issues
|
|
8
|
+
Author: llm-eval-harness
|
|
9
|
+
License: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: cli,evaluation,faithfulness,llm,rubric
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
18
|
+
Requires-Python: >=3.11
|
|
19
|
+
Requires-Dist: httpx>=0.27
|
|
20
|
+
Requires-Dist: pydantic>=2.0
|
|
21
|
+
Requires-Dist: rich>=13.0
|
|
22
|
+
Requires-Dist: tiktoken>=0.7
|
|
23
|
+
Requires-Dist: typer>=0.12
|
|
24
|
+
Provides-Extra: dev
|
|
25
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
|
|
26
|
+
Requires-Dist: pytest-cov>=5; extra == 'dev'
|
|
27
|
+
Requires-Dist: pytest-httpx>=0.30; extra == 'dev'
|
|
28
|
+
Requires-Dist: pytest>=8; extra == 'dev'
|
|
29
|
+
Requires-Dist: ruff>=0.5; extra == 'dev'
|
|
30
|
+
Description-Content-Type: text/markdown
|
|
31
|
+
|
|
32
|
+
# eval-harness
|
|
33
|
+
|
|
34
|
+
A Python CLI that evaluates LLM outputs from production logs against a
|
|
35
|
+
dual-dimension rubric (faithfulness + task completion).
|
|
36
|
+
|
|
37
|
+
## Install
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
pip install -e ".[dev]"
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Quickstart
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
export OPENRIXER_API_KEY=sk-or-...
|
|
47
|
+
eval-harness run path/to/logs.jsonl --judge meta-llama/llama-3.1-8b-instruct:free
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
Input JSONL schema:
|
|
51
|
+
|
|
52
|
+
```json
|
|
53
|
+
{"input": "user prompt", "output": "model response", "reference": "optional ground truth"}
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Commands
|
|
57
|
+
|
|
58
|
+
- `eval-harness run <file>` — ingest, evaluate, and report
|
|
59
|
+
- `eval-harness judges` — list free judge models (cached in `~/.eval-harness/judges.json`)
|
|
60
|
+
- `eval-harness report --run-id UUID` — show a stored run
|
|
61
|
+
- `eval-harness export --run-id UUID --format json|csv --output-file PATH`
|
|
62
|
+
- `eval-harness cache [--stats] [--clear]`
|
|
63
|
+
|
|
64
|
+
Exit codes: `0` all pass, `1` any failures, `2` evaluator error.
|
|
65
|
+
|
|
66
|
+
## CI/CD example
|
|
67
|
+
|
|
68
|
+
```yaml
|
|
69
|
+
- run: pip install llm-eval-harness
|
|
70
|
+
- run: OPENRIXER_API_KEY=${{ secrets.OPENRIXER_API_KEY }} eval-harness run eval/cases.jsonl --pass-threshold 0.7
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## Development
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
pip install -e ".[dev]"
|
|
77
|
+
ruff check src tests && ruff format --check src tests
|
|
78
|
+
pytest tests/ -v --cov=src
|
|
79
|
+
```
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# eval-harness
|
|
2
|
+
|
|
3
|
+
A Python CLI that evaluates LLM outputs from production logs against a
|
|
4
|
+
dual-dimension rubric (faithfulness + task completion).
|
|
5
|
+
|
|
6
|
+
## Install
|
|
7
|
+
|
|
8
|
+
```bash
|
|
9
|
+
pip install -e ".[dev]"
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
## Quickstart
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
export OPENRIXER_API_KEY=sk-or-...
|
|
16
|
+
eval-harness run path/to/logs.jsonl --judge meta-llama/llama-3.1-8b-instruct:free
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
Input JSONL schema:
|
|
20
|
+
|
|
21
|
+
```json
|
|
22
|
+
{"input": "user prompt", "output": "model response", "reference": "optional ground truth"}
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Commands
|
|
26
|
+
|
|
27
|
+
- `eval-harness run <file>` — ingest, evaluate, and report
|
|
28
|
+
- `eval-harness judges` — list free judge models (cached in `~/.eval-harness/judges.json`)
|
|
29
|
+
- `eval-harness report --run-id UUID` — show a stored run
|
|
30
|
+
- `eval-harness export --run-id UUID --format json|csv --output-file PATH`
|
|
31
|
+
- `eval-harness cache [--stats] [--clear]`
|
|
32
|
+
|
|
33
|
+
Exit codes: `0` all pass, `1` any failures, `2` evaluator error.
|
|
34
|
+
|
|
35
|
+
## CI/CD example
|
|
36
|
+
|
|
37
|
+
```yaml
|
|
38
|
+
- run: pip install llm-eval-harness
|
|
39
|
+
- run: OPENRIXER_API_KEY=${{ secrets.OPENRIXER_API_KEY }} eval-harness run eval/cases.jsonl --pass-threshold 0.7
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Development
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
pip install -e ".[dev]"
|
|
46
|
+
ruff check src tests && ruff format --check src tests
|
|
47
|
+
pytest tests/ -v --cov=src
|
|
48
|
+
```
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = [ "hatchling",]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "llm-eval-harness"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "CLI tool that evaluates LLM outputs from production logs against a dual-dimension rubric."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.11"
|
|
11
|
+
keywords = [ "llm", "evaluation", "rubric", "cli", "faithfulness",]
|
|
12
|
+
classifiers = [ "Development Status :: 4 - Beta", "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Topic :: Software Development :: Quality Assurance",]
|
|
13
|
+
dependencies = [ "typer>=0.12", "pydantic>=2.0", "rich>=13.0", "httpx>=0.27", "tiktoken>=0.7",]
|
|
14
|
+
[[project.authors]]
|
|
15
|
+
name = "llm-eval-harness"
|
|
16
|
+
|
|
17
|
+
[project.license]
|
|
18
|
+
text = "MIT"
|
|
19
|
+
|
|
20
|
+
[project.optional-dependencies]
|
|
21
|
+
dev = [ "pytest>=8", "pytest-httpx>=0.30", "pytest-cov>=5", "pytest-asyncio>=0.23", "ruff>=0.5",]
|
|
22
|
+
|
|
23
|
+
[project.scripts]
|
|
24
|
+
eval-harness = "src.cli:app"
|
|
25
|
+
|
|
26
|
+
[project.urls]
|
|
27
|
+
Homepage = "https://github.com/onicarps/eval-harness"
|
|
28
|
+
Repository = "https://github.com/onicarps/eval-harness"
|
|
29
|
+
Issues = "https://github.com/onicarps/eval-harness/issues"
|
|
30
|
+
|
|
31
|
+
[tool.ruff]
|
|
32
|
+
line-length = 100
|
|
33
|
+
target-version = "py311"
|
|
34
|
+
|
|
35
|
+
[tool.ruff.lint]
|
|
36
|
+
select = [ "E", "F", "I", "B", "UP", "W",]
|
|
37
|
+
ignore = [ "E501", "B008", "B017", "UP045", "UP007",]
|
|
38
|
+
|
|
39
|
+
[tool.ruff.format]
|
|
40
|
+
quote-style = "double"
|
|
41
|
+
indent-style = "space"
|
|
42
|
+
|
|
43
|
+
[tool.pytest.ini_options]
|
|
44
|
+
testpaths = [ "tests",]
|
|
45
|
+
addopts = "-ra"
|
|
46
|
+
asyncio_mode = "auto"
|
|
47
|
+
filterwarnings = [ "ignore::DeprecationWarning",]
|
|
48
|
+
|
|
49
|
+
[tool.hatch.build.targets.wheel]
|
|
50
|
+
packages = [ "src",]
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "eval-harness"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "CLI tool that evaluates LLM outputs from production logs against a dual-dimension rubric."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.11"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "Eval Harness Contributors" }]
|
|
13
|
+
keywords = ["llm", "evaluation", "rubric", "cli", "faithfulness"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 4 - Beta",
|
|
16
|
+
"License :: OSI Approved :: MIT License",
|
|
17
|
+
"Programming Language :: Python :: 3",
|
|
18
|
+
"Programming Language :: Python :: 3.11",
|
|
19
|
+
"Programming Language :: Python :: 3.12",
|
|
20
|
+
"Topic :: Software Development :: Quality Assurance",
|
|
21
|
+
]
|
|
22
|
+
dependencies = [
|
|
23
|
+
"typer>=0.12",
|
|
24
|
+
"pydantic>=2.0",
|
|
25
|
+
"rich>=13.0",
|
|
26
|
+
"httpx>=0.27",
|
|
27
|
+
"tiktoken>=0.7",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
[project.optional-dependencies]
|
|
31
|
+
dev = [
|
|
32
|
+
"pytest>=8",
|
|
33
|
+
"pytest-httpx>=0.30",
|
|
34
|
+
"pytest-cov>=5",
|
|
35
|
+
"pytest-asyncio>=0.23",
|
|
36
|
+
"ruff>=0.5",
|
|
37
|
+
]
|
|
38
|
+
|
|
39
|
+
[project.scripts]
|
|
40
|
+
eval-harness = "src.cli:app"
|
|
41
|
+
|
|
42
|
+
[project.urls]
|
|
43
|
+
Homepage = "https://github.com/onicarps/eval-harness"
|
|
44
|
+
Repository = "https://github.com/onicarps/eval-harness"
|
|
45
|
+
Issues = "https://github.com/onicarps/eval-harness/issues"
|
|
46
|
+
|
|
47
|
+
[tool.hatch.build.targets.wheel]
|
|
48
|
+
packages = ["src"]
|
|
49
|
+
|
|
50
|
+
[tool.ruff]
|
|
51
|
+
line-length = 100
|
|
52
|
+
target-version = "py311"
|
|
53
|
+
|
|
54
|
+
[tool.ruff.lint]
|
|
55
|
+
select = ["E", "F", "I", "B", "UP", "W"]
|
|
56
|
+
ignore = ["E501", "B008", "B017", "UP045", "UP007"]
|
|
57
|
+
|
|
58
|
+
[tool.ruff.format]
|
|
59
|
+
quote-style = "double"
|
|
60
|
+
indent-style = "space"
|
|
61
|
+
|
|
62
|
+
[tool.pytest.ini_options]
|
|
63
|
+
testpaths = ["tests"]
|
|
64
|
+
addopts = "-ra"
|
|
65
|
+
asyncio_mode = "auto"
|
|
66
|
+
filterwarnings = ["ignore::DeprecationWarning"]
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
#!/bin/bash
|
|
2
|
+
set -e
|
|
3
|
+
|
|
4
|
+
cd /home/oni/.hermes/profiles/eval-harness/workspace/eval-harness
|
|
5
|
+
|
|
6
|
+
# Load env
|
|
7
|
+
set -a; . /home/oni/.hermes/profiles/eval-harness/.env; set +a  # allexport + source: each assignment is parsed on its own line, so inline comments or glob characters cannot corrupt later variables
|
|
8
|
+
|
|
9
|
+
if [ -n "${FACTORY_API_KEY:-}" ]; then echo "FACTORY_API_KEY set: yes"; else echo "FACTORY_API_KEY set: no"; fi  # report presence only; never print any part of the secret
|
|
10
|
+
|
|
11
|
+
# Write prompt to temp file
|
|
12
|
+
cat > /tmp/droid_prompt.txt << 'PROMPT'
|
|
13
|
+
Read the AGENTS.md and GENERATION_PROMPT.md files in the current directory. Then generate the complete eval-harness Python CLI project as specified. Create ALL files listed in the project structure. Use TDD: write each test file before its corresponding source file. Start with pyproject.toml, then src/__init__.py, src/models.py, tests/test_models.py, src/db.py, tests/test_db.py, src/ingest.py, tests/test_ingest.py, src/evaluator.py, tests/test_evaluator.py, src/reporter.py, tests/test_reporter.py, src/cli.py, tests/test_cli.py, src/judges.py, tests/test_judges.py, .github/workflows/ci.yml, README.md, .gitignore, .env.example, LICENSE, CHANGELOG.md. Make sure pip install -e . works and eval-harness --help shows all commands.
|
|
14
|
+
PROMPT
|
|
15
|
+
|
|
16
|
+
echo "Starting droid..."
|
|
17
|
+
droid exec --auto high -f /tmp/droid_prompt.txt
|