llm-eval-harness 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. llm_eval_harness-0.1.0/.env.example +1 -0
  2. llm_eval_harness-0.1.0/.github/ISSUE_TEMPLATE/bug_report.md +27 -0
  3. llm_eval_harness-0.1.0/.github/ISSUE_TEMPLATE/feature_request.md +20 -0
  4. llm_eval_harness-0.1.0/.github/workflows/ci.yml +31 -0
  5. llm_eval_harness-0.1.0/.gitignore +21 -0
  6. llm_eval_harness-0.1.0/AGENTS.md +109 -0
  7. llm_eval_harness-0.1.0/CHANGELOG.md +17 -0
  8. llm_eval_harness-0.1.0/GENERATION_PROMPT.md +34 -0
  9. llm_eval_harness-0.1.0/LICENSE +21 -0
  10. llm_eval_harness-0.1.0/PKG-INFO +79 -0
  11. llm_eval_harness-0.1.0/README.md +48 -0
  12. llm_eval_harness-0.1.0/pyproject.toml +50 -0
  13. llm_eval_harness-0.1.0/pyproject.toml.bak +66 -0
  14. llm_eval_harness-0.1.0/run_droid.sh +17 -0
  15. llm_eval_harness-0.1.0/src/__init__.py +3 -0
  16. llm_eval_harness-0.1.0/src/cli.py +303 -0
  17. llm_eval_harness-0.1.0/src/db.py +506 -0
  18. llm_eval_harness-0.1.0/src/evaluator.py +357 -0
  19. llm_eval_harness-0.1.0/src/ingest.py +220 -0
  20. llm_eval_harness-0.1.0/src/judges.py +100 -0
  21. llm_eval_harness-0.1.0/src/models.py +165 -0
  22. llm_eval_harness-0.1.0/src/reporter.py +162 -0
  23. llm_eval_harness-0.1.0/tests/__init__.py +1 -0
  24. llm_eval_harness-0.1.0/tests/conftest.py +82 -0
  25. llm_eval_harness-0.1.0/tests/test_cli.py +168 -0
  26. llm_eval_harness-0.1.0/tests/test_db.py +193 -0
  27. llm_eval_harness-0.1.0/tests/test_evaluator.py +226 -0
  28. llm_eval_harness-0.1.0/tests/test_ingest.py +162 -0
  29. llm_eval_harness-0.1.0/tests/test_judges.py +103 -0
  30. llm_eval_harness-0.1.0/tests/test_models.py +161 -0
  31. llm_eval_harness-0.1.0/tests/test_reporter.py +114 -0
@@ -0,0 +1 @@
1
+ OPENRIXER_API_KEY=sk-or-replace-me
@@ -0,0 +1,27 @@
1
+ ---
2
+ name: Bug report
3
+ about: Report a bug in eval-harness
4
+ title: '[BUG] '
5
+ labels: bug
6
+ assignees: ''
7
+
8
+ ---
9
+
10
+ **Describe the bug**
11
+ A clear description of what the bug is.
12
+
13
+ **To Reproduce**
14
+ Steps to reproduce:
15
+ 1. Run `eval-harness run ...`
16
+ 2. See error
17
+
18
+ **Eval Run ID** (if applicable)
19
+ <!-- Run ID from the database or output -->
20
+
21
+ **Environment**
22
+ - eval-harness version:
23
+ - Python version:
24
+ - OS:
25
+
26
+ **Additional context**
27
+ Add any other context here.
@@ -0,0 +1,20 @@
1
+ ---
2
+ name: Feature request
3
+ about: Suggest a new feature
4
+ title: '[FEAT] '
5
+ labels: enhancement
6
+ assignees: ''
7
+
8
+ ---
9
+
10
+ **Problem**
11
+ What problem are you trying to solve?
12
+
13
+ **Solution**
14
+ What do you want to happen?
15
+
16
+ **Alternatives**
17
+ Any alternative solutions you've considered?
18
+
19
+ **Additional context**
20
+ Add any other context here.
@@ -0,0 +1,31 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ python-version: ["3.11", "3.12"]
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+ - name: Set up Python ${{ matrix.python-version }}
18
+ uses: actions/setup-python@v5
19
+ with:
20
+ python-version: ${{ matrix.python-version }}
21
+ - name: Install
22
+ run: |
23
+ python -m pip install --upgrade pip
24
+ python -m pip install -e ".[dev]"
25
+ - name: Lint
26
+ run: |
27
+ ruff check src tests
28
+ ruff format --check src tests
29
+ - name: Test
30
+ run: |
31
+ pytest tests/ -v --cov=src --cov-report=term-missing
@@ -0,0 +1,21 @@
1
+ __pycache__/
2
+ *.pyc
3
+ *.pyo
4
+ *.egg-info/
5
+ build/
6
+ dist/
7
+ .venv/
8
+ venv/
9
+ .env
10
+ .coverage
11
+ coverage.xml
12
+ htmlcov/
13
+ .pytest_cache/
14
+ .ruff_cache/
15
+ .mypy_cache/
16
+ *.db
17
+ *.db-journal
18
+ *.db-wal
19
+ *.db-shm
20
+ .eval-harness/
21
+ /push.sh
@@ -0,0 +1,109 @@
1
+ # AGENTS.md — Eval Harness Project
2
+
3
+ ## Project
4
+ Python CLI tool that evaluates LLM outputs from production logs against a dual-dimension rubric (faithfulness + task completion).
5
+
6
+ ## Tech Stack
7
+ - Python 3.11+
8
+ - hatchling build backend (pyproject.toml)
9
+ - Typer + Rich (CLI + terminal output)
10
+ - Pydantic v2 (data validation)
11
+ - sqlite3 stdlib (sync, WAL mode)
12
+ - httpx (async HTTP for judge API calls)
13
+ - tiktoken (token estimation)
14
+ - pytest + pytest-httpx + ruff
15
+
16
+ ## Project Structure
17
+ ```
18
+ eval-harness/
19
+ ├── src/
20
+ │ ├── __init__.py
21
+ │ ├── cli.py # Typer app: run, judges, report, export, cache
22
+ │ ├── models.py # Pydantic: EvalRecord, EvalResult, EvalRun, JudgeCacheEntry, RubricTemplate
23
+ │ ├── db.py # SQLite: schema versioning, CRUD, migrations, export
24
+ │ ├── ingest.py # JSONL + CSV + stdin parser: --sample, --since, lenient parsing
25
+ │ ├── evaluator.py # LLM-as-judge: async batch, round-robin fallback, response caching
26
+ │ ├── reporter.py # Rich terminal tables, ASCII histogram, JSON/CSV export
27
+ │ └── judges.py # OpenRouter free model fetcher/cache
28
+ ├── tests/
29
+ │ ├── conftest.py # Fixtures, mock judge responses, temp DB setup
30
+ │ ├── test_models.py
31
+ │ ├── test_db.py
32
+ │ ├── test_ingest.py # VCR cassettes for mock LLM responses
33
+ │ ├── test_evaluator.py # pytest-httpx for mock HTTP
34
+ │ ├── test_reporter.py
35
+ │ ├── test_cli.py # Typer CliRunner, end-to-end
36
+ │ └── test_judges.py
37
+ ├── .github/workflows/ci.yml
38
+ ├── pyproject.toml
39
+ ├── README.md
40
+ ├── .gitignore
41
+ ├── .env.example
42
+ ├── LICENSE (MIT)
43
+ └── CHANGELOG.md
44
+ ```
45
+
46
+ ## Database Schema (v1)
47
+ ```sql
48
+ CREATE TABLE schema_version (version INTEGER PRIMARY KEY, applied_at TEXT NOT NULL);
49
+
50
+ CREATE TABLE eval_runs (
51
+ run_id TEXT PRIMARY KEY, created_at TEXT NOT NULL, config_json TEXT NOT NULL,
52
+ record_count INTEGER DEFAULT 0, rubric_id TEXT DEFAULT 'faithfulness-v1',
53
+ judge_model TEXT, status TEXT DEFAULT 'running', completed_at TEXT,
54
+ mean_score REAL, pass_rate REAL, eval_time_seconds REAL
55
+ );
56
+
57
+ CREATE TABLE eval_records (
58
+ record_id TEXT PRIMARY KEY, run_id TEXT NOT NULL REFERENCES eval_runs(run_id),
59
+ input_text TEXT NOT NULL, output_text TEXT NOT NULL, reference_text TEXT,
60
+ source_file TEXT, metadata_json TEXT, created_at TEXT NOT NULL
61
+ );
62
+ CREATE INDEX idx_records_run ON eval_records(run_id);
63
+
64
+ CREATE TABLE eval_results (
65
+ result_id TEXT PRIMARY KEY, record_id TEXT NOT NULL REFERENCES eval_records(record_id),
66
+ run_id TEXT NOT NULL REFERENCES eval_runs(run_id), rubric_id TEXT DEFAULT 'faithfulness-v1',
67
+ rubric_version TEXT DEFAULT '1.0', faithfulness REAL NOT NULL, task_completion REAL NOT NULL,
68
+ combined_score REAL NOT NULL, pass_fail TEXT NOT NULL, reasoning TEXT DEFAULT '',
69
+ faithfulness_reasoning TEXT DEFAULT '', task_completion_reasoning TEXT DEFAULT '',
70
+ judge_model TEXT NOT NULL, judge_fallbacks INTEGER DEFAULT 0, judge_tried TEXT DEFAULT '[]',
71
+ tokens_estimated INTEGER, evaluated_at TEXT NOT NULL, error TEXT
72
+ );
73
+ CREATE INDEX idx_results_run ON eval_results(run_id);
74
+
75
+ CREATE TABLE judge_cache (
76
+ cache_key TEXT PRIMARY KEY, model_id TEXT NOT NULL, rubric_version TEXT NOT NULL,
77
+ response_json TEXT NOT NULL, created_at TEXT NOT NULL, hits INTEGER DEFAULT 1
78
+ );
79
+ ```
80
+
81
+ ## Input Schema (JSONL)
82
+ ```json
83
+ {"input": "user prompt", "output": "model response", "reference": "optional ground truth"}
84
+ ```
85
+
86
+ ## Judge Output Schema
87
+ ```json
88
+ {"faithfulness": 0.0-1.0, "task_completion": 0.0-1.0, "reasoning": "str", "faithfulness_reasoning": "str", "task_completion_reasoning": "str"}
89
+ ```
90
+ Combined: 0.5 * faithfulness + 0.5 * task_completion. Pass/fail threshold: 0.7.
91
+
92
+ ## CLI Commands
93
+ - `eval-harness run <file>` — primary: ingest + evaluate + report. Flags: --format jsonl|csv, --input-col, --output-col, --reference-col, --sample N, --since DATE, --limit N, --judge MODEL, --no-fallback, --max-fallbacks N, --pass-threshold FLOAT, --output json|table, --output-file PATH, --dry-run, --resume, --timeout SECONDS, --rpm-limit INT, --yes, --verbose, --quiet, --config PATH
94
+ - `eval-harness judges` — list free judge models. Flags: --refresh, --json
95
+ - `eval-harness report --run-id UUID` — show results. Flags: --output json|table|csv, --output-file PATH
96
+ - `eval-harness export --run-id UUID --format json|csv --output-file PATH`
97
+ - `eval-harness cache [--clear] [--stats]`
98
+
99
+ Exit codes: 0=all pass, 1=any failures, 2=evaluator error
100
+
101
+ ## API Key
102
+ Read from OPENRIXER_API_KEY env var (not hardcoded).
103
+
104
+ ## Rules
105
+ - TDD: write failing test BEFORE implementation
106
+ - Type hints everywhere
107
+ - Google-style docstrings
108
+ - ruff check + ruff format before every commit
109
+ - Commit after every task
@@ -0,0 +1,17 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [0.1.0] - 2026-05-29
9
+
10
+ ### Added
11
+ - Typer CLI: `run`, `judges`, `report`, `export`, `cache`
12
+ - Pydantic v2 models: `EvalRecord`, `EvalResult`, `EvalRun`, `JudgeCacheEntry`, `RubricTemplate`, `EvalSummary`
13
+ - SQLite persistence layer with WAL mode and schema migrations
14
+ - JSONL/CSV/stdin ingestion with sampling, since-date filtering, and lenient parsing
15
+ - Async LLM-as-judge evaluator with round-robin fallback and response caching
16
+ - Rich-based reporter with summary table, ASCII histogram, and JSON/CSV export
17
+ - OpenRouter free-model fetcher and on-disk cache
@@ -0,0 +1,34 @@
1
+ Generate the complete eval-harness Python CLI project. Follow the AGENTS.md spec in the current directory exactly.
2
+
3
+ Create ALL files listed in the project structure. Use TDD: write each test file before its corresponding source file.
4
+
5
+ Files to create:
6
+ 1. pyproject.toml (hatchling build, all deps, ruff config, test config)
7
+ 2. .github/workflows/ci.yml (ruff + pytest)
8
+ 3. README.md (install, quickstart, CI/CD example)
9
+ 4. .gitignore
10
+ 5. .env.example
11
+ 6. LICENSE (MIT)
12
+ 7. CHANGELOG.md
13
+ 8. src/__init__.py
14
+ 9. src/models.py (Pydantic v2: EvalRecord, EvalResult, EvalRun, JudgeCacheEntry, RubricTemplate, BUILTIN_RUBRIC_V1, PassFail/RunStatus enums, EvalSummary)
15
+ 10. src/db.py (SQLite WAL mode, schema versioning, full CRUD for all 5 tables, export JSON/CSV)
16
+ 11. src/ingest.py (JSONL + CSV + stdin parser: --sample N, --since DATE, lenient parsing, no auto-detect)
17
+ 12. src/evaluator.py (async batch eval, round-robin fallback, sqlite response cache, token tracking, markdown-wrapped JSON handling)
18
+ 13. src/reporter.py (Rich terminal tables, ASCII histogram, JSON/CSV export, judge usage stats)
19
+ 14. src/cli.py (Typer app: run/judges/report/export/cache commands, all flags from AGENTS.md, exit codes 0/1/2, --dry-run, --resume, progress bar for batches > 10)
20
+ 15. src/judges.py (OpenRouter model fetch/cache, rank by context length, --refresh, --json)
21
+ 16. tests/conftest.py (temp DB per test, sample records, mock judge responses)
22
+ 17. tests/test_models.py, test_db.py, test_ingest.py, test_evaluator.py, test_reporter.py, test_cli.py, test_judges.py
23
+
24
+ Requirements:
25
+ - Python 3.11+, hatchling build backend
26
+ - Dependencies: typer>=0.12, pydantic>=2.0, rich>=13.0, httpx>=0.27, tiktoken>=0.7
27
+ - Dev deps: pytest>=8, pytest-httpx>=0.30, pytest-cov>=5, pytest-asyncio>=0.23, ruff>=0.5
28
+ - OPENRIXER_API_KEY env var (never hardcoded)
29
+ - TDD for every module: test file before source file, each public function tested
30
+ - Type hints everywhere (mypy-compatible)
31
+ - Google-style docstrings for all public functions
32
+ - The project must work: pip install -e . && eval-harness --help
33
+ - Database: sqlite3 stdlib (NOT aiosqlite), WAL mode, foreign keys, schema migration system
34
+ - Judge API: OpenRouter (openrouter.ai/api/v1/chat/completions), round-robin fallback through cached free models
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Eval Harness Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,79 @@
1
+ Metadata-Version: 2.4
2
+ Name: llm-eval-harness
3
+ Version: 0.1.0
4
+ Summary: CLI tool that evaluates LLM outputs from production logs against a dual-dimension rubric.
5
+ Project-URL: Homepage, https://github.com/onicarps/eval-harness
6
+ Project-URL: Repository, https://github.com/onicarps/eval-harness
7
+ Project-URL: Issues, https://github.com/onicarps/eval-harness/issues
8
+ Author: llm-eval-harness
9
+ License: MIT
10
+ License-File: LICENSE
11
+ Keywords: cli,evaluation,faithfulness,llm,rubric
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Software Development :: Quality Assurance
18
+ Requires-Python: >=3.11
19
+ Requires-Dist: httpx>=0.27
20
+ Requires-Dist: pydantic>=2.0
21
+ Requires-Dist: rich>=13.0
22
+ Requires-Dist: tiktoken>=0.7
23
+ Requires-Dist: typer>=0.12
24
+ Provides-Extra: dev
25
+ Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
26
+ Requires-Dist: pytest-cov>=5; extra == 'dev'
27
+ Requires-Dist: pytest-httpx>=0.30; extra == 'dev'
28
+ Requires-Dist: pytest>=8; extra == 'dev'
29
+ Requires-Dist: ruff>=0.5; extra == 'dev'
30
+ Description-Content-Type: text/markdown
31
+
32
+ # eval-harness
33
+
34
+ A Python CLI that evaluates LLM outputs from production logs against a
35
+ dual-dimension rubric (faithfulness + task completion).
36
+
37
+ ## Install
38
+
39
+ ```bash
40
+ pip install -e ".[dev]"
41
+ ```
42
+
43
+ ## Quickstart
44
+
45
+ ```bash
46
+ export OPENRIXER_API_KEY=sk-or-...
47
+ eval-harness run path/to/logs.jsonl --judge meta-llama/llama-3.1-8b-instruct:free
48
+ ```
49
+
50
+ Input JSONL schema:
51
+
52
+ ```json
53
+ {"input": "user prompt", "output": "model response", "reference": "optional ground truth"}
54
+ ```
55
+
56
+ ## Commands
57
+
58
+ - `eval-harness run <file>` — ingest, evaluate, and report
59
+ - `eval-harness judges` — list free judge models (cached in `~/.eval-harness/judges.json`)
60
+ - `eval-harness report --run-id UUID` — show a stored run
61
+ - `eval-harness export --run-id UUID --format json|csv --output-file PATH`
62
+ - `eval-harness cache [--stats] [--clear]`
63
+
64
+ Exit codes: `0` all pass, `1` any failures, `2` evaluator error.
65
+
66
+ ## CI/CD example
67
+
68
+ ```yaml
69
+ - run: pip install llm-eval-harness
70
+ - run: OPENRIXER_API_KEY=${{ secrets.OPENRIXER_API_KEY }} eval-harness run eval/cases.jsonl --pass-threshold 0.7
71
+ ```
72
+
73
+ ## Development
74
+
75
+ ```bash
76
+ pip install -e ".[dev]"
77
+ ruff check src tests && ruff format --check src tests
78
+ pytest tests/ -v --cov=src
79
+ ```
@@ -0,0 +1,48 @@
1
+ # eval-harness
2
+
3
+ A Python CLI that evaluates LLM outputs from production logs against a
4
+ dual-dimension rubric (faithfulness + task completion).
5
+
6
+ ## Install
7
+
8
+ ```bash
9
+ pip install -e ".[dev]"
10
+ ```
11
+
12
+ ## Quickstart
13
+
14
+ ```bash
15
+ export OPENRIXER_API_KEY=sk-or-...
16
+ eval-harness run path/to/logs.jsonl --judge meta-llama/llama-3.1-8b-instruct:free
17
+ ```
18
+
19
+ Input JSONL schema:
20
+
21
+ ```json
22
+ {"input": "user prompt", "output": "model response", "reference": "optional ground truth"}
23
+ ```
24
+
25
+ ## Commands
26
+
27
+ - `eval-harness run <file>` — ingest, evaluate, and report
28
+ - `eval-harness judges` — list free judge models (cached in `~/.eval-harness/judges.json`)
29
+ - `eval-harness report --run-id UUID` — show a stored run
30
+ - `eval-harness export --run-id UUID --format json|csv --output-file PATH`
31
+ - `eval-harness cache [--stats] [--clear]`
32
+
33
+ Exit codes: `0` all pass, `1` any failures, `2` evaluator error.
34
+
35
+ ## CI/CD example
36
+
37
+ ```yaml
38
+ - run: pip install llm-eval-harness
39
+ - run: OPENRIXER_API_KEY=${{ secrets.OPENRIXER_API_KEY }} eval-harness run eval/cases.jsonl --pass-threshold 0.7
40
+ ```
41
+
42
+ ## Development
43
+
44
+ ```bash
45
+ pip install -e ".[dev]"
46
+ ruff check src tests && ruff format --check src tests
47
+ pytest tests/ -v --cov=src
48
+ ```
@@ -0,0 +1,50 @@
1
+ [build-system]
2
+ requires = [ "hatchling",]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "llm-eval-harness"
7
+ version = "0.1.0"
8
+ description = "CLI tool that evaluates LLM outputs from production logs against a dual-dimension rubric."
9
+ readme = "README.md"
10
+ requires-python = ">=3.11"
11
+ keywords = [ "llm", "evaluation", "rubric", "cli", "faithfulness",]
12
+ classifiers = [ "Development Status :: 4 - Beta", "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Topic :: Software Development :: Quality Assurance",]
13
+ dependencies = [ "typer>=0.12", "pydantic>=2.0", "rich>=13.0", "httpx>=0.27", "tiktoken>=0.7",]
14
+ [[project.authors]]
15
+ name = "llm-eval-harness"
16
+
17
+ [project.license]
18
+ text = "MIT"
19
+
20
+ [project.optional-dependencies]
21
+ dev = [ "pytest>=8", "pytest-httpx>=0.30", "pytest-cov>=5", "pytest-asyncio>=0.23", "ruff>=0.5",]
22
+
23
+ [project.scripts]
24
+ eval-harness = "src.cli:app"
25
+
26
+ [project.urls]
27
+ Homepage = "https://github.com/onicarps/eval-harness"
28
+ Repository = "https://github.com/onicarps/eval-harness"
29
+ Issues = "https://github.com/onicarps/eval-harness/issues"
30
+
31
+ [tool.ruff]
32
+ line-length = 100
33
+ target-version = "py311"
34
+
35
+ [tool.ruff.lint]
36
+ select = [ "E", "F", "I", "B", "UP", "W",]
37
+ ignore = [ "E501", "B008", "B017", "UP045", "UP007",]
38
+
39
+ [tool.ruff.format]
40
+ quote-style = "double"
41
+ indent-style = "space"
42
+
43
+ [tool.pytest.ini_options]
44
+ testpaths = [ "tests",]
45
+ addopts = "-ra"
46
+ asyncio_mode = "auto"
47
+ filterwarnings = [ "ignore::DeprecationWarning",]
48
+
49
+ [tool.hatch.build.targets.wheel]
50
+ packages = [ "src",]
@@ -0,0 +1,66 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "eval-harness"
7
+ version = "0.1.0"
8
+ description = "CLI tool that evaluates LLM outputs from production logs against a dual-dimension rubric."
9
+ readme = "README.md"
10
+ requires-python = ">=3.11"
11
+ license = { text = "MIT" }
12
+ authors = [{ name = "Eval Harness Contributors" }]
13
+ keywords = ["llm", "evaluation", "rubric", "cli", "faithfulness"]
14
+ classifiers = [
15
+ "Development Status :: 4 - Beta",
16
+ "License :: OSI Approved :: MIT License",
17
+ "Programming Language :: Python :: 3",
18
+ "Programming Language :: Python :: 3.11",
19
+ "Programming Language :: Python :: 3.12",
20
+ "Topic :: Software Development :: Quality Assurance",
21
+ ]
22
+ dependencies = [
23
+ "typer>=0.12",
24
+ "pydantic>=2.0",
25
+ "rich>=13.0",
26
+ "httpx>=0.27",
27
+ "tiktoken>=0.7",
28
+ ]
29
+
30
+ [project.optional-dependencies]
31
+ dev = [
32
+ "pytest>=8",
33
+ "pytest-httpx>=0.30",
34
+ "pytest-cov>=5",
35
+ "pytest-asyncio>=0.23",
36
+ "ruff>=0.5",
37
+ ]
38
+
39
+ [project.scripts]
40
+ eval-harness = "src.cli:app"
41
+
42
+ [project.urls]
43
+ Homepage = "https://github.com/onicarps/eval-harness"
44
+ Repository = "https://github.com/onicarps/eval-harness"
45
+ Issues = "https://github.com/onicarps/eval-harness/issues"
46
+
47
+ [tool.hatch.build.targets.wheel]
48
+ packages = ["src"]
49
+
50
+ [tool.ruff]
51
+ line-length = 100
52
+ target-version = "py311"
53
+
54
+ [tool.ruff.lint]
55
+ select = ["E", "F", "I", "B", "UP", "W"]
56
+ ignore = ["E501", "B008", "B017", "UP045", "UP007"]
57
+
58
+ [tool.ruff.format]
59
+ quote-style = "double"
60
+ indent-style = "space"
61
+
62
+ [tool.pytest.ini_options]
63
+ testpaths = ["tests"]
64
+ addopts = "-ra"
65
+ asyncio_mode = "auto"
66
+ filterwarnings = ["ignore::DeprecationWarning"]
@@ -0,0 +1,17 @@
1
+ #!/bin/bash
2
+ set -e
3
+
4
+ cd /home/oni/.hermes/profiles/eval-harness/workspace/eval-harness
5
+
6
+ # Load env
7
+ eval $(grep -v '^#' /home/oni/.hermes/profiles/eval-harness/.env | grep -v '^$' | sed 's/^/export /')
8
+
9
+ echo "FACTORY_API_KEY set: ${FACTORY_API_KEY:0:8}..."
10
+
11
+ # Write prompt to temp file
12
+ cat > /tmp/droid_prompt.txt << 'PROMPT'
13
+ Read the AGENTS.md and GENERATION_PROMPT.md files in the current directory. Then generate the complete eval-harness Python CLI project as specified. Create ALL files listed in the project structure. Use TDD: write each test file before its corresponding source file. Start with pyproject.toml, then src/__init__.py, src/models.py, tests/test_models.py, src/db.py, tests/test_db.py, src/ingest.py, tests/test_ingest.py, src/evaluator.py, tests/test_evaluator.py, src/reporter.py, tests/test_reporter.py, src/cli.py, tests/test_cli.py, src/judges.py, tests/test_judges.py, .github/workflows/ci.yml, README.md, .gitignore, .env.example, LICENSE, CHANGELOG.md. Make sure pip install -e . works and eval-harness --help shows all commands.
14
+ PROMPT
15
+
16
+ echo "Starting droid..."
17
+ droid exec --auto high -f /tmp/droid_prompt.txt
@@ -0,0 +1,3 @@
1
+ """Eval Harness: CLI tool to evaluate LLM outputs against a dual-dimension rubric."""
2
+
3
+ __version__ = "0.1.0"