autojudge_evaluate-0.2.1.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- autojudge_evaluate-0.2.1/PKG-INFO +230 -0
- autojudge_evaluate-0.2.1/README.md +194 -0
- autojudge_evaluate-0.2.1/pyproject.toml +61 -0
- autojudge_evaluate-0.2.1/setup.cfg +4 -0
- autojudge_evaluate-0.2.1/src/autojudge_evaluate/__init__.py +32 -0
- autojudge_evaluate-0.2.1/src/autojudge_evaluate/__main__.py +3 -0
- autojudge_evaluate-0.2.1/src/autojudge_evaluate/_commands/__init__.py +0 -0
- autojudge_evaluate-0.2.1/src/autojudge_evaluate/_commands/_eval_result.py +480 -0
- autojudge_evaluate-0.2.1/src/autojudge_evaluate/_commands/_leaderboard.py +156 -0
- autojudge_evaluate-0.2.1/src/autojudge_evaluate/_commands/_meta_evaluate.py +381 -0
- autojudge_evaluate-0.2.1/src/autojudge_evaluate/_commands/_qrel_evaluate.py +174 -0
- autojudge_evaluate-0.2.1/src/autojudge_evaluate/analysis/__init__.py +0 -0
- autojudge_evaluate-0.2.1/src/autojudge_evaluate/analysis/correlation_table.py +971 -0
- autojudge_evaluate-0.2.1/src/autojudge_evaluate/eval_results/__init__.py +25 -0
- autojudge_evaluate-0.2.1/src/autojudge_evaluate/eval_results/builder.py +333 -0
- autojudge_evaluate-0.2.1/src/autojudge_evaluate/eval_results/eval_result.py +403 -0
- autojudge_evaluate-0.2.1/src/autojudge_evaluate/eval_results/io.py +285 -0
- autojudge_evaluate-0.2.1/src/autojudge_evaluate/eval_results/verification.py +382 -0
- autojudge_evaluate-0.2.1/src/autojudge_evaluate/evaluation.py +489 -0
- autojudge_evaluate-0.2.1/src/autojudge_evaluate/nugget_doc_eval.py +402 -0
- autojudge_evaluate-0.2.1/src/autojudge_evaluate/pyircore.py +128 -0
- autojudge_evaluate-0.2.1/src/autojudge_evaluate.egg-info/PKG-INFO +230 -0
- autojudge_evaluate-0.2.1/src/autojudge_evaluate.egg-info/SOURCES.txt +34 -0
- autojudge_evaluate-0.2.1/src/autojudge_evaluate.egg-info/dependency_links.txt +1 -0
- autojudge_evaluate-0.2.1/src/autojudge_evaluate.egg-info/entry_points.txt +2 -0
- autojudge_evaluate-0.2.1/src/autojudge_evaluate.egg-info/requires.txt +15 -0
- autojudge_evaluate-0.2.1/src/autojudge_evaluate.egg-info/top_level.txt +1 -0
- autojudge_evaluate-0.2.1/tests/test_correlations.py +78 -0
- autojudge_evaluate-0.2.1/tests/test_eval_results.py +539 -0
- autojudge_evaluate-0.2.1/tests/test_eval_results_io.py +472 -0
- autojudge_evaluate-0.2.1/tests/test_evaluation.py +126 -0
- autojudge_evaluate-0.2.1/tests/test_evaluation_interface.py +92 -0
- autojudge_evaluate-0.2.1/tests/test_leaderboard_interface.py +89 -0
- autojudge_evaluate-0.2.1/tests/test_nugget_doc_eval.py +94 -0
- autojudge_evaluate-0.2.1/tests/test_pyircore.py +62 -0
- autojudge_evaluate-0.2.1/tests/test_tau_ap_correlations.py +77 -0
autojudge_evaluate-0.2.1/PKG-INFO
@@ -0,0 +1,230 @@
Metadata-Version: 2.4
Name: autojudge-evaluate
Version: 0.2.1
Summary: Evaluation tools for TREC AutoJudge: meta-evaluate, qrel-evaluate, leaderboard statistics
Author: TREC AutoJudge Team
License: MIT
Project-URL: Homepage, https://github.com/trec-autojudge/auto-judge-evaluate
Project-URL: Repository, https://github.com/trec-autojudge/auto-judge-evaluate
Keywords: trec,autojudge,evaluation,ir,correlation
Classifier: Development Status :: 3 - Alpha
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Programming Language :: Python :: 3.14
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Requires-Python: >=3.10
Description-Content-Type: text/markdown
Requires-Dist: autojudge-base
Requires-Dist: click>=8.0
Requires-Dist: pandas>=2.0
Requires-Dist: numpy
Requires-Dist: scipy
Requires-Dist: scikit-learn
Requires-Dist: krippendorff
Requires-Dist: matplotlib
Requires-Dist: pyyaml>=6.0
Requires-Dist: tira>=0.0.192
Provides-Extra: test
Requires-Dist: pytest>=7.0; extra == "test"
Requires-Dist: pytest-cov>=4.0; extra == "test"
Requires-Dist: approvaltests>=16.0.0; extra == "test"

# autojudge-evaluate

Evaluation tools for the TREC AutoJudge framework. Computes leaderboard correlations, inter-annotator agreement on qrels, and leaderboard statistics, and converts evaluation result files between formats.

## Installation

```bash
uv pip install autojudge-evaluate
```

## CLI Commands

All commands are available via `auto-judge-evaluate <command>`.

---

### `meta-evaluate` — Leaderboard correlation

Correlate predicted leaderboards against a ground-truth leaderboard.

```bash
auto-judge-evaluate meta-evaluate \
    --truth-leaderboard truth.eval.jsonl --truth-format jsonl \
    --eval-format tot -i results/*eval.txt \
    --correlation kendall --correlation spearman --correlation tauap_b \
    --truth-measure nugget_coverage --truth-measure f1 \
    --on-missing default \
    --output correlations.jsonl
```

**Key options:**

| Option | Description |
|--------|-------------|
| `--truth-leaderboard FILE` | Ground-truth leaderboard file (required) |
| `--truth-format FMT` | Format: `trec_eval`, `tot`, `ir_measures`, `ranking`, `jsonl` |
| `--eval-format FMT` | Format of input leaderboard files |
| `-i FILE` / positional | Input leaderboard file(s), supports globs. Repeatable |
| `--correlation METHOD` | Correlation method. Repeatable. Supports `kendall`, `pearson`, `spearman`, `tauap_b`, and top-k variants like `kendall@15` |
| `--truth-measure NAME` | Truth measure(s) to correlate against. Repeatable. Omit for all |
| `--eval-measure NAME` | Eval measure(s) to include. Repeatable. Omit for all |
| `--on-missing MODE` | Handle run mismatches: `error`, `warn`, `skip`, `default` (fill 0.0) |
| `--only-shared-topics` | Intersect topics across truth and eval (default: `--all-topics`) |
| `--only-shared-runs` | Intersect runs across truth and eval (default: `--all-runs`) |
| `--truth-drop-aggregate` | Recompute aggregates from per-topic data |
| `--output FILE` | Output `.jsonl` or `.txt` |
| `--out-format FMT` | `jsonl` (default) or `table` |
| `--aggregate` | Report only the mean across all judges |

**Output:** One row per (Judge, TruthMeasure, EvalMeasure) with correlation values as columns.
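
For orientation, a minimal sketch (illustrative only, not the package's internals) of what a leaderboard correlation boils down to: align runs shared by both leaderboards, then correlate the paired scores with `scipy.stats`. Run names and score values here are hypothetical.

```python
# Hypothetical sketch: correlate a predicted leaderboard against truth.
# In the CLI, --on-missing governs runs present on only one side.
from scipy.stats import kendalltau, spearmanr

truth = {"run_A": 0.61, "run_B": 0.54, "run_C": 0.48, "run_D": 0.39}
predicted = {"run_A": 0.72, "run_B": 0.70, "run_C": 0.55, "run_D": 0.60}

runs = sorted(truth.keys() & predicted.keys())  # shared runs only
x = [truth[r] for r in runs]
y = [predicted[r] for r in runs]

tau, _ = kendalltau(x, y)   # Kendall's tau
rho, _ = spearmanr(x, y)    # Spearman's rho
print(tau, rho)
```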

---

### `qrel-evaluate` — Inter-annotator agreement on qrels

Compare predicted relevance judgments (qrels) against truth qrels. Computes set overlap (precision, recall, F1) and agreement metrics (Cohen's Kappa, Krippendorff's Alpha, Jaccard, ARI).

```bash
auto-judge-evaluate qrel-evaluate \
    --truth-qrels official.qrels \
    --predict-qrels predicted.qrels
```

**Key options:**

| Option | Description |
|--------|-------------|
| `--truth-qrels FILE` | Truth qrels in TREC format |
| `--truth-nugget-docs DIR` | Alternative: truth as nugget-docs directory |
| `--predict-qrels FILE` | Predicted qrels in TREC format |
| `--predict-nugget-docs DIR` | Alternative: predicted as nugget-docs directory |
| `--truth-max-grade N` | Grade scale upper bound for truth (default: 1 = binary) |
| `--predict-max-grade N` | Grade scale upper bound for predicted (default: 1) |
| `--truth-relevance-threshold N` | Binary threshold for truth side (default: 1) |
| `--predict-relevance-threshold N` | Binary threshold for predicted side (default: 1) |
| `--on-missing MODE` | Handle topics in only one side: `error`, `warn`, `default`, `skip` |
| `--output FILE` | Output `.jsonl` or `.txt` |

**Output:** Per-topic table with Precision, Recall, F1, Jaccard, Kappa, Krippendorff's Alpha, ARI, plus a MEAN row.
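
As a rough sketch (hypothetical data, not the package's actual code path), the agreement metrics reduce to comparing two binarized label vectors over the shared (topic, doc) pairs, for example with scikit-learn's kappa:

```python
# Hypothetical sketch: binarize graded judgments via the relevance
# thresholds, then score agreement on shared (topic, doc) pairs.
from sklearn.metrics import cohen_kappa_score

truth = {("t1", "d1"): 2, ("t1", "d2"): 0, ("t1", "d3"): 1}
pred = {("t1", "d1"): 1, ("t1", "d2"): 0, ("t1", "d3"): 0}

shared = sorted(truth.keys() & pred.keys())
y_true = [int(truth[k] >= 1) for k in shared]  # --truth-relevance-threshold 1
y_pred = [int(pred[k] >= 1) for k in shared]   # --predict-relevance-threshold 1

print(cohen_kappa_score(y_true, y_pred))
```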

---

### `leaderboard` — Leaderboard statistics

Compute per-run statistics (mean, stderr, stdev, min, max) from leaderboard files.

```bash
auto-judge-evaluate leaderboard \
    --eval-format tot -i results/*eval.txt --sort
```

**Key options:**

| Option | Description |
|--------|-------------|
| `--eval-format FMT` | Input format (required) |
| `-i FILE` / positional | Input file(s), supports globs. Repeatable |
| `--eval-measure NAME` | Filter to specific measures. Repeatable |
| `--sort` | Sort runs by mean score (descending) |
| `--output FILE` | Output `.jsonl` or `.csv` |

**Output:** One row per (Judge, RunID, Measure) with Topics, Mean, Stderr, Stdev, Min, Max.
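
The reported statistics amount to a per-run aggregation over per-topic scores. A minimal pandas sketch with made-up data (illustrative only):

```python
# Hypothetical sketch: per-topic scores grouped by run, aggregated
# into count (Topics), mean, stderr (sem), stdev, min, max.
import pandas as pd

df = pd.DataFrame([
    {"run": "run_A", "topic": "t1", "value": 0.50},
    {"run": "run_A", "topic": "t2", "value": 0.70},
    {"run": "run_B", "topic": "t1", "value": 0.40},
    {"run": "run_B", "topic": "t2", "value": 0.80},
])

stats = df.groupby("run")["value"].agg(
    ["count", "mean", "sem", "std", "min", "max"])
print(stats.sort_values("mean", ascending=False))  # like --sort
```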

---

## Analysis Module

Post-hoc analysis, tables, and plots of `meta-evaluate` output. Produces correlation tables and bar plots with judge categorization.

```bash
python -m autojudge_evaluate.analysis.correlation_table \
    -d ragtime:ragtime-correlations.jsonl \
    -d rag:rag-correlations.jsonl \
    -d dragun:dragun-correlations.jsonl \
    --judges judges.yml \
    --correlation kendall \
    --truth-measure nugget_coverage \
    --format latex \
    --plot-dir plots/
```

**Judge configuration** (`judges.yml`) maps cryptic filenames to display names and categories, with optional plot styling:

```yaml
styles:
  colors:
    pointwise: "#4A90D9"
    pairwise: "#D94A4A"
  hatches:
    gpt-4o: ""
    llama-3: "//"

judges:
  my-judge-A.eval:
    name: System A
    method: pointwise  # category column
    model: gpt-4o      # category column
  my-judge-B.eval:
    name: System B
    method: pairwise
    model: llama-3
```

- **`styles.colors`**: maps category values to fill colors (any matplotlib color string)
- **`styles.hatches`**: maps category values to hatch patterns (`//`, `..`, `xx`, `\\`, etc.). Values combine across categories.
- Color is picked from the first matching category value; hatches are combined from all matches (see the sketch after this list).
- Without a `styles:` section, bars use a sequential grayscale fallback.
- Judges not in the YAML are excluded unless `--all-judges` is passed.
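
A small sketch of how these resolution rules could play out for one judge (illustrative only; the fallback color `"0.7"` is an assumption, and the package's actual logic may differ):

```python
# Illustrative resolution: first matching category value wins for
# color; hatch fragments concatenate across all matching categories.
styles = {
    "colors": {"pointwise": "#4A90D9", "pairwise": "#D94A4A"},
    "hatches": {"gpt-4o": "", "llama-3": "//"},
}
judge = {"name": "System B", "method": "pairwise", "model": "llama-3"}

categories = [v for k, v in judge.items() if k != "name"]
color = next((styles["colors"][c] for c in categories
              if c in styles["colors"]), "0.7")  # hypothetical fallback
hatch = "".join(styles["hatches"].get(c, "") for c in categories)

print(color, repr(hatch))  # #D94A4A '//'
```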

**Key options:** `--format` (github, latex, tsv, plain, html, pipe), `--columns` (correlations or measures), `--summary` (add mean/max rows), `--aggregate` (aggregate across datasets), `--same THRESHOLD` (highlight near-equal values).

---

### `eval-result` — Format conversion and verification

Clean and convert evaluation result files.

```bash
# Convert tot to jsonl
auto-judge-evaluate eval-result data.txt -if tot -of jsonl -o data.jsonl

# Filter to specific runs and topics
auto-judge-evaluate eval-result data.txt -if tot -of jsonl -o filtered.jsonl \
    --filter-runs system_A --filter-runs system_B \
    --filter-topics topic_1
```

**Key options:**

| Option | Description |
|--------|-------------|
| `-if FMT` | Input format: `trec_eval`, `tot`, `ir_measures`, `ranking`, `jsonl` |
| `-of FMT` | Output format (defaults to input format) |
| `-o FILE` | Output file. Omit for a roundtrip test to a temp file |
| `--filter-runs ID` | Keep only these runs. Repeatable |
| `--filter-topics ID` | Keep only these topics. Repeatable |
| `--filter-measures NAME` | Keep only these measures. Repeatable |
| `--compare-aggregates` | Compare file aggregates vs. recomputed from per-topic data |
| `--drop-aggregates` | Drop existing aggregate rows |
| `--recompute-aggregates` | Recompute from per-topic data (implies `--drop-aggregates`) |
| `--roundtrip` / `--no-roundtrip` | Enable/disable roundtrip verification (default: on) |

**Supported formats:**

| Format | Columns |
|--------|---------|
| `trec_eval` | measure topic value (3 cols, run_id from filename) |
| `tot` | run measure topic value (4 cols) |
| `ir_measures` | run topic measure value (4 cols) |
| `ranking` | topic Q0 doc_id rank score run (6 cols) |
| `jsonl` | JSON lines with run_id, topic_id, measure, value |
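
To make the column layouts concrete, a hypothetical converter sketch (not the package's implementation) that turns `tot` rows into `jsonl` records:

```python
# Hypothetical sketch following the layouts above:
# tot rows are "run measure topic value"; jsonl carries the same fields.
import json

tot_lines = [
    "system_A nugget_coverage topic_1 0.42",
    "system_B nugget_coverage topic_1 0.35",
]

with open("data.jsonl", "w") as out:
    for line in tot_lines:
        run, measure, topic, value = line.split()
        record = {"run_id": run, "topic_id": topic,
                  "measure": measure, "value": float(value)}
        out.write(json.dumps(record) + "\n")
```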

autojudge_evaluate-0.2.1/README.md
@@ -0,0 +1,194 @@
# autojudge-evaluate

Evaluation tools for the TREC AutoJudge framework. Computes leaderboard correlations, inter-annotator agreement on qrels, and leaderboard statistics, and converts evaluation result files between formats.

## Installation

```bash
uv pip install autojudge-evaluate
```

## CLI Commands

All commands are available via `auto-judge-evaluate <command>`.

---

### `meta-evaluate` — Leaderboard correlation

Correlate predicted leaderboards against a ground-truth leaderboard.

```bash
auto-judge-evaluate meta-evaluate \
    --truth-leaderboard truth.eval.jsonl --truth-format jsonl \
    --eval-format tot -i results/*eval.txt \
    --correlation kendall --correlation spearman --correlation tauap_b \
    --truth-measure nugget_coverage --truth-measure f1 \
    --on-missing default \
    --output correlations.jsonl
```

**Key options:**

| Option | Description |
|--------|-------------|
| `--truth-leaderboard FILE` | Ground-truth leaderboard file (required) |
| `--truth-format FMT` | Format: `trec_eval`, `tot`, `ir_measures`, `ranking`, `jsonl` |
| `--eval-format FMT` | Format of input leaderboard files |
| `-i FILE` / positional | Input leaderboard file(s), supports globs. Repeatable |
| `--correlation METHOD` | Correlation method. Repeatable. Supports `kendall`, `pearson`, `spearman`, `tauap_b`, and top-k variants like `kendall@15` |
| `--truth-measure NAME` | Truth measure(s) to correlate against. Repeatable. Omit for all |
| `--eval-measure NAME` | Eval measure(s) to include. Repeatable. Omit for all |
| `--on-missing MODE` | Handle run mismatches: `error`, `warn`, `skip`, `default` (fill 0.0) |
| `--only-shared-topics` | Intersect topics across truth and eval (default: `--all-topics`) |
| `--only-shared-runs` | Intersect runs across truth and eval (default: `--all-runs`) |
| `--truth-drop-aggregate` | Recompute aggregates from per-topic data |
| `--output FILE` | Output `.jsonl` or `.txt` |
| `--out-format FMT` | `jsonl` (default) or `table` |
| `--aggregate` | Report only the mean across all judges |

**Output:** One row per (Judge, TruthMeasure, EvalMeasure) with correlation values as columns.
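
For intuition, one output row might look like the following; the exact key names and the values shown here are hypothetical, not taken from the package:

```python
# Hypothetical shape of one correlations.jsonl row; actual keys may differ.
import json

row = {
    "judge": "my-judge-A.eval",
    "truth_measure": "nugget_coverage",
    "eval_measure": "quality",
    "kendall": 0.71, "spearman": 0.85, "tauap_b": 0.66,
}
print(json.dumps(row))
```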

---

### `qrel-evaluate` — Inter-annotator agreement on qrels

Compare predicted relevance judgments (qrels) against truth qrels. Computes set overlap (precision, recall, F1) and agreement metrics (Cohen's Kappa, Krippendorff's Alpha, Jaccard, ARI).

```bash
auto-judge-evaluate qrel-evaluate \
    --truth-qrels official.qrels \
    --predict-qrels predicted.qrels
```

**Key options:**

| Option | Description |
|--------|-------------|
| `--truth-qrels FILE` | Truth qrels in TREC format |
| `--truth-nugget-docs DIR` | Alternative: truth as nugget-docs directory |
| `--predict-qrels FILE` | Predicted qrels in TREC format |
| `--predict-nugget-docs DIR` | Alternative: predicted as nugget-docs directory |
| `--truth-max-grade N` | Grade scale upper bound for truth (default: 1 = binary) |
| `--predict-max-grade N` | Grade scale upper bound for predicted (default: 1) |
| `--truth-relevance-threshold N` | Binary threshold for truth side (default: 1) |
| `--predict-relevance-threshold N` | Binary threshold for predicted side (default: 1) |
| `--on-missing MODE` | Handle topics in only one side: `error`, `warn`, `default`, `skip` |
| `--output FILE` | Output `.jsonl` or `.txt` |

**Output:** Per-topic table with Precision, Recall, F1, Jaccard, Kappa, Krippendorff's Alpha, ARI, plus a MEAN row.
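
Krippendorff's Alpha treats the truth and predicted sides as two annotators rating the same units. A minimal sketch with the `krippendorff` package (hypothetical labels; not the package's internal call):

```python
# Hypothetical sketch: alpha over two "annotators" (truth vs. predicted)
# on the same binarized (topic, doc) units; nominal level for binary data.
import numpy as np
import krippendorff

reliability_data = np.array([
    [1, 0, 1, 0, 1],  # truth labels per unit
    [1, 0, 0, 0, 1],  # predicted labels for the same units
], dtype=float)

print(krippendorff.alpha(reliability_data=reliability_data,
                         level_of_measurement="nominal"))
```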

---

### `leaderboard` — Leaderboard statistics

Compute per-run statistics (mean, stderr, stdev, min, max) from leaderboard files.

```bash
auto-judge-evaluate leaderboard \
    --eval-format tot -i results/*eval.txt --sort
```

**Key options:**

| Option | Description |
|--------|-------------|
| `--eval-format FMT` | Input format (required) |
| `-i FILE` / positional | Input file(s), supports globs. Repeatable |
| `--eval-measure NAME` | Filter to specific measures. Repeatable |
| `--sort` | Sort runs by mean score (descending) |
| `--output FILE` | Output `.jsonl` or `.csv` |

**Output:** One row per (Judge, RunID, Measure) with Topics, Mean, Stderr, Stdev, Min, Max.
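
As a reminder of what the columns mean, a plain-Python sketch on made-up per-topic scores for one run (illustrative only):

```python
# Stderr here is the standard error of the mean: stdev / sqrt(n).
import math
import statistics

scores = [0.50, 0.70, 0.65, 0.55]  # hypothetical per-topic values

mean = statistics.mean(scores)
stdev = statistics.stdev(scores)        # sample standard deviation
stderr = stdev / math.sqrt(len(scores))

print(len(scores), mean, stderr, stdev, min(scores), max(scores))
```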

---

## Analysis Module

Post-hoc analysis, tables, and plots of `meta-evaluate` output. Produces correlation tables and bar plots with judge categorization.

```bash
python -m autojudge_evaluate.analysis.correlation_table \
    -d ragtime:ragtime-correlations.jsonl \
    -d rag:rag-correlations.jsonl \
    -d dragun:dragun-correlations.jsonl \
    --judges judges.yml \
    --correlation kendall \
    --truth-measure nugget_coverage \
    --format latex \
    --plot-dir plots/
```

**Judge configuration** (`judges.yml`) maps cryptic filenames to display names and categories, with optional plot styling:

```yaml
styles:
  colors:
    pointwise: "#4A90D9"
    pairwise: "#D94A4A"
  hatches:
    gpt-4o: ""
    llama-3: "//"

judges:
  my-judge-A.eval:
    name: System A
    method: pointwise  # category column
    model: gpt-4o      # category column
  my-judge-B.eval:
    name: System B
    method: pairwise
    model: llama-3
```

- **`styles.colors`**: maps category values to fill colors (any matplotlib color string)
- **`styles.hatches`**: maps category values to hatch patterns (`//`, `..`, `xx`, `\\`, etc.). Values combine across categories.
- Color is picked from the first matching category value; hatches are combined from all matches.
- Without a `styles:` section, bars use a sequential grayscale fallback.
- Judges not in the YAML are excluded unless `--all-judges` is passed (see the sketch after this list).
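
A sketch of the lookup this mapping implies, using PyYAML on an inline copy of the example above (illustrative; the module's actual loader may differ):

```python
# Map a result filename to its display name and category values;
# unlisted judges return None, mirroring the --all-judges behavior.
import yaml

config = yaml.safe_load("""
judges:
  my-judge-A.eval:
    name: System A
    method: pointwise
    model: gpt-4o
""")

def lookup(filename):
    entry = config["judges"].get(filename)
    if entry is None:
        return None  # excluded unless --all-judges is passed
    categories = {k: v for k, v in entry.items() if k != "name"}
    return entry["name"], categories

print(lookup("my-judge-A.eval"))
```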

**Key options:** `--format` (github, latex, tsv, plain, html, pipe), `--columns` (correlations or measures), `--summary` (add mean/max rows), `--aggregate` (aggregate across datasets), `--same THRESHOLD` (highlight near-equal values).

---

### `eval-result` — Format conversion and verification

Clean and convert evaluation result files.

```bash
# Convert tot to jsonl
auto-judge-evaluate eval-result data.txt -if tot -of jsonl -o data.jsonl

# Filter to specific runs and topics
auto-judge-evaluate eval-result data.txt -if tot -of jsonl -o filtered.jsonl \
    --filter-runs system_A --filter-runs system_B \
    --filter-topics topic_1
```

**Key options:**

| Option | Description |
|--------|-------------|
| `-if FMT` | Input format: `trec_eval`, `tot`, `ir_measures`, `ranking`, `jsonl` |
| `-of FMT` | Output format (defaults to input format) |
| `-o FILE` | Output file. Omit for a roundtrip test to a temp file |
| `--filter-runs ID` | Keep only these runs. Repeatable |
| `--filter-topics ID` | Keep only these topics. Repeatable |
| `--filter-measures NAME` | Keep only these measures. Repeatable |
| `--compare-aggregates` | Compare file aggregates vs. recomputed from per-topic data |
| `--drop-aggregates` | Drop existing aggregate rows |
| `--recompute-aggregates` | Recompute from per-topic data (implies `--drop-aggregates`) |
| `--roundtrip` / `--no-roundtrip` | Enable/disable roundtrip verification (default: on) |

**Supported formats:**

| Format | Columns |
|--------|---------|
| `trec_eval` | measure topic value (3 cols, run_id from filename) |
| `tot` | run measure topic value (4 cols) |
| `ir_measures` | run topic measure value (4 cols) |
| `ranking` | topic Q0 doc_id rank score run (6 cols) |
| `jsonl` | JSON lines with run_id, topic_id, measure, value |
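
The `trec_eval` layout is the only one without an explicit run column. A hypothetical reader sketch (not the package's parser) showing how the run id falls out of the filename:

```python
# Hypothetical reader for the trec_eval layout above: three columns
# (measure, topic, value), with the run id taken from the filename.
from pathlib import Path

def read_trec_eval(path):
    run_id = Path(path).stem  # run id comes from the filename
    records = []
    for line in Path(path).read_text().splitlines():
        measure, topic, value = line.split()
        records.append({"run_id": run_id, "topic_id": topic,
                        "measure": measure, "value": float(value)})
    return records
```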

autojudge_evaluate-0.2.1/pyproject.toml
@@ -0,0 +1,61 @@
```toml
[build-system]
requires = ["setuptools>=61.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "autojudge-evaluate"
version = "v0.2.1"
description = "Evaluation tools for TREC AutoJudge: meta-evaluate, qrel-evaluate, leaderboard statistics"
readme = "README.md"
license = {text = "MIT"}
requires-python = ">=3.10"
authors = [
    {name = "TREC AutoJudge Team"}
]
keywords = ["trec", "autojudge", "evaluation", "ir", "correlation"]
classifiers = [
    "Development Status :: 3 - Alpha",
    "Intended Audience :: Science/Research",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
    "Programming Language :: Python :: 3.14",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
]

dependencies = [
    "autojudge-base",
    "click>=8.0",
    "pandas>=2.0",
    "numpy",
    "scipy",
    "scikit-learn",
    "krippendorff",
    "matplotlib",
    "pyyaml>=6.0",
    "tira>=0.0.192",
]

[project.optional-dependencies]
test = [
    "pytest>=7.0",
    "pytest-cov>=4.0",
    "approvaltests>=16.0.0",
]

[project.scripts]
auto-judge-evaluate = "autojudge_evaluate:main"

[project.urls]
Homepage = "https://github.com/trec-autojudge/auto-judge-evaluate"
Repository = "https://github.com/trec-autojudge/auto-judge-evaluate"

[tool.setuptools.packages.find]
where = ["src"]

[tool.pytest.ini_options]
testpaths = ["tests"]
python_files = ["test_*.py"]
```

autojudge_evaluate-0.2.1/src/autojudge_evaluate/__init__.py
@@ -0,0 +1,32 @@
```python
"""
autojudge_evaluate - Evaluation tools for TREC AutoJudge systems.

Provides:
- evaluation.py: LeaderboardEvaluator for correlation analysis
- eval_results/: EvalResult containers and I/O
- nugget_doc_eval.py: Nugget-document evaluation
- analysis/: Correlation tables and plots
- _commands/: CLI commands (meta-evaluate, qrel-evaluate, leaderboard, eval-result)
"""

__version__ = '0.1.0'

from click import group

from ._commands._meta_evaluate import meta_evaluate
from ._commands._leaderboard import leaderboard
from ._commands._eval_result import eval_result
from ._commands._qrel_evaluate import qrel_evaluate
from .analysis.correlation_table import main as analysis


@group()
def main():
    pass


main.command("meta-evaluate")(meta_evaluate)
main.command("leaderboard")(leaderboard)
main.add_command(eval_result)
main.add_command(qrel_evaluate, "qrel-evaluate")
main.add_command(analysis, "analysis")
```
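
To sanity-check this wiring without installing the console script, one could drive the group in-process with click's test runner (a sketch; assumes `autojudge_evaluate` and its dependencies are importable):

```python
# Sketch: invoke the click group in-process via click's test runner.
from click.testing import CliRunner
from autojudge_evaluate import main

runner = CliRunner()
result = runner.invoke(main, ["--help"])
print(result.output)  # should list the subcommands registered above
```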

File without changes