autojudge-evaluate 0.2.1 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. autojudge_evaluate-0.2.1/PKG-INFO +230 -0
  2. autojudge_evaluate-0.2.1/README.md +194 -0
  3. autojudge_evaluate-0.2.1/pyproject.toml +61 -0
  4. autojudge_evaluate-0.2.1/setup.cfg +4 -0
  5. autojudge_evaluate-0.2.1/src/autojudge_evaluate/__init__.py +32 -0
  6. autojudge_evaluate-0.2.1/src/autojudge_evaluate/__main__.py +3 -0
  7. autojudge_evaluate-0.2.1/src/autojudge_evaluate/_commands/__init__.py +0 -0
  8. autojudge_evaluate-0.2.1/src/autojudge_evaluate/_commands/_eval_result.py +480 -0
  9. autojudge_evaluate-0.2.1/src/autojudge_evaluate/_commands/_leaderboard.py +156 -0
  10. autojudge_evaluate-0.2.1/src/autojudge_evaluate/_commands/_meta_evaluate.py +381 -0
  11. autojudge_evaluate-0.2.1/src/autojudge_evaluate/_commands/_qrel_evaluate.py +174 -0
  12. autojudge_evaluate-0.2.1/src/autojudge_evaluate/analysis/__init__.py +0 -0
  13. autojudge_evaluate-0.2.1/src/autojudge_evaluate/analysis/correlation_table.py +971 -0
  14. autojudge_evaluate-0.2.1/src/autojudge_evaluate/eval_results/__init__.py +25 -0
  15. autojudge_evaluate-0.2.1/src/autojudge_evaluate/eval_results/builder.py +333 -0
  16. autojudge_evaluate-0.2.1/src/autojudge_evaluate/eval_results/eval_result.py +403 -0
  17. autojudge_evaluate-0.2.1/src/autojudge_evaluate/eval_results/io.py +285 -0
  18. autojudge_evaluate-0.2.1/src/autojudge_evaluate/eval_results/verification.py +382 -0
  19. autojudge_evaluate-0.2.1/src/autojudge_evaluate/evaluation.py +489 -0
  20. autojudge_evaluate-0.2.1/src/autojudge_evaluate/nugget_doc_eval.py +402 -0
  21. autojudge_evaluate-0.2.1/src/autojudge_evaluate/pyircore.py +128 -0
  22. autojudge_evaluate-0.2.1/src/autojudge_evaluate.egg-info/PKG-INFO +230 -0
  23. autojudge_evaluate-0.2.1/src/autojudge_evaluate.egg-info/SOURCES.txt +34 -0
  24. autojudge_evaluate-0.2.1/src/autojudge_evaluate.egg-info/dependency_links.txt +1 -0
  25. autojudge_evaluate-0.2.1/src/autojudge_evaluate.egg-info/entry_points.txt +2 -0
  26. autojudge_evaluate-0.2.1/src/autojudge_evaluate.egg-info/requires.txt +15 -0
  27. autojudge_evaluate-0.2.1/src/autojudge_evaluate.egg-info/top_level.txt +1 -0
  28. autojudge_evaluate-0.2.1/tests/test_correlations.py +78 -0
  29. autojudge_evaluate-0.2.1/tests/test_eval_results.py +539 -0
  30. autojudge_evaluate-0.2.1/tests/test_eval_results_io.py +472 -0
  31. autojudge_evaluate-0.2.1/tests/test_evaluation.py +126 -0
  32. autojudge_evaluate-0.2.1/tests/test_evaluation_interface.py +92 -0
  33. autojudge_evaluate-0.2.1/tests/test_leaderboard_interface.py +89 -0
  34. autojudge_evaluate-0.2.1/tests/test_nugget_doc_eval.py +94 -0
  35. autojudge_evaluate-0.2.1/tests/test_pyircore.py +62 -0
  36. autojudge_evaluate-0.2.1/tests/test_tau_ap_correlations.py +77 -0
@@ -0,0 +1,230 @@
+ Metadata-Version: 2.4
+ Name: autojudge-evaluate
+ Version: 0.2.1
+ Summary: Evaluation tools for TREC AutoJudge: meta-evaluate, qrel-evaluate, leaderboard statistics
+ Author: TREC AutoJudge Team
+ License: MIT
+ Project-URL: Homepage, https://github.com/trec-autojudge/auto-judge-evaluate
+ Project-URL: Repository, https://github.com/trec-autojudge/auto-judge-evaluate
+ Keywords: trec,autojudge,evaluation,ir,correlation
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Intended Audience :: Science/Research
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Classifier: Programming Language :: Python :: 3.14
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ Requires-Dist: autojudge-base
+ Requires-Dist: click>=8.0
+ Requires-Dist: pandas>=2.0
+ Requires-Dist: numpy
+ Requires-Dist: scipy
+ Requires-Dist: scikit-learn
+ Requires-Dist: krippendorff
+ Requires-Dist: matplotlib
+ Requires-Dist: pyyaml>=6.0
+ Requires-Dist: tira>=0.0.192
+ Provides-Extra: test
+ Requires-Dist: pytest>=7.0; extra == "test"
+ Requires-Dist: pytest-cov>=4.0; extra == "test"
+ Requires-Dist: approvaltests>=16.0.0; extra == "test"
+
+ # autojudge-evaluate
+
+ Evaluation tools for the TREC AutoJudge framework. Computes leaderboard correlations, inter-annotator agreement on qrels, and leaderboard statistics, and converts between evaluation result file formats.
+
+ ## Installation
+
+ ```bash
+ uv pip install autojudge-evaluate
+ ```
+
+ ## CLI Commands
+
+ All commands are available via `auto-judge-evaluate <command>`.
+
+ ---
+
+ ### `meta-evaluate` — Leaderboard correlation
+
+ Correlate predicted leaderboards against a ground-truth leaderboard.
+
+ ```bash
+ auto-judge-evaluate meta-evaluate \
+     --truth-leaderboard truth.eval.jsonl --truth-format jsonl \
+     --eval-format tot -i results/*eval.txt \
+     --correlation kendall --correlation spearman --correlation tauap_b \
+     --truth-measure nugget_coverage --truth-measure f1 \
+     --on-missing default \
+     --output correlations.jsonl
+ ```
+
+ **Key options:**
+
+ | Option | Description |
+ |--------|-------------|
+ | `--truth-leaderboard FILE` | Ground-truth leaderboard file (required) |
+ | `--truth-format FMT` | Format: `trec_eval`, `tot`, `ir_measures`, `ranking`, `jsonl` |
+ | `--eval-format FMT` | Format of input leaderboard files |
+ | `-i FILE` / positional | Input leaderboard file(s), supports globs. Repeatable |
+ | `--correlation METHOD` | Correlation method. Repeatable. Supports `kendall`, `pearson`, `spearman`, `tauap_b`, and top-k variants like `kendall@15` |
+ | `--truth-measure NAME` | Truth measure(s) to correlate against. Repeatable. Omit for all |
+ | `--eval-measure NAME` | Eval measure(s) to include. Repeatable. Omit for all |
+ | `--on-missing MODE` | Handle run mismatches: `error`, `warn`, `skip`, `default` (fill 0.0) |
+ | `--only-shared-topics` | Intersect topics across truth and eval (default: `--all-topics`) |
+ | `--only-shared-runs` | Intersect runs across truth and eval (default: `--all-runs`) |
+ | `--truth-drop-aggregate` | Recompute aggregates from per-topic data |
+ | `--output FILE` | Output `.jsonl` or `.txt` |
+ | `--out-format FMT` | `jsonl` (default) or `table` |
+ | `--aggregate` | Report only the mean across all judges |
+
+ **Output:** One row per (Judge, TruthMeasure, EvalMeasure) with correlation values as columns.
+
+ ---
+
+ ### `qrel-evaluate` — Inter-annotator agreement on qrels
+
+ Compare predicted relevance judgments (qrels) against truth qrels. Computes set overlap (precision, recall, F1) and agreement metrics (Cohen's Kappa, Krippendorff's Alpha, Jaccard, ARI).
+
+ ```bash
+ auto-judge-evaluate qrel-evaluate \
+     --truth-qrels official.qrels \
+     --predict-qrels predicted.qrels
+ ```
+
+ **Key options:**
+
+ | Option | Description |
+ |--------|-------------|
+ | `--truth-qrels FILE` | Truth qrels in TREC format |
+ | `--truth-nugget-docs DIR` | Alternative: truth as nugget-docs directory |
+ | `--predict-qrels FILE` | Predicted qrels in TREC format |
+ | `--predict-nugget-docs DIR` | Alternative: predicted as nugget-docs directory |
+ | `--truth-max-grade N` | Grade scale upper bound for truth (default: 1 = binary) |
+ | `--predict-max-grade N` | Grade scale upper bound for predicted (default: 1) |
+ | `--truth-relevance-threshold N` | Binary threshold for truth side (default: 1) |
+ | `--predict-relevance-threshold N` | Binary threshold for predicted side (default: 1) |
+ | `--on-missing MODE` | Handle topics in only one side: `error`, `warn`, `default`, `skip` |
+ | `--output FILE` | Output `.jsonl` or `.txt` |
+
+ **Output:** Per-topic table with Precision, Recall, F1, Jaccard, Kappa, Krippendorff's Alpha, ARI, plus a MEAN row.
+
+ ---
+
+ ### `leaderboard` — Leaderboard statistics
+
+ Compute per-run statistics (mean, stderr, stdev, min, max) from leaderboard files.
+
+ ```bash
+ auto-judge-evaluate leaderboard \
+     --eval-format tot -i results/*eval.txt --sort
+ ```
+
+ **Key options:**
+
+ | Option | Description |
+ |--------|-------------|
+ | `--eval-format FMT` | Input format (required) |
+ | `-i FILE` / positional | Input file(s), supports globs. Repeatable |
+ | `--eval-measure NAME` | Filter to specific measures. Repeatable |
+ | `--sort` | Sort runs by mean score (descending) |
+ | `--output FILE` | Output `.jsonl` or `.csv` |
+
+ **Output:** One row per (Judge, RunID, Measure) with Topics, Mean, Stderr, Stdev, Min, Max.
+
+ ---
+
+ ## Analysis Module
+
+ Post-hoc analysis of `meta-evaluate` output: produces correlation tables and bar plots with judge categorization.
+
+ ```bash
+ python -m autojudge_evaluate.analysis.correlation_table \
+     -d ragtime:ragtime-correlations.jsonl \
+     -d rag:rag-correlations.jsonl \
+     -d dragun:dragun-correlations.jsonl \
+     --judges judges.yml \
+     --correlation kendall \
+     --truth-measure nugget_coverage \
+     --format latex \
+     --plot-dir plots/
+ ```
+
+ **Judge configuration** (`judges.yml`) maps cryptic filenames to display names and categories, with optional plot styling:
+
+ ```yaml
+ styles:
+   colors:
+     pointwise: "#4A90D9"
+     pairwise: "#D94A4A"
+   hatches:
+     gpt-4o: ""
+     llama-3: "//"
+
+ judges:
+   my-judge-A.eval:
+     name: System A
+     method: pointwise  # category column
+     model: gpt-4o      # category column
+   my-judge-B.eval:
+     name: System B
+     method: pairwise
+     model: llama-3
+ ```
+
+ - **`styles.colors`**: maps category values to fill colors (any matplotlib color string).
+ - **`styles.hatches`**: maps category values to hatch patterns (`//`, `..`, `xx`, `\\`, etc.); patterns combine across categories.
+ - Color is picked from the first matching category value; hatches are combined from all matches.
+ - Without a `styles:` section, bars use a sequential grayscale fallback.
+ - Judges not in the YAML are excluded unless `--all-judges` is passed.
+
+ **Key options:** `--format` (github, latex, tsv, plain, html, pipe), `--columns` (correlations or measures), `--summary` (add mean/max rows), `--aggregate` (aggregate across datasets), `--same THRESHOLD` (highlight near-equal values).
+
+ ---
+
+ ### `eval-result` — Format conversion and verification
+
+ Clean and convert evaluation result files.
+
+ ```bash
+ # Convert tot to jsonl
+ auto-judge-evaluate eval-result data.txt -if tot -of jsonl -o data.jsonl
+
+ # Filter to specific runs and topics
+ auto-judge-evaluate eval-result data.txt -if tot -of jsonl -o filtered.jsonl \
+     --filter-runs system_A --filter-runs system_B \
+     --filter-topics topic_1
+ ```
+
+ **Key options:**
+
+ | Option | Description |
+ |--------|-------------|
+ | `-if FMT` | Input format: `trec_eval`, `tot`, `ir_measures`, `ranking`, `jsonl` |
+ | `-of FMT` | Output format (defaults to input format) |
+ | `-o FILE` | Output file. Omit for roundtrip test to temp file |
+ | `--filter-runs ID` | Keep only these runs. Repeatable |
+ | `--filter-topics ID` | Keep only these topics. Repeatable |
+ | `--filter-measures NAME` | Keep only these measures. Repeatable |
+ | `--compare-aggregates` | Compare file aggregates vs recomputed from per-topic data |
+ | `--drop-aggregates` | Drop existing aggregate rows |
+ | `--recompute-aggregates` | Recompute from per-topic data (implies `--drop-aggregates`) |
+ | `--roundtrip` / `--no-roundtrip` | Enable/disable roundtrip verification (default: on) |
+
+ **Supported formats:**
+
+ | Format | Columns |
+ |--------|---------|
+ | `trec_eval` | measure topic value (3 cols, run_id from filename) |
+ | `tot` | run measure topic value (4 cols) |
+ | `ir_measures` | run topic measure value (4 cols) |
+ | `ranking` | topic Q0 doc_id rank score run (6 cols) |
+ | `jsonl` | JSON lines with run_id, topic_id, measure, value |
@@ -0,0 +1,194 @@
+ # autojudge-evaluate
+
+ Evaluation tools for the TREC AutoJudge framework. Computes leaderboard correlations, inter-annotator agreement on qrels, and leaderboard statistics, and converts between evaluation result file formats.
+
+ ## Installation
+
+ ```bash
+ uv pip install autojudge-evaluate
+ ```
+
+ ## CLI Commands
+
+ All commands are available via `auto-judge-evaluate <command>`.
+
+ ---
+
+ ### `meta-evaluate` — Leaderboard correlation
+
+ Correlate predicted leaderboards against a ground-truth leaderboard.
+
+ ```bash
+ auto-judge-evaluate meta-evaluate \
+     --truth-leaderboard truth.eval.jsonl --truth-format jsonl \
+     --eval-format tot -i results/*eval.txt \
+     --correlation kendall --correlation spearman --correlation tauap_b \
+     --truth-measure nugget_coverage --truth-measure f1 \
+     --on-missing default \
+     --output correlations.jsonl
+ ```
+
+ **Key options:**
+
+ | Option | Description |
+ |--------|-------------|
+ | `--truth-leaderboard FILE` | Ground-truth leaderboard file (required) |
+ | `--truth-format FMT` | Format: `trec_eval`, `tot`, `ir_measures`, `ranking`, `jsonl` |
+ | `--eval-format FMT` | Format of input leaderboard files |
+ | `-i FILE` / positional | Input leaderboard file(s), supports globs. Repeatable |
+ | `--correlation METHOD` | Correlation method. Repeatable. Supports `kendall`, `pearson`, `spearman`, `tauap_b`, and top-k variants like `kendall@15` |
+ | `--truth-measure NAME` | Truth measure(s) to correlate against. Repeatable. Omit for all |
+ | `--eval-measure NAME` | Eval measure(s) to include. Repeatable. Omit for all |
+ | `--on-missing MODE` | Handle run mismatches: `error`, `warn`, `skip`, `default` (fill 0.0) |
+ | `--only-shared-topics` | Intersect topics across truth and eval (default: `--all-topics`) |
+ | `--only-shared-runs` | Intersect runs across truth and eval (default: `--all-runs`) |
+ | `--truth-drop-aggregate` | Recompute aggregates from per-topic data |
+ | `--output FILE` | Output `.jsonl` or `.txt` |
+ | `--out-format FMT` | `jsonl` (default) or `table` |
+ | `--aggregate` | Report only the mean across all judges |
+
+ **Output:** One row per (Judge, TruthMeasure, EvalMeasure) with correlation values as columns.
+
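+ For intuition, the underlying computation is a rank correlation between the per-run scores of the two leaderboards. A minimal sketch with `scipy` (run names and scores here are made up; the real command additionally handles run/topic alignment, `--on-missing`, `tauap_b`, and top-k truncation):
+
+ ```python
+ # Minimal sketch of what a leaderboard correlation computes (hypothetical data).
+ from scipy.stats import kendalltau, spearmanr
+
+ truth     = {"system_A": 0.62, "system_B": 0.55, "system_C": 0.48}
+ predicted = {"system_A": 0.71, "system_B": 0.58, "system_C": 0.64}
+
+ runs = sorted(truth)  # align on shared run ids; --on-missing governs mismatches
+ x = [truth[r] for r in runs]
+ y = [predicted[r] for r in runs]
+
+ print("kendall: ", kendalltau(x, y).statistic)
+ print("spearman:", spearmanr(x, y).statistic)
+ ```
+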
+ ---
+
+ ### `qrel-evaluate` — Inter-annotator agreement on qrels
+
+ Compare predicted relevance judgments (qrels) against truth qrels. Computes set overlap (precision, recall, F1) and agreement metrics (Cohen's Kappa, Krippendorff's Alpha, Jaccard, ARI).
+
+ ```bash
+ auto-judge-evaluate qrel-evaluate \
+     --truth-qrels official.qrels \
+     --predict-qrels predicted.qrels
+ ```
+
+ **Key options:**
+
+ | Option | Description |
+ |--------|-------------|
+ | `--truth-qrels FILE` | Truth qrels in TREC format |
+ | `--truth-nugget-docs DIR` | Alternative: truth as nugget-docs directory |
+ | `--predict-qrels FILE` | Predicted qrels in TREC format |
+ | `--predict-nugget-docs DIR` | Alternative: predicted as nugget-docs directory |
+ | `--truth-max-grade N` | Grade scale upper bound for truth (default: 1 = binary) |
+ | `--predict-max-grade N` | Grade scale upper bound for predicted (default: 1) |
+ | `--truth-relevance-threshold N` | Binary threshold for truth side (default: 1) |
+ | `--predict-relevance-threshold N` | Binary threshold for predicted side (default: 1) |
+ | `--on-missing MODE` | Handle topics in only one side: `error`, `warn`, `default`, `skip` |
+ | `--output FILE` | Output `.jsonl` or `.txt` |
+
+ **Output:** Per-topic table with Precision, Recall, F1, Jaccard, Kappa, Krippendorff's Alpha, ARI, plus a MEAN row.
+
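+ Both agreement libraries are declared dependencies, so a rough sketch of the agreement side on a pair of already-aligned binary label vectors might look like this (labels are made up; the real command aligns (topic, doc) pairs and applies the threshold options first):
+
+ ```python
+ # Rough sketch of the agreement metrics on pre-aligned binary labels (hypothetical data).
+ from sklearn.metrics import cohen_kappa_score, adjusted_rand_score
+ import krippendorff
+
+ truth_labels   = [1, 0, 1, 1, 0, 0, 1]
+ predict_labels = [1, 0, 0, 1, 0, 1, 1]
+
+ print("kappa:", cohen_kappa_score(truth_labels, predict_labels))
+ print("ari:  ", adjusted_rand_score(truth_labels, predict_labels))
+ print("alpha:", krippendorff.alpha(reliability_data=[truth_labels, predict_labels],
+                                    level_of_measurement="nominal"))
+ ```
+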
+ ---
+
+ ### `leaderboard` — Leaderboard statistics
+
+ Compute per-run statistics (mean, stderr, stdev, min, max) from leaderboard files.
+
+ ```bash
+ auto-judge-evaluate leaderboard \
+     --eval-format tot -i results/*eval.txt --sort
+ ```
+
+ **Key options:**
+
+ | Option | Description |
+ |--------|-------------|
+ | `--eval-format FMT` | Input format (required) |
+ | `-i FILE` / positional | Input file(s), supports globs. Repeatable |
+ | `--eval-measure NAME` | Filter to specific measures. Repeatable |
+ | `--sort` | Sort runs by mean score (descending) |
+ | `--output FILE` | Output `.jsonl` or `.csv` |
+
+ **Output:** One row per (Judge, RunID, Measure) with Topics, Mean, Stderr, Stdev, Min, Max.
+
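+ The statistics are plain per-(run, measure) aggregations over per-topic scores; a pandas sketch of the equivalent computation (rows are made up):
+
+ ```python
+ # Sketch of the per-run statistics over per-topic scores (hypothetical rows).
+ import pandas as pd
+
+ rows = pd.DataFrame([
+     {"run": "system_A", "measure": "f1", "topic": "t1", "value": 0.50},
+     {"run": "system_A", "measure": "f1", "topic": "t2", "value": 0.70},
+     {"run": "system_B", "measure": "f1", "topic": "t1", "value": 0.40},
+     {"run": "system_B", "measure": "f1", "topic": "t2", "value": 0.60},
+ ])
+
+ stats = (rows.groupby(["run", "measure"])["value"]
+              .agg(topics="count", mean="mean", stderr="sem",
+                   stdev="std", min="min", max="max")
+              .sort_values("mean", ascending=False))  # like --sort
+ print(stats)
+ ```
+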
+ ---
+
+ ## Analysis Module
+
+ Post-hoc analysis of `meta-evaluate` output: produces correlation tables and bar plots with judge categorization.
+
+ ```bash
+ python -m autojudge_evaluate.analysis.correlation_table \
+     -d ragtime:ragtime-correlations.jsonl \
+     -d rag:rag-correlations.jsonl \
+     -d dragun:dragun-correlations.jsonl \
+     --judges judges.yml \
+     --correlation kendall \
+     --truth-measure nugget_coverage \
+     --format latex \
+     --plot-dir plots/
+ ```
+
+ **Judge configuration** (`judges.yml`) maps cryptic filenames to display names and categories, with optional plot styling:
+
+ ```yaml
+ styles:
+   colors:
+     pointwise: "#4A90D9"
+     pairwise: "#D94A4A"
+   hatches:
+     gpt-4o: ""
+     llama-3: "//"
+
+ judges:
+   my-judge-A.eval:
+     name: System A
+     method: pointwise  # category column
+     model: gpt-4o      # category column
+   my-judge-B.eval:
+     name: System B
+     method: pairwise
+     model: llama-3
+ ```
+
+ - **`styles.colors`**: maps category values to fill colors (any matplotlib color string).
+ - **`styles.hatches`**: maps category values to hatch patterns (`//`, `..`, `xx`, `\\`, etc.); patterns combine across categories.
+ - Color is picked from the first matching category value; hatches are combined from all matches (see the sketch after this list).
+ - Without a `styles:` section, bars use a sequential grayscale fallback.
+ - Judges not in the YAML are excluded unless `--all-judges` is passed.
+
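+ A sketch of how that resolution rule can be read from the YAML (hypothetical helper; the actual implementation lives in `analysis/correlation_table.py` and may differ in detail):
+
+ ```python
+ # Hypothetical sketch of the color/hatch resolution described above.
+ import yaml
+
+ with open("judges.yml") as f:
+     config = yaml.safe_load(f)
+ colors = config.get("styles", {}).get("colors", {})
+ hatches = config.get("styles", {}).get("hatches", {})
+
+ def bar_style(judge_entry: dict) -> tuple[str, str]:
+     categories = [v for k, v in judge_entry.items() if k != "name"]
+     # color: first category value that has a configured color (gray fallback)
+     color = next((colors[c] for c in categories if c in colors), "0.5")
+     # hatch: concatenation of every matching hatch pattern
+     hatch = "".join(hatches.get(c, "") for c in categories)
+     return color, hatch
+
+ print(bar_style(config["judges"]["my-judge-A.eval"]))  # ('#4A90D9', '')
+ print(bar_style(config["judges"]["my-judge-B.eval"]))  # ('#D94A4A', '//')
+ ```
+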
+ **Key options:** `--format` (github, latex, tsv, plain, html, pipe), `--columns` (correlations or measures), `--summary` (add mean/max rows), `--aggregate` (aggregate across datasets), `--same THRESHOLD` (highlight near-equal values).
+
+ ---
+
+ ### `eval-result` — Format conversion and verification
+
+ Clean and convert evaluation result files.
+
+ ```bash
+ # Convert tot to jsonl
+ auto-judge-evaluate eval-result data.txt -if tot -of jsonl -o data.jsonl
+
+ # Filter to specific runs and topics
+ auto-judge-evaluate eval-result data.txt -if tot -of jsonl -o filtered.jsonl \
+     --filter-runs system_A --filter-runs system_B \
+     --filter-topics topic_1
+ ```
+
+ **Key options:**
+
+ | Option | Description |
+ |--------|-------------|
+ | `-if FMT` | Input format: `trec_eval`, `tot`, `ir_measures`, `ranking`, `jsonl` |
+ | `-of FMT` | Output format (defaults to input format) |
+ | `-o FILE` | Output file. Omit for roundtrip test to temp file |
+ | `--filter-runs ID` | Keep only these runs. Repeatable |
+ | `--filter-topics ID` | Keep only these topics. Repeatable |
+ | `--filter-measures NAME` | Keep only these measures. Repeatable |
+ | `--compare-aggregates` | Compare file aggregates vs recomputed from per-topic data |
+ | `--drop-aggregates` | Drop existing aggregate rows |
+ | `--recompute-aggregates` | Recompute from per-topic data (implies `--drop-aggregates`) |
+ | `--roundtrip` / `--no-roundtrip` | Enable/disable roundtrip verification (default: on) |
+
+ **Supported formats:**
+
+ | Format | Columns |
+ |--------|---------|
+ | `trec_eval` | measure topic value (3 cols, run_id from filename) |
+ | `tot` | run measure topic value (4 cols) |
+ | `ir_measures` | run topic measure value (4 cols) |
+ | `ranking` | topic Q0 doc_id rank score run (6 cols) |
+ | `jsonl` | JSON lines with run_id, topic_id, measure, value |
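+
+ For reference, a sketch of how one `tot` line maps onto a `jsonl` record (values are made up; the actual readers and writers live in `eval_results/io.py` and may differ in detail):
+
+ ```python
+ # Sketch: one `tot` line (run measure topic value) as a jsonl record (hypothetical values).
+ import json
+
+ run, measure, topic, value = "system_A f1 topic_1 0.5321".split()
+ record = {"run_id": run, "topic_id": topic, "measure": measure, "value": float(value)}
+ print(json.dumps(record))
+ # {"run_id": "system_A", "topic_id": "topic_1", "measure": "f1", "value": 0.5321}
+ ```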
@@ -0,0 +1,61 @@
+ [build-system]
+ requires = ["setuptools>=61.0", "wheel"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "autojudge-evaluate"
+ version = "0.2.1"
+ description = "Evaluation tools for TREC AutoJudge: meta-evaluate, qrel-evaluate, leaderboard statistics"
+ readme = "README.md"
+ license = {text = "MIT"}
+ requires-python = ">=3.10"
+ authors = [
+     {name = "TREC AutoJudge Team"}
+ ]
+ keywords = ["trec", "autojudge", "evaluation", "ir", "correlation"]
+ classifiers = [
+     "Development Status :: 3 - Alpha",
+     "Intended Audience :: Science/Research",
+     "License :: OSI Approved :: MIT License",
+     "Programming Language :: Python :: 3",
+     "Programming Language :: Python :: 3.10",
+     "Programming Language :: Python :: 3.11",
+     "Programming Language :: Python :: 3.12",
+     "Programming Language :: Python :: 3.13",
+     "Programming Language :: Python :: 3.14",
+     "Topic :: Scientific/Engineering :: Artificial Intelligence",
+ ]
+
+ dependencies = [
+     "autojudge-base",
+     "click>=8.0",
+     "pandas>=2.0",
+     "numpy",
+     "scipy",
+     "scikit-learn",
+     "krippendorff",
+     "matplotlib",
+     "pyyaml>=6.0",
+     "tira>=0.0.192",
+ ]
+
+ [project.optional-dependencies]
+ test = [
+     "pytest>=7.0",
+     "pytest-cov>=4.0",
+     "approvaltests>=16.0.0",
+ ]
+
+ [project.scripts]
+ auto-judge-evaluate = "autojudge_evaluate:main"
+
+ [project.urls]
+ Homepage = "https://github.com/trec-autojudge/auto-judge-evaluate"
+ Repository = "https://github.com/trec-autojudge/auto-judge-evaluate"
+
+ [tool.setuptools.packages.find]
+ where = ["src"]
+
+ [tool.pytest.ini_options]
+ testpaths = ["tests"]
+ python_files = ["test_*.py"]
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+
@@ -0,0 +1,32 @@
+ """
+ autojudge_evaluate - Evaluation tools for TREC AutoJudge systems.
+
+ Provides:
+ - evaluation.py: LeaderboardEvaluator for correlation analysis
+ - eval_results/: EvalResult containers and I/O
+ - nugget_doc_eval.py: Nugget-document evaluation
+ - analysis/: Correlation tables and plots
+ - _commands/: CLI commands (meta-evaluate, qrel-evaluate, leaderboard, eval-result)
+ """
+
+ __version__ = '0.2.1'
+
+ from click import group
+
+ from ._commands._meta_evaluate import meta_evaluate
+ from ._commands._leaderboard import leaderboard
+ from ._commands._eval_result import eval_result
+ from ._commands._qrel_evaluate import qrel_evaluate
+ from .analysis.correlation_table import main as analysis
+
+
+ @group()
+ def main():
+     # Root click group, exposed as the `auto-judge-evaluate` console script.
+     pass
+
+
+ # Register the subcommands on the root group under their CLI names.
+ main.command("meta-evaluate")(meta_evaluate)
+ main.command("leaderboard")(leaderboard)
+ main.add_command(eval_result)
+ main.add_command(qrel_evaluate, "qrel-evaluate")
+ main.add_command(analysis, "analysis")
@@ -0,0 +1,3 @@
+ from autojudge_evaluate import main
+
+ main()
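
Because `__main__.py` simply calls the exported click group, `python -m autojudge_evaluate` is equivalent to the `auto-judge-evaluate` console script declared in `[project.scripts]`. The group can also be exercised in-process, e.g. in tests; a minimal sketch using click's test runner (output contents assumed, not verified against this release):

```python
# Sketch: drive the CLI group in-process with click's CliRunner.
from click.testing import CliRunner
from autojudge_evaluate import main

runner = CliRunner()
result = runner.invoke(main, ["--help"])
assert result.exit_code == 0
print(result.output)  # should list meta-evaluate, leaderboard, eval-result, qrel-evaluate, analysis
```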