agent-eval 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_eval-0.1.1/MANIFEST.in +1 -0
- agent_eval-0.1.1/PKG-INFO +74 -0
- agent_eval-0.1.1/README.md +50 -0
- agent_eval-0.1.1/pyproject.toml +51 -0
- agent_eval-0.1.1/setup.cfg +4 -0
- agent_eval-0.1.1/src/agent_eval.egg-info/PKG-INFO +74 -0
- agent_eval-0.1.1/src/agent_eval.egg-info/SOURCES.txt +21 -0
- agent_eval-0.1.1/src/agent_eval.egg-info/dependency_links.txt +1 -0
- agent_eval-0.1.1/src/agent_eval.egg-info/entry_points.txt +2 -0
- agent_eval-0.1.1/src/agent_eval.egg-info/requires.txt +17 -0
- agent_eval-0.1.1/src/agent_eval.egg-info/top_level.txt +1 -0
- agent_eval-0.1.1/src/agenteval/__init__.py +14 -0
- agent_eval-0.1.1/src/agenteval/cli.py +412 -0
- agent_eval-0.1.1/src/agenteval/config.py +87 -0
- agent_eval-0.1.1/src/agenteval/dataset_features.yml +71 -0
- agent_eval-0.1.1/src/agenteval/io.py +38 -0
- agent_eval-0.1.1/src/agenteval/log.py +58 -0
- agent_eval-0.1.1/src/agenteval/models.py +89 -0
- agent_eval-0.1.1/src/agenteval/schema_generator.py +95 -0
- agent_eval-0.1.1/src/agenteval/score.py +152 -0
- agent_eval-0.1.1/src/agenteval/summary.py +110 -0
- agent_eval-0.1.1/src/agenteval/upload.py +175 -0
- agent_eval-0.1.1/tests/test_cli.py +10 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
include src/agenteval/dataset_features.yml
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: agent-eval
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Agent evaluation toolkit
|
|
5
|
+
Project-URL: Homepage, https://github.com/allenai/agent-eval
|
|
6
|
+
Requires-Python: >=3.10
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: click
|
|
9
|
+
Requires-Dist: inspect-ai
|
|
10
|
+
Requires-Dist: litellm
|
|
11
|
+
Requires-Dist: pydantic>=2.0.0
|
|
12
|
+
Requires-Dist: huggingface_hub
|
|
13
|
+
Requires-Dist: pyarrow
|
|
14
|
+
Requires-Dist: datasets
|
|
15
|
+
Provides-Extra: dev
|
|
16
|
+
Requires-Dist: black==24.2.0; extra == "dev"
|
|
17
|
+
Requires-Dist: isort; extra == "dev"
|
|
18
|
+
Requires-Dist: autoflake; extra == "dev"
|
|
19
|
+
Requires-Dist: pytest; extra == "dev"
|
|
20
|
+
Requires-Dist: pytest-asyncio; extra == "dev"
|
|
21
|
+
Requires-Dist: mypy==1.15; extra == "dev"
|
|
22
|
+
Requires-Dist: types-PyYAML; extra == "dev"
|
|
23
|
+
Requires-Dist: types-setuptools; extra == "dev"
|
|
24
|
+
|
|
25
|
+
# agent-eval
|
|
26
|
+
|
|
27
|
+
A utility for evaluating agents on a suite of [Inspect](https://github.com/UKGovernmentBEIS/inspect_ai)-formatted evals, with the following primary benefits:
|
|
28
|
+
1. Task suite specifications as config.
|
|
29
|
+
2. Extracts the token usage of the agent from log files, and computes cost using `litellm`.
|
|
30
|
+
3. Submits task suite results to a leaderboard, with submission metadata and easy upload to a HuggingFace repo for distribution of scores and logs.
|
|
31
|
+
|
|
32
|
+
# Installation
|
|
33
|
+
|
|
34
|
+
To install from pypi, use `pip install agent-eval`.
|
|
35
|
+
|
|
36
|
+
# Usage
|
|
37
|
+
|
|
38
|
+
## Run evaluation suite
|
|
39
|
+
```shell
|
|
40
|
+
agenteval eval --config-path CONFIG_PATH --split SPLIT LOG_DIR
|
|
41
|
+
```
|
|
42
|
+
Evaluate an agent on the supplied eval suite configuration. Results are written to `agenteval.json` in the log directory.
|
|
43
|
+
|
|
44
|
+
See [sample-config.yml](sample-config.yml) for a sample configuration file.
|
|
45
|
+
|
|
46
|
+
For aggregation in a leaderboard, each task specifies a `primary_metric` as `{scorer_name}/{metric_name}`.
|
|
47
|
+
The scoring utils will look for a corresponding stderr metric,
|
|
48
|
+
by looking for another metric with the same `scorer_name` and with a `metric_name` containing the string "stderr".
|
|
49
|
+
|
|
50
|
+
## Score results
|
|
51
|
+
```shell
|
|
52
|
+
agenteval score [OPTIONS] LOG_DIR
|
|
53
|
+
```
|
|
54
|
+
Compute scores for the results in `agenteval.json` and update the file with the computed scores.
|
|
55
|
+
|
|
56
|
+
## Publish scores
|
|
57
|
+
```shell
|
|
58
|
+
agenteval publish [OPTIONS] LOG_DIR
|
|
59
|
+
```
|
|
60
|
+
Upload the scored results to HuggingFace datasets.
|
|
61
|
+
|
|
62
|
+
# Administer the HuggingFace datasets
|
|
63
|
+
Prior to publishing scores, two HuggingFace datasets should be set up, one for full submissions and one for results files.
|
|
64
|
+
|
|
65
|
+
If you want to call `load_dataset()` on the results dataset (e.g., for populating a leaderboard), you probably want to explicitly tell HuggingFace about the schema and dataset structure (otherwise, HuggingFace may fail to properly auto-convert to Parquet).
|
|
66
|
+
This is done by updating the `configs` attribute in the YAML metadata block at the top of the `README.md` file at the root of the results dataset (the metadata block is identified by lines with just `---` above and below it).
|
|
67
|
+
This attribute should contain a list of configs, each of which specifies the schema (under the `features` key) and dataset structure (under the `data_files` key).
|
|
68
|
+
See [sample-config-hf-readme-metadata.yml](sample-config-hf-readme-metadata.yml) for a sample metadata block corresponding to [sample-config.yml](sample-config.yml) (note that the metadata references the [raw schema data](src/agenteval/dataset_features.yml), which must be copied).
|
|
69
|
+
|
|
70
|
+
To facilitate initializing new configs, `agenteval publish` will automatically add this metadata if it is missing.
|
|
71
|
+
|
|
72
|
+
# Development
|
|
73
|
+
|
|
74
|
+
See [Development.md](Development.md) for development instructions.
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# agent-eval
|
|
2
|
+
|
|
3
|
+
A utility for evaluating agents on a suite of [Inspect](https://github.com/UKGovernmentBEIS/inspect_ai)-formatted evals, with the following primary benefits:
|
|
4
|
+
1. Task suite specifications as config.
|
|
5
|
+
2. Extracts the token usage of the agent from log files, and computes cost using `litellm`.
|
|
6
|
+
3. Submits task suite results to a leaderboard, with submission metadata and easy upload to a HuggingFace repo for distribution of scores and logs.
|
|
7
|
+
|
|
8
|
+
# Installation
|
|
9
|
+
|
|
10
|
+
To install from pypi, use `pip install agent-eval`.
|
|
11
|
+
|
|
12
|
+
# Usage
|
|
13
|
+
|
|
14
|
+
## Run evaluation suite
|
|
15
|
+
```shell
|
|
16
|
+
agenteval eval --config-path CONFIG_PATH --split SPLIT LOG_DIR
|
|
17
|
+
```
|
|
18
|
+
Evaluate an agent on the supplied eval suite configuration. Results are written to `agenteval.json` in the log directory.
|
|
19
|
+
|
|
20
|
+
See [sample-config.yml](sample-config.yml) for a sample configuration file.
|
|
21
|
+
|
|
22
|
+
For aggregation in a leaderboard, each task specifies a `primary_metric` as `{scorer_name}/{metric_name}`.
|
|
23
|
+
The scoring utils will look for a corresponding stderr metric,
|
|
24
|
+
by looking for another metric with the same `scorer_name` and with a `metric_name` containing the string "stderr".
|
|
25
|
+
|
|
26
|
+
## Score results
|
|
27
|
+
```shell
|
|
28
|
+
agenteval score [OPTIONS] LOG_DIR
|
|
29
|
+
```
|
|
30
|
+
Compute scores for the results in `agenteval.json` and update the file with the computed scores.
|
|
31
|
+
|
|
32
|
+
## Publish scores
|
|
33
|
+
```shell
|
|
34
|
+
agenteval publish [OPTIONS] LOG_DIR
|
|
35
|
+
```
|
|
36
|
+
Upload the scored results to HuggingFace datasets.
|
|
37
|
+
|
|
38
|
+
# Administer the HuggingFace datasets
|
|
39
|
+
Prior to publishing scores, two HuggingFace datasets should be set up, one for full submissions and one for results files.
|
|
40
|
+
|
|
41
|
+
If you want to call `load_dataset()` on the results dataset (e.g., for populating a leaderboard), you probably want to explicitly tell HuggingFace about the schema and dataset structure (otherwise, HuggingFace may fail to properly auto-convert to Parquet).
|
|
42
|
+
This is done by updating the `configs` attribute in the YAML metadata block at the top of the `README.md` file at the root of the results dataset (the metadata block is identified by lines with just `---` above and below it).
|
|
43
|
+
This attribute should contain a list of configs, each of which specifies the schema (under the `features` key) and dataset structure (under the `data_files` key).
|
|
44
|
+
See [sample-config-hf-readme-metadata.yml](sample-config-hf-readme-metadata.yml) for a sample metadata block corresponding to [sample-config.yml](sample-config.yml) (note that the metadata references the [raw schema data](src/agenteval/dataset_features.yml), which must be copied).
|
|
45
|
+
|
|
46
|
+
To facilitate initializing new configs, `agenteval publish` will automatically add this metadata if it is missing.
|
|
47
|
+
|
|
48
|
+
# Development
|
|
49
|
+
|
|
50
|
+
See [Development.md](Development.md) for development instructions.
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "agent-eval"
|
|
7
|
+
version = "0.1.1"
|
|
8
|
+
description = "Agent evaluation toolkit"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
dependencies = [
|
|
12
|
+
"click",
|
|
13
|
+
"inspect-ai",
|
|
14
|
+
"litellm",
|
|
15
|
+
"pydantic>=2.0.0",
|
|
16
|
+
# For leaderboard
|
|
17
|
+
"huggingface_hub",
|
|
18
|
+
"pyarrow",
|
|
19
|
+
"datasets",
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
[project.urls]
|
|
23
|
+
Homepage = "https://github.com/allenai/agent-eval"
|
|
24
|
+
|
|
25
|
+
[project.optional-dependencies]
|
|
26
|
+
dev = [
|
|
27
|
+
"black==24.2.0",
|
|
28
|
+
"isort",
|
|
29
|
+
"autoflake",
|
|
30
|
+
"pytest",
|
|
31
|
+
"pytest-asyncio",
|
|
32
|
+
"mypy==1.15",
|
|
33
|
+
"types-PyYAML",
|
|
34
|
+
"types-setuptools"
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
[project.scripts]
|
|
38
|
+
agenteval = "agenteval.cli:cli"
|
|
39
|
+
|
|
40
|
+
[tool.setuptools.packages.find]
|
|
41
|
+
where = ["src"]
|
|
42
|
+
|
|
43
|
+
[tool.pytest.ini_options]
|
|
44
|
+
testpaths = ["tests"]
|
|
45
|
+
python_files = ["test_*.py"]
|
|
46
|
+
|
|
47
|
+
[tool.setuptools]
|
|
48
|
+
include-package-data = true
|
|
49
|
+
|
|
50
|
+
[tool.setuptools.package-data]
|
|
51
|
+
"agenteval" = ["dataset_features.yml"]
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: agent-eval
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Agent evaluation toolkit
|
|
5
|
+
Project-URL: Homepage, https://github.com/allenai/agent-eval
|
|
6
|
+
Requires-Python: >=3.10
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: click
|
|
9
|
+
Requires-Dist: inspect-ai
|
|
10
|
+
Requires-Dist: litellm
|
|
11
|
+
Requires-Dist: pydantic>=2.0.0
|
|
12
|
+
Requires-Dist: huggingface_hub
|
|
13
|
+
Requires-Dist: pyarrow
|
|
14
|
+
Requires-Dist: datasets
|
|
15
|
+
Provides-Extra: dev
|
|
16
|
+
Requires-Dist: black==24.2.0; extra == "dev"
|
|
17
|
+
Requires-Dist: isort; extra == "dev"
|
|
18
|
+
Requires-Dist: autoflake; extra == "dev"
|
|
19
|
+
Requires-Dist: pytest; extra == "dev"
|
|
20
|
+
Requires-Dist: pytest-asyncio; extra == "dev"
|
|
21
|
+
Requires-Dist: mypy==1.15; extra == "dev"
|
|
22
|
+
Requires-Dist: types-PyYAML; extra == "dev"
|
|
23
|
+
Requires-Dist: types-setuptools; extra == "dev"
|
|
24
|
+
|
|
25
|
+
# agent-eval
|
|
26
|
+
|
|
27
|
+
A utility for evaluating agents on a suite of [Inspect](https://github.com/UKGovernmentBEIS/inspect_ai)-formatted evals, with the following primary benefits:
|
|
28
|
+
1. Task suite specifications as config.
|
|
29
|
+
2. Extracts the token usage of the agent from log files, and computes cost using `litellm`.
|
|
30
|
+
3. Submits task suite results to a leaderboard, with submission metadata and easy upload to a HuggingFace repo for distribution of scores and logs.
|
|
31
|
+
|
|
32
|
+
# Installation
|
|
33
|
+
|
|
34
|
+
To install from pypi, use `pip install agent-eval`.
|
|
35
|
+
|
|
36
|
+
# Usage
|
|
37
|
+
|
|
38
|
+
## Run evaluation suite
|
|
39
|
+
```shell
|
|
40
|
+
agenteval eval --config-path CONFIG_PATH --split SPLIT LOG_DIR
|
|
41
|
+
```
|
|
42
|
+
Evaluate an agent on the supplied eval suite configuration. Results are written to `agenteval.json` in the log directory.
|
|
43
|
+
|
|
44
|
+
See [sample-config.yml](sample-config.yml) for a sample configuration file.
|
|
45
|
+
|
|
46
|
+
For aggregation in a leaderboard, each task specifies a `primary_metric` as `{scorer_name}/{metric_name}`.
|
|
47
|
+
The scoring utils will look for a corresponding stderr metric,
|
|
48
|
+
by looking for another metric with the same `scorer_name` and with a `metric_name` containing the string "stderr".
|
|
49
|
+
|
|
50
|
+
## Score results
|
|
51
|
+
```shell
|
|
52
|
+
agenteval score [OPTIONS] LOG_DIR
|
|
53
|
+
```
|
|
54
|
+
Compute scores for the results in `agenteval.json` and update the file with the computed scores.
|
|
55
|
+
|
|
56
|
+
## Publish scores
|
|
57
|
+
```shell
|
|
58
|
+
agenteval publish [OPTIONS] LOG_DIR
|
|
59
|
+
```
|
|
60
|
+
Upload the scored results to HuggingFace datasets.
|
|
61
|
+
|
|
62
|
+
# Administer the HuggingFace datasets
|
|
63
|
+
Prior to publishing scores, two HuggingFace datasets should be set up, one for full submissions and one for results files.
|
|
64
|
+
|
|
65
|
+
If you want to call `load_dataset()` on the results dataset (e.g., for populating a leaderboard), you probably want to explicitly tell HuggingFace about the schema and dataset structure (otherwise, HuggingFace may fail to properly auto-convert to Parquet).
|
|
66
|
+
This is done by updating the `configs` attribute in the YAML metadata block at the top of the `README.md` file at the root of the results dataset (the metadata block is identified by lines with just `---` above and below it).
|
|
67
|
+
This attribute should contain a list of configs, each of which specifies the schema (under the `features` key) and dataset structure (under the `data_files` key).
|
|
68
|
+
See [sample-config-hf-readme-metadata.yml](sample-config-hf-readme-metadata.yml) for a sample metadata block corresponding to [sample-config.yml](sample-config.yml) (note that the metadata references the [raw schema data](src/agenteval/dataset_features.yml), which must be copied).
|
|
69
|
+
|
|
70
|
+
To facilitate initializing new configs, `agenteval publish` will automatically add this metadata if it is missing.
|
|
71
|
+
|
|
72
|
+
# Development
|
|
73
|
+
|
|
74
|
+
See [Development.md](Development.md) for development instructions.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MANIFEST.in
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
src/agent_eval.egg-info/PKG-INFO
|
|
5
|
+
src/agent_eval.egg-info/SOURCES.txt
|
|
6
|
+
src/agent_eval.egg-info/dependency_links.txt
|
|
7
|
+
src/agent_eval.egg-info/entry_points.txt
|
|
8
|
+
src/agent_eval.egg-info/requires.txt
|
|
9
|
+
src/agent_eval.egg-info/top_level.txt
|
|
10
|
+
src/agenteval/__init__.py
|
|
11
|
+
src/agenteval/cli.py
|
|
12
|
+
src/agenteval/config.py
|
|
13
|
+
src/agenteval/dataset_features.yml
|
|
14
|
+
src/agenteval/io.py
|
|
15
|
+
src/agenteval/log.py
|
|
16
|
+
src/agenteval/models.py
|
|
17
|
+
src/agenteval/schema_generator.py
|
|
18
|
+
src/agenteval/score.py
|
|
19
|
+
src/agenteval/summary.py
|
|
20
|
+
src/agenteval/upload.py
|
|
21
|
+
tests/test_cli.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
agenteval
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from importlib.metadata import version as get_version
|
|
2
|
+
|
|
3
|
+
__version__ = get_version("agent-eval")
|
|
4
|
+
|
|
5
|
+
from .score import process_eval_logs
|
|
6
|
+
from .summary import compute_summary_statistics
|
|
7
|
+
from .upload import upload_folder_to_hf, upload_summary_to_hf
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"process_eval_logs",
|
|
11
|
+
"compute_summary_statistics",
|
|
12
|
+
"upload_folder_to_hf",
|
|
13
|
+
"upload_summary_to_hf",
|
|
14
|
+
]
|