mcpbr 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcpbr-0.1.0/.github/ISSUE_TEMPLATE/bug_report.yml +84 -0
- mcpbr-0.1.0/.github/ISSUE_TEMPLATE/config.yml +8 -0
- mcpbr-0.1.0/.github/ISSUE_TEMPLATE/feature_request.yml +68 -0
- mcpbr-0.1.0/.github/PULL_REQUEST_TEMPLATE.md +42 -0
- mcpbr-0.1.0/.github/release-drafter.yml +50 -0
- mcpbr-0.1.0/.github/workflows/ci.yml +74 -0
- mcpbr-0.1.0/.github/workflows/publish.yml +51 -0
- mcpbr-0.1.0/.github/workflows/release-drafter.yml +22 -0
- mcpbr-0.1.0/.gitignore +87 -0
- mcpbr-0.1.0/CHANGELOG.md +40 -0
- mcpbr-0.1.0/CONTRIBUTING.md +135 -0
- mcpbr-0.1.0/Dockerfile +21 -0
- mcpbr-0.1.0/LICENSE +21 -0
- mcpbr-0.1.0/PKG-INFO +714 -0
- mcpbr-0.1.0/README.md +679 -0
- mcpbr-0.1.0/SECURITY.md +56 -0
- mcpbr-0.1.0/assets/mcpbr-demo.gif +0 -0
- mcpbr-0.1.0/assets/mcpbr-eval-results.png +0 -0
- mcpbr-0.1.0/assets/mcpbr-logo.jpg +0 -0
- mcpbr-0.1.0/config/example.yaml +58 -0
- mcpbr-0.1.0/config/supermodel.yaml +29 -0
- mcpbr-0.1.0/pyproject.toml +69 -0
- mcpbr-0.1.0/requirements.txt +18 -0
- mcpbr-0.1.0/results_comparison_5tasks_v2_summary.md +35 -0
- mcpbr-0.1.0/src/mcpbr/__init__.py +6 -0
- mcpbr-0.1.0/src/mcpbr/agent.py +187 -0
- mcpbr-0.1.0/src/mcpbr/cli.py +511 -0
- mcpbr-0.1.0/src/mcpbr/config.py +187 -0
- mcpbr-0.1.0/src/mcpbr/docker_env.py +561 -0
- mcpbr-0.1.0/src/mcpbr/evaluation.py +361 -0
- mcpbr-0.1.0/src/mcpbr/harness.py +468 -0
- mcpbr-0.1.0/src/mcpbr/harnesses.py +840 -0
- mcpbr-0.1.0/src/mcpbr/log_formatter.py +459 -0
- mcpbr-0.1.0/src/mcpbr/models.py +143 -0
- mcpbr-0.1.0/src/mcpbr/providers.py +236 -0
- mcpbr-0.1.0/src/mcpbr/reporting.py +213 -0
- mcpbr-0.1.0/tests/__init__.py +1 -0
- mcpbr-0.1.0/tests/test_agent.py +62 -0
- mcpbr-0.1.0/tests/test_config.py +142 -0
- mcpbr-0.1.0/tests/test_evaluation.py +32 -0
- mcpbr-0.1.0/tests/test_integration.py +252 -0
- mcpbr-0.1.0/tests/test_models.py +239 -0
- mcpbr-0.1.0/uv.lock +2180 -0
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
name: Bug Report
|
|
2
|
+
description: Report a bug or unexpected behavior
|
|
3
|
+
title: "[Bug]: "
|
|
4
|
+
labels: ["bug", "triage"]
|
|
5
|
+
body:
|
|
6
|
+
- type: markdown
|
|
7
|
+
attributes:
|
|
8
|
+
value: |
|
|
9
|
+
Thanks for taking the time to report a bug! Please fill out the form below.
|
|
10
|
+
|
|
11
|
+
- type: textarea
|
|
12
|
+
id: description
|
|
13
|
+
attributes:
|
|
14
|
+
label: Bug Description
|
|
15
|
+
description: A clear and concise description of what the bug is.
|
|
16
|
+
placeholder: Describe the bug...
|
|
17
|
+
validations:
|
|
18
|
+
required: true
|
|
19
|
+
|
|
20
|
+
- type: textarea
|
|
21
|
+
id: reproduction
|
|
22
|
+
attributes:
|
|
23
|
+
label: Steps to Reproduce
|
|
24
|
+
description: Steps to reproduce the behavior.
|
|
25
|
+
placeholder: |
|
|
26
|
+
1. Run `mcpbr ...`
|
|
27
|
+
2. See error...
|
|
28
|
+
validations:
|
|
29
|
+
required: true
|
|
30
|
+
|
|
31
|
+
- type: textarea
|
|
32
|
+
id: expected
|
|
33
|
+
attributes:
|
|
34
|
+
label: Expected Behavior
|
|
35
|
+
description: What you expected to happen.
|
|
36
|
+
placeholder: What should have happened?
|
|
37
|
+
validations:
|
|
38
|
+
required: true
|
|
39
|
+
|
|
40
|
+
- type: textarea
|
|
41
|
+
id: actual
|
|
42
|
+
attributes:
|
|
43
|
+
label: Actual Behavior
|
|
44
|
+
description: What actually happened.
|
|
45
|
+
placeholder: What actually happened?
|
|
46
|
+
validations:
|
|
47
|
+
required: true
|
|
48
|
+
|
|
49
|
+
- type: textarea
|
|
50
|
+
id: environment
|
|
51
|
+
attributes:
|
|
52
|
+
label: Environment
|
|
53
|
+
description: Please provide your environment details.
|
|
54
|
+
value: |
|
|
55
|
+
- OS: [e.g., macOS 14.0, Ubuntu 22.04]
|
|
56
|
+
- Python version: [e.g., 3.11.5]
|
|
57
|
+
- mcpbr version: [e.g., 0.1.0]
|
|
58
|
+
- Docker version: [e.g., 24.0.0]
|
|
59
|
+
validations:
|
|
60
|
+
required: true
|
|
61
|
+
|
|
62
|
+
- type: textarea
|
|
63
|
+
id: logs
|
|
64
|
+
attributes:
|
|
65
|
+
label: Relevant Logs
|
|
66
|
+
description: Please copy and paste any relevant log output.
|
|
67
|
+
render: shell
|
|
68
|
+
|
|
69
|
+
- type: textarea
|
|
70
|
+
id: config
|
|
71
|
+
attributes:
|
|
72
|
+
label: Configuration
|
|
73
|
+
description: If relevant, share your config.yaml (redact API keys!).
|
|
74
|
+
render: yaml
|
|
75
|
+
|
|
76
|
+
- type: checkboxes
|
|
77
|
+
id: checklist
|
|
78
|
+
attributes:
|
|
79
|
+
label: Checklist
|
|
80
|
+
options:
|
|
81
|
+
- label: I have searched existing issues to ensure this is not a duplicate
|
|
82
|
+
required: true
|
|
83
|
+
- label: I have redacted any sensitive information (API keys, etc.)
|
|
84
|
+
required: true
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
blank_issues_enabled: false
|
|
2
|
+
contact_links:
|
|
3
|
+
- name: Documentation
|
|
4
|
+
url: https://github.com/greynewell/mcpbr#readme
|
|
5
|
+
about: Check out the documentation before opening an issue
|
|
6
|
+
- name: Discussions
|
|
7
|
+
url: https://github.com/greynewell/mcpbr/discussions
|
|
8
|
+
about: Ask questions and discuss ideas in GitHub Discussions
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
name: Feature Request
|
|
2
|
+
description: Suggest a new feature or enhancement
|
|
3
|
+
title: "[Feature]: "
|
|
4
|
+
labels: ["enhancement", "triage"]
|
|
5
|
+
body:
|
|
6
|
+
- type: markdown
|
|
7
|
+
attributes:
|
|
8
|
+
value: |
|
|
9
|
+
Thanks for suggesting a feature! Please fill out the form below.
|
|
10
|
+
|
|
11
|
+
- type: textarea
|
|
12
|
+
id: problem
|
|
13
|
+
attributes:
|
|
14
|
+
label: Problem Statement
|
|
15
|
+
description: What problem would this feature solve? Is this related to a frustration?
|
|
16
|
+
placeholder: I'm always frustrated when...
|
|
17
|
+
validations:
|
|
18
|
+
required: true
|
|
19
|
+
|
|
20
|
+
- type: textarea
|
|
21
|
+
id: solution
|
|
22
|
+
attributes:
|
|
23
|
+
label: Proposed Solution
|
|
24
|
+
description: Describe the solution you'd like.
|
|
25
|
+
placeholder: I would like to be able to...
|
|
26
|
+
validations:
|
|
27
|
+
required: true
|
|
28
|
+
|
|
29
|
+
- type: textarea
|
|
30
|
+
id: alternatives
|
|
31
|
+
attributes:
|
|
32
|
+
label: Alternatives Considered
|
|
33
|
+
description: Describe any alternative solutions or features you've considered.
|
|
34
|
+
placeholder: I've also considered...
|
|
35
|
+
|
|
36
|
+
- type: dropdown
|
|
37
|
+
id: area
|
|
38
|
+
attributes:
|
|
39
|
+
label: Feature Area
|
|
40
|
+
description: What area of the project does this feature relate to?
|
|
41
|
+
options:
|
|
42
|
+
- CLI
|
|
43
|
+
- Configuration
|
|
44
|
+
- Providers (LLM APIs)
|
|
45
|
+
- Agent Harnesses
|
|
46
|
+
- Docker/Environment
|
|
47
|
+
- Evaluation/Testing
|
|
48
|
+
- Reporting
|
|
49
|
+
- Documentation
|
|
50
|
+
- Other
|
|
51
|
+
validations:
|
|
52
|
+
required: true
|
|
53
|
+
|
|
54
|
+
- type: textarea
|
|
55
|
+
id: additional
|
|
56
|
+
attributes:
|
|
57
|
+
label: Additional Context
|
|
58
|
+
description: Add any other context, mockups, or screenshots about the feature request.
|
|
59
|
+
|
|
60
|
+
- type: checkboxes
|
|
61
|
+
id: checklist
|
|
62
|
+
attributes:
|
|
63
|
+
label: Checklist
|
|
64
|
+
options:
|
|
65
|
+
- label: I have searched existing issues to ensure this is not a duplicate
|
|
66
|
+
required: true
|
|
67
|
+
- label: I am willing to help implement this feature
|
|
68
|
+
required: false
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
## Description
|
|
2
|
+
|
|
3
|
+
<!-- Describe your changes in detail -->
|
|
4
|
+
|
|
5
|
+
## Related Issue
|
|
6
|
+
|
|
7
|
+
<!-- If this PR addresses an issue, link it here -->
|
|
8
|
+
Fixes #
|
|
9
|
+
|
|
10
|
+
## Type of Change
|
|
11
|
+
|
|
12
|
+
<!-- Mark the relevant option with an "x" -->
|
|
13
|
+
|
|
14
|
+
- [ ] Bug fix (non-breaking change that fixes an issue)
|
|
15
|
+
- [ ] New feature (non-breaking change that adds functionality)
|
|
16
|
+
- [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected)
|
|
17
|
+
- [ ] Documentation update
|
|
18
|
+
- [ ] Refactoring (no functional changes)
|
|
19
|
+
- [ ] CI/CD or tooling changes
|
|
20
|
+
|
|
21
|
+
## Checklist
|
|
22
|
+
|
|
23
|
+
<!-- Mark completed items with an "x" -->
|
|
24
|
+
|
|
25
|
+
- [ ] I have read the [CONTRIBUTING](https://github.com/greynewell/mcpbr/blob/main/CONTRIBUTING.md) guidelines
|
|
26
|
+
- [ ] My code follows the project's code style
|
|
27
|
+
- [ ] I have added tests that prove my fix/feature works
|
|
28
|
+
- [ ] New and existing unit tests pass locally
|
|
29
|
+
- [ ] I have updated the documentation (if applicable)
|
|
30
|
+
- [ ] I have added an entry to CHANGELOG.md (if applicable)
|
|
31
|
+
|
|
32
|
+
## Testing
|
|
33
|
+
|
|
34
|
+
<!-- Describe how you tested your changes -->
|
|
35
|
+
|
|
36
|
+
## Screenshots (if applicable)
|
|
37
|
+
|
|
38
|
+
<!-- Add screenshots to help explain your changes -->
|
|
39
|
+
|
|
40
|
+
## Additional Notes
|
|
41
|
+
|
|
42
|
+
<!-- Any additional information that reviewers should know -->
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
name-template: 'v$RESOLVED_VERSION'
|
|
2
|
+
tag-template: 'v$RESOLVED_VERSION'
|
|
3
|
+
categories:
|
|
4
|
+
- title: 'New Features'
|
|
5
|
+
labels:
|
|
6
|
+
- 'feature'
|
|
7
|
+
- 'enhancement'
|
|
8
|
+
- title: 'Bug Fixes'
|
|
9
|
+
labels:
|
|
10
|
+
- 'fix'
|
|
11
|
+
- 'bugfix'
|
|
12
|
+
- 'bug'
|
|
13
|
+
- title: 'Documentation'
|
|
14
|
+
labels:
|
|
15
|
+
- 'documentation'
|
|
16
|
+
- 'docs'
|
|
17
|
+
- title: 'Maintenance'
|
|
18
|
+
labels:
|
|
19
|
+
- 'chore'
|
|
20
|
+
- 'maintenance'
|
|
21
|
+
- 'dependencies'
|
|
22
|
+
- title: 'Other Changes'
|
|
23
|
+
labels:
|
|
24
|
+
- '*'
|
|
25
|
+
change-template: '- $TITLE @$AUTHOR (#$NUMBER)'
|
|
26
|
+
change-title-escapes: '\<*_&'
|
|
27
|
+
version-resolver:
|
|
28
|
+
major:
|
|
29
|
+
labels:
|
|
30
|
+
- 'major'
|
|
31
|
+
- 'breaking'
|
|
32
|
+
minor:
|
|
33
|
+
labels:
|
|
34
|
+
- 'minor'
|
|
35
|
+
- 'feature'
|
|
36
|
+
- 'enhancement'
|
|
37
|
+
patch:
|
|
38
|
+
labels:
|
|
39
|
+
- 'patch'
|
|
40
|
+
- 'fix'
|
|
41
|
+
- 'bugfix'
|
|
42
|
+
default: patch
|
|
43
|
+
template: |
|
|
44
|
+
## Changes
|
|
45
|
+
|
|
46
|
+
$CHANGES
|
|
47
|
+
|
|
48
|
+
## Contributors
|
|
49
|
+
|
|
50
|
+
$CONTRIBUTORS
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
lint:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
steps:
|
|
13
|
+
- uses: actions/checkout@v4
|
|
14
|
+
|
|
15
|
+
- name: Set up Python
|
|
16
|
+
uses: actions/setup-python@v5
|
|
17
|
+
with:
|
|
18
|
+
python-version: "3.11"
|
|
19
|
+
|
|
20
|
+
- name: Install dependencies
|
|
21
|
+
run: |
|
|
22
|
+
python -m pip install --upgrade pip
|
|
23
|
+
pip install ruff
|
|
24
|
+
|
|
25
|
+
- name: Run ruff check
|
|
26
|
+
run: ruff check src/ tests/
|
|
27
|
+
|
|
28
|
+
- name: Run ruff format check
|
|
29
|
+
run: ruff format --check src/ tests/
|
|
30
|
+
|
|
31
|
+
test:
|
|
32
|
+
runs-on: ubuntu-latest
|
|
33
|
+
strategy:
|
|
34
|
+
matrix:
|
|
35
|
+
python-version: ["3.11", "3.12"]
|
|
36
|
+
|
|
37
|
+
steps:
|
|
38
|
+
- uses: actions/checkout@v4
|
|
39
|
+
|
|
40
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
41
|
+
uses: actions/setup-python@v5
|
|
42
|
+
with:
|
|
43
|
+
python-version: ${{ matrix.python-version }}
|
|
44
|
+
|
|
45
|
+
- name: Install dependencies
|
|
46
|
+
run: |
|
|
47
|
+
python -m pip install --upgrade pip
|
|
48
|
+
pip install -e ".[dev]"
|
|
49
|
+
|
|
50
|
+
- name: Run unit tests
|
|
51
|
+
run: pytest -m "not integration" -v
|
|
52
|
+
|
|
53
|
+
build:
|
|
54
|
+
runs-on: ubuntu-latest
|
|
55
|
+
steps:
|
|
56
|
+
- uses: actions/checkout@v4
|
|
57
|
+
|
|
58
|
+
- name: Set up Python
|
|
59
|
+
uses: actions/setup-python@v5
|
|
60
|
+
with:
|
|
61
|
+
python-version: "3.11"
|
|
62
|
+
|
|
63
|
+
- name: Install build dependencies
|
|
64
|
+
run: |
|
|
65
|
+
python -m pip install --upgrade pip
|
|
66
|
+
pip install build
|
|
67
|
+
|
|
68
|
+
- name: Build package
|
|
69
|
+
run: python -m build
|
|
70
|
+
|
|
71
|
+
- name: Check distribution
|
|
72
|
+
run: |
|
|
73
|
+
pip install twine
|
|
74
|
+
twine check dist/*
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
|
|
7
|
+
permissions:
|
|
8
|
+
contents: read
|
|
9
|
+
id-token: write
|
|
10
|
+
|
|
11
|
+
jobs:
|
|
12
|
+
build:
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
steps:
|
|
15
|
+
- uses: actions/checkout@v4
|
|
16
|
+
|
|
17
|
+
- name: Set up Python
|
|
18
|
+
uses: actions/setup-python@v5
|
|
19
|
+
with:
|
|
20
|
+
python-version: "3.11"
|
|
21
|
+
|
|
22
|
+
- name: Install build dependencies
|
|
23
|
+
run: |
|
|
24
|
+
python -m pip install --upgrade pip
|
|
25
|
+
pip install build
|
|
26
|
+
|
|
27
|
+
- name: Build package
|
|
28
|
+
run: python -m build
|
|
29
|
+
|
|
30
|
+
- name: Store distribution packages
|
|
31
|
+
uses: actions/upload-artifact@v4
|
|
32
|
+
with:
|
|
33
|
+
name: python-package-distributions
|
|
34
|
+
path: dist/
|
|
35
|
+
|
|
36
|
+
publish-pypi:
|
|
37
|
+
needs: build
|
|
38
|
+
runs-on: ubuntu-latest
|
|
39
|
+
environment:
|
|
40
|
+
name: pypi
|
|
41
|
+
url: https://pypi.org/p/mcpbr
|
|
42
|
+
|
|
43
|
+
steps:
|
|
44
|
+
- name: Download distribution packages
|
|
45
|
+
uses: actions/download-artifact@v4
|
|
46
|
+
with:
|
|
47
|
+
name: python-package-distributions
|
|
48
|
+
path: dist/
|
|
49
|
+
|
|
50
|
+
- name: Publish to PyPI
|
|
51
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
name: Release Drafter
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
types: [opened, reopened, synchronize]
|
|
8
|
+
|
|
9
|
+
permissions:
|
|
10
|
+
contents: read
|
|
11
|
+
pull-requests: write
|
|
12
|
+
|
|
13
|
+
jobs:
|
|
14
|
+
update_release_draft:
|
|
15
|
+
permissions:
|
|
16
|
+
contents: write
|
|
17
|
+
pull-requests: write
|
|
18
|
+
runs-on: ubuntu-latest
|
|
19
|
+
steps:
|
|
20
|
+
- uses: release-drafter/release-drafter@v6
|
|
21
|
+
env:
|
|
22
|
+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
mcpbr-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
*.egg-info/
|
|
24
|
+
.installed.cfg
|
|
25
|
+
*.egg
|
|
26
|
+
|
|
27
|
+
# PyInstaller
|
|
28
|
+
*.manifest
|
|
29
|
+
*.spec
|
|
30
|
+
|
|
31
|
+
# Installer logs
|
|
32
|
+
pip-log.txt
|
|
33
|
+
pip-delete-this-directory.txt
|
|
34
|
+
|
|
35
|
+
# Unit test / coverage reports
|
|
36
|
+
htmlcov/
|
|
37
|
+
.tox/
|
|
38
|
+
.nox/
|
|
39
|
+
.coverage
|
|
40
|
+
.coverage.*
|
|
41
|
+
.cache
|
|
42
|
+
nosetests.xml
|
|
43
|
+
coverage.xml
|
|
44
|
+
*.cover
|
|
45
|
+
*.py,cover
|
|
46
|
+
.hypothesis/
|
|
47
|
+
.pytest_cache/
|
|
48
|
+
|
|
49
|
+
# Environments
|
|
50
|
+
.env
|
|
51
|
+
.env.*
|
|
52
|
+
.venv/
|
|
53
|
+
env/
|
|
54
|
+
venv/
|
|
55
|
+
ENV/
|
|
56
|
+
env.bak/
|
|
57
|
+
venv.bak/
|
|
58
|
+
|
|
59
|
+
# OS
|
|
60
|
+
.DS_Store
|
|
61
|
+
|
|
62
|
+
# IDE
|
|
63
|
+
.idea/
|
|
64
|
+
.vscode/
|
|
65
|
+
.cursor/
|
|
66
|
+
*.swp
|
|
67
|
+
*.swo
|
|
68
|
+
*~
|
|
69
|
+
|
|
70
|
+
# Project specific
|
|
71
|
+
results/
|
|
72
|
+
*.json
|
|
73
|
+
!config/*.json
|
|
74
|
+
config.yaml
|
|
75
|
+
reports/
|
|
76
|
+
.swebench-mcp/
|
|
77
|
+
.logs/
|
|
78
|
+
logs/
|
|
79
|
+
|
|
80
|
+
# Local secrets for act (GitHub Actions local runner)
|
|
81
|
+
.secrets
|
|
82
|
+
|
|
83
|
+
# Docker
|
|
84
|
+
docker-build/
|
|
85
|
+
|
|
86
|
+
# Datasets cache
|
|
87
|
+
.cache/
|
mcpbr-0.1.0/CHANGELOG.md
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [0.1.0] - 2026-01-17
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
|
|
12
|
+
- Initial release of mcpbr (Model Context Protocol Benchmark Runner)
|
|
13
|
+
- Support for multiple LLM providers:
|
|
14
|
+
- OpenRouter (multi-provider gateway)
|
|
15
|
+
- OpenAI (direct API)
|
|
16
|
+
- Anthropic (direct API)
|
|
17
|
+
- Google AI / Gemini (direct API)
|
|
18
|
+
- Agent harness implementations:
|
|
19
|
+
- Claude Code CLI with MCP server support
|
|
20
|
+
- OpenAI Codex CLI
|
|
21
|
+
- OpenCode CLI
|
|
22
|
+
- Gemini CLI
|
|
23
|
+
- Docker-based task isolation for SWE-bench evaluation
|
|
24
|
+
- Baseline agent for comparison (no tools)
|
|
25
|
+
- JSON and Markdown report generation
|
|
26
|
+
- Configurable evaluation parameters via YAML
|
|
27
|
+
- CLI commands: `run`, `init`, `models`, `harnesses`, `providers`
|
|
28
|
+
- Real-time streaming output when using `--verbose` flag with Claude Code harness
|
|
29
|
+
- Tool usage tracking: counts tool calls by name and includes breakdown in results JSON
|
|
30
|
+
- Each streamed output line is prefixed with task instance ID for parallel worker disambiguation
|
|
31
|
+
- **In-container agent execution**: The Claude Code CLI runs inside the Docker container where all dependencies are installed. This ensures Python imports work correctly (e.g., `from astropy import ...`) and lets the agent verify fixes by running tests.
|
|
32
|
+
- Pre-built SWE-bench Docker images from Epoch AI's registry (`ghcr.io/epoch-research/swe-bench.eval`) are used when available, providing:
|
|
33
|
+
- Repository at the correct commit with all dependencies pre-installed
|
|
34
|
+
- Consistent, reproducible evaluation environments
|
|
35
|
+
- x86_64 images with automatic emulation on ARM64 (Apple Silicon) Macs
|
|
36
|
+
- Timestamped log filenames to prevent overwrites: `{instance_id}_{run_type}_{YYYYMMDD_HHMMSS}.json`
|
|
37
|
+
- `--no-prebuilt` CLI flag to disable pre-built images and build from scratch
|
|
38
|
+
- Network access for containers to enable API calls from within Docker
|
|
39
|
+
|
|
40
|
+
[0.1.0]: https://github.com/greynewell/mcpbr/releases/tag/v0.1.0
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
# Contributing to mcpbr
|
|
2
|
+
|
|
3
|
+
Thank you for your interest in contributing to mcpbr! This document provides guidelines and information for contributors.
|
|
4
|
+
|
|
5
|
+
## How to Contribute
|
|
6
|
+
|
|
7
|
+
### Reporting Bugs
|
|
8
|
+
|
|
9
|
+
Before creating a bug report, please check the existing issues to avoid duplicates. When creating a bug report, include:
|
|
10
|
+
|
|
11
|
+
- A clear, descriptive title
|
|
12
|
+
- Steps to reproduce the behavior
|
|
13
|
+
- Expected behavior
|
|
14
|
+
- Actual behavior
|
|
15
|
+
- Your environment (OS, Python version, Docker version)
|
|
16
|
+
- Any relevant logs or error messages
|
|
17
|
+
|
|
18
|
+
### Suggesting Features
|
|
19
|
+
|
|
20
|
+
Feature requests are welcome! Please:
|
|
21
|
+
|
|
22
|
+
- Check existing issues and discussions first
|
|
23
|
+
- Clearly describe the feature and its use case
|
|
24
|
+
- Explain why this feature would be useful to most users
|
|
25
|
+
|
|
26
|
+
### Pull Requests
|
|
27
|
+
|
|
28
|
+
1. Fork the repository
|
|
29
|
+
2. Create a feature branch from `main`
|
|
30
|
+
3. Make your changes
|
|
31
|
+
4. Add or update tests as needed
|
|
32
|
+
5. Ensure all tests pass
|
|
33
|
+
6. Update documentation if needed
|
|
34
|
+
7. Submit a pull request
|
|
35
|
+
|
|
36
|
+
## Development Setup
|
|
37
|
+
|
|
38
|
+
### Prerequisites
|
|
39
|
+
|
|
40
|
+
- Python 3.11+
|
|
41
|
+
- Docker
|
|
42
|
+
- An API key for at least one supported provider (OpenRouter, OpenAI, Anthropic, or Google)
|
|
43
|
+
|
|
44
|
+
### Installation
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
# Clone your fork
|
|
48
|
+
git clone https://github.com/<your-username>/mcpbr.git
|
|
49
|
+
cd mcpbr
|
|
50
|
+
|
|
51
|
+
# Create a virtual environment
|
|
52
|
+
python -m venv venv
|
|
53
|
+
source venv/bin/activate # or `venv\Scripts\activate` on Windows
|
|
54
|
+
|
|
55
|
+
# Install in development mode with dev dependencies
|
|
56
|
+
pip install -e ".[dev]"
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
### Running Tests
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
# Run unit tests only (no Docker or API keys required)
|
|
63
|
+
pytest -m "not integration"
|
|
64
|
+
|
|
65
|
+
# Run all tests (requires Docker and API keys)
|
|
66
|
+
pytest
|
|
67
|
+
|
|
68
|
+
# Run with coverage
|
|
69
|
+
pytest --cov=mcpbr
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### Code Style
|
|
73
|
+
|
|
74
|
+
This project uses:
|
|
75
|
+
|
|
76
|
+
- [ruff](https://github.com/astral-sh/ruff) for linting and formatting
|
|
77
|
+
- Type hints throughout the codebase
|
|
78
|
+
|
|
79
|
+
Before submitting a PR:
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
# Run linter
|
|
83
|
+
ruff check src/ tests/
|
|
84
|
+
|
|
85
|
+
# Auto-fix issues
|
|
86
|
+
ruff check --fix src/ tests/
|
|
87
|
+
|
|
88
|
+
# Format code
|
|
89
|
+
ruff format src/ tests/
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
### Commit Messages
|
|
93
|
+
|
|
94
|
+
- Use clear, descriptive commit messages
|
|
95
|
+
- Start with a verb in the imperative mood (e.g., "Add", "Fix", "Update")
|
|
96
|
+
- Keep the first line under 72 characters
|
|
97
|
+
- Reference issues when applicable (e.g., "Fix #123")
|
|
98
|
+
|
|
99
|
+
## Project Structure
|
|
100
|
+
|
|
101
|
+
```
|
|
102
|
+
mcpbr/
|
|
103
|
+
├── src/mcpbr/ # Main package
|
|
104
|
+
│ ├── cli.py # CLI commands
|
|
105
|
+
│ ├── config.py # Configuration models
|
|
106
|
+
│ ├── harness.py # Main orchestrator
|
|
107
|
+
│ ├── harnesses.py # Agent implementations
|
|
108
|
+
│ ├── providers.py # LLM provider abstractions
|
|
109
|
+
│ └── ...
|
|
110
|
+
├── tests/ # Test suite
|
|
111
|
+
├── config/ # Example configurations
|
|
112
|
+
└── docs/ # Documentation
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
## Adding New Features
|
|
116
|
+
|
|
117
|
+
### Adding a New Provider
|
|
118
|
+
|
|
119
|
+
1. Create a new class in `src/mcpbr/providers.py` implementing the `ModelProvider` protocol
|
|
120
|
+
2. Add it to `PROVIDER_REGISTRY`
|
|
121
|
+
3. Update `VALID_PROVIDERS` in `config.py`
|
|
122
|
+
4. Add tests
|
|
123
|
+
5. Update documentation
|
|
124
|
+
|
|
125
|
+
### Adding a New Agent Harness
|
|
126
|
+
|
|
127
|
+
1. Create a new class in `src/mcpbr/harnesses.py` implementing the `AgentHarness` protocol
|
|
128
|
+
2. Add it to `HARNESS_REGISTRY`
|
|
129
|
+
3. Update `VALID_HARNESSES` in `config.py`
|
|
130
|
+
4. Add tests
|
|
131
|
+
5. Update documentation
|
|
132
|
+
|
|
133
|
+
## Questions?
|
|
134
|
+
|
|
135
|
+
Feel free to open an issue for any questions about contributing.
|
mcpbr-0.1.0/Dockerfile
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# Base image for SWE-bench task environments
|
|
2
|
+
FROM python:3.11-slim
|
|
3
|
+
|
|
4
|
+
# Install system dependencies
|
|
5
|
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
6
|
+
git \
|
|
7
|
+
curl \
|
|
8
|
+
build-essential \
|
|
9
|
+
&& rm -rf /var/lib/apt/lists/*
|
|
10
|
+
|
|
11
|
+
# Set up workspace
|
|
12
|
+
WORKDIR /workspace
|
|
13
|
+
|
|
14
|
+
# Install common Python testing tools
|
|
15
|
+
RUN pip install --no-cache-dir \
|
|
16
|
+
pytest \
|
|
17
|
+
pytest-xdist \
|
|
18
|
+
coverage
|
|
19
|
+
|
|
20
|
+
# Default command
|
|
21
|
+
CMD ["/bin/bash"]
|