axor-benchmarks 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,95 @@
1
+ name: CI/CD
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ tags: ["v*.*.*"]
7
+ pull_request:
8
+ branches: [main]
9
+
10
+ jobs:
11
+ test:
12
+ name: Test (Python ${{ matrix.python-version }})
13
+ runs-on: ubuntu-latest
14
+ strategy:
15
+ matrix:
16
+ python-version: ["3.11", "3.12"]
17
+
18
+ steps:
19
+ - uses: actions/checkout@v4
20
+
21
+ - name: Checkout axor-core
22
+ uses: actions/checkout@v4
23
+ with:
24
+ repository: ${{ github.repository_owner }}/axor-core
25
+ path: axor-core
26
+
27
+ - name: Checkout axor-claude
28
+ uses: actions/checkout@v4
29
+ with:
30
+ repository: ${{ github.repository_owner }}/axor-claude
31
+ path: axor-claude
32
+
33
+ - uses: actions/setup-python@v5
34
+ with:
35
+ python-version: ${{ matrix.python-version }}
36
+ cache: pip
37
+
38
+ - name: Install
39
+ run: |
40
+ pip install -e axor-core/
41
+ pip install -e axor-claude/
42
+ pip install -e ".[dev]"
43
+
44
+ - name: Run tests
45
+ run: pytest -q
46
+
47
+ publish:
48
+ name: Publish to PyPI
49
+ needs: test
50
+ runs-on: ubuntu-latest
51
+ if: startsWith(github.ref, 'refs/tags/v')
52
+ environment: pypi
53
+
54
+ permissions:
55
+ id-token: write
56
+
57
+ steps:
58
+ - uses: actions/checkout@v4
59
+
60
+ - uses: actions/setup-python@v5
61
+ with:
62
+ python-version: "3.12"
63
+
64
+ - name: Verify tag matches package version
65
+ run: |
66
+ python - << 'EOF'
67
+ import pathlib
68
+ import re
69
+ import sys
70
+ import tomllib
71
+
72
+ ref = "${{ github.ref_name }}"
73
+ m = re.fullmatch(r"v(\d+\.\d+\.\d+)", ref)
74
+ if not m:
75
+ print(f"Tag {ref!r} must match vX.Y.Z")
76
+ sys.exit(1)
77
+
78
+ tag_version = m.group(1)
79
+ data = tomllib.loads(pathlib.Path("pyproject.toml").read_text(encoding="utf-8"))
80
+ pkg_version = data["project"]["version"]
81
+
82
+ if tag_version != pkg_version:
83
+ print(f"Version mismatch: tag={tag_version}, pyproject={pkg_version}")
84
+ sys.exit(1)
85
+
86
+ print(f"Version check passed: {pkg_version}")
87
+ EOF
88
+
89
+ - name: Build
90
+ run: |
91
+ pip install hatchling build
92
+ python -m build
93
+
94
+ - name: Publish to PyPI
95
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,8 @@
1
+ __pycache__/
2
+ *.pyc
3
+ *.egg-info/
4
+ dist/
5
+ build/
6
+ .venv/
7
+ results/
8
+ *.json
@@ -0,0 +1,17 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Axor Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
@@ -0,0 +1,164 @@
1
+ Metadata-Version: 2.4
2
+ Name: axor-benchmarks
3
+ Version: 0.1.1
4
+ Summary: Benchmark governed vs raw Claude on your codebase
5
+ Project-URL: Repository, https://github.com/Bucha11/axor-benchmarks
6
+ Project-URL: Bug Tracker, https://github.com/Bucha11/axor-benchmarks/issues
7
+ Project-URL: Changelog, https://github.com/Bucha11/axor-benchmarks/releases
8
+ License: MIT
9
+ License-File: LICENSE
10
+ Keywords: agents,axor,benchmark,claude,llm
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Environment :: Console
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Requires-Python: >=3.11
18
+ Requires-Dist: anthropic>=0.40.0
19
+ Requires-Dist: axor-claude>=0.1.0
20
+ Requires-Dist: axor-core>=0.1.0
21
+ Provides-Extra: dev
22
+ Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
23
+ Requires-Dist: pytest>=8.0; extra == 'dev'
24
+ Description-Content-Type: text/markdown
25
+
26
+ # axor-benchmarks
27
+
28
+ [![CI](https://github.com/Bucha11/axor-benchmarks/actions/workflows/ci.yml/badge.svg)](https://github.com/Bucha11/axor-benchmarks/actions/workflows/ci.yml)
29
+ [![PyPI](https://img.shields.io/pypi/v/axor-benchmarks)](https://pypi.org/project/axor-benchmarks/)
30
+ [![Python](https://img.shields.io/pypi/pyversions/axor-benchmarks)](https://pypi.org/project/axor-benchmarks/)
31
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
32
+
33
+ **Benchmark governed (axor) vs raw Claude on your codebase.**
34
+
35
+ Measures real token savings, latency, and federation across 4 benchmark suites on any Python project.
36
+
37
+ ---
38
+
39
+ ## Installation
40
+
41
+ ```bash
42
+ pip install axor-benchmarks
43
+ ```
44
+
45
+ ---
46
+
47
+ ## Quick Start
48
+
49
+ ```bash
50
+ cd ~/my-project
51
+ axor-bench
52
+ ```
53
+
54
+ Output:
55
+ ```
56
+ axor benchmark results
57
+ repo: ~/my-project
58
+ file: src/auth.py
59
+
60
+ task raw tokens governed savings bar policy
61
+ ─────────────────────────────────────────────────────────────────────────────────
62
+ write_test 1,842 1,203 -34.7% ████████░░░░░░░░ focused_generative
63
+ explain_function 1,105 891 -19.4% ███░░░░░░░░░░░░░ focused_readonly
64
+ find_bugs 1,290 978 -24.2% ████░░░░░░░░░░░░ focused_readonly
65
+ ─────────────────────────────────────────────────────────────────────────────────
66
+ TOTAL 4,237 3,072 -27.5% ████░░░░░░░░░░░░
67
+
68
+ insights
69
+ → Token reduction: 27.5% (4,237 → 3,072 tokens)
70
+ → Most used policy: focused_readonly (2 tasks)
71
+ ```
72
+
73
+ ---
74
+
75
+ ## Authentication
76
+
77
+ Priority order (highest to lowest):
78
+
79
+ 1. `--api-key sk-ant-...` flag
80
+ 2. `ANTHROPIC_API_KEY` env var
81
+ 3. `~/.axor/config.toml` (set via `axor claude → /auth`)
82
+
83
+ ```bash
84
+ # Use env var
85
+ ANTHROPIC_API_KEY=sk-ant-... axor-bench
86
+
87
+ # Use flag (not saved)
88
+ axor-bench --api-key sk-ant-...
89
+
90
+ # Use saved key from axor-cli
91
+ axor claude # → /auth → saves to ~/.axor/config.toml
92
+ axor-bench # reads automatically
93
+ ```
94
+
95
+ ---
96
+
97
+ ## Suites
98
+
99
+ | Suite | Tasks | What it measures |
100
+ |-------|-------|-----------------|
101
+ | `quick` | 1 task | Fast sanity check (~30s) |
102
+ | `small` | 3 tasks | Single-turn focused tasks |
103
+ | `large` | 2 tasks | Multi-tool, multi-step tasks |
104
+ | `conversation` | 1 × 10 turns | Context growth over long sessions |
105
+ | `federation` | 1 task | Child agent spawning + isolation |
106
+ | `full` | all | Complete benchmark (~5-10 min) |
107
+
108
+ ```bash
109
+ axor-bench --suite small # fast
110
+ axor-bench --suite full # complete
111
+ axor-bench --suite conversation # test context compression
112
+ axor-bench --suite federation # test child agents
113
+ ```
114
+
115
+ ---
116
+
117
+ ## Options
118
+
119
+ ```
120
+ axor-bench [options]
121
+
122
+ --api-key KEY Anthropic API key
123
+ --repo PATH Repo to benchmark (default: current dir)
124
+ --file PATH Specific file to use as context
125
+ --suite SUITE quick | small | large | conversation | federation | full
126
+ --no-raw Skip raw Claude baseline (governed only)
127
+ --output FORMAT table (default) | json
128
+ ```
129
+
130
+ ---
131
+
132
+ ## What is measured
133
+
134
+ **Raw Claude** — direct Anthropic API call with no governance:
135
+ - Full conversation history passed every turn
136
+ - No context compression
137
+ - No policy selection
138
+ - No tool governance
139
+
140
+ **Governed (axor)** — same task via GovernedSession:
141
+ - Dynamic policy based on task (focused_readonly, moderate_mutative, etc.)
142
+ - Context shaped and compressed per turn
143
+ - Waste elimination (dedup, error collapse, prose summarization)
144
+ - Session-scoped cache (no re-reading same file twice)
145
+
146
+ **Token savings** = `(raw - governed) / raw × 100%`
147
+
148
+ Positive = governed uses fewer tokens (expected for most tasks).
149
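+ Negative = governed uses more (possible for very simple tasks where overhead > savings). These signs refer to the formula above; the sample results table in Quick Start instead shows the change relative to raw, so a 34.7% saving is printed there as `-34.7%`.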
150
+
151
+ ---
152
+
153
+ ## Requirements
154
+
155
+ - Python 3.11+
156
+ - `axor-core >= 0.1.0`
157
+ - `axor-claude >= 0.1.0`
158
+ - `anthropic >= 0.40.0`
159
+
160
+ ---
161
+
162
+ ## License
163
+
164
+ MIT
@@ -0,0 +1,139 @@
1
+ # axor-benchmarks
2
+
3
+ [![CI](https://github.com/Bucha11/axor-benchmarks/actions/workflows/ci.yml/badge.svg)](https://github.com/Bucha11/axor-benchmarks/actions/workflows/ci.yml)
4
+ [![PyPI](https://img.shields.io/pypi/v/axor-benchmarks)](https://pypi.org/project/axor-benchmarks/)
5
+ [![Python](https://img.shields.io/pypi/pyversions/axor-benchmarks)](https://pypi.org/project/axor-benchmarks/)
6
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
7
+
8
+ **Benchmark governed (axor) vs raw Claude on your codebase.**
9
+
10
+ Measures real token savings, latency, and federation across 4 benchmark suites on any Python project.
11
+
12
+ ---
13
+
14
+ ## Installation
15
+
16
+ ```bash
17
+ pip install axor-benchmarks
18
+ ```
19
+
20
+ ---
21
+
22
+ ## Quick Start
23
+
24
+ ```bash
25
+ cd ~/my-project
26
+ axor-bench
27
+ ```
28
+
29
+ Output:
30
+ ```
31
+ axor benchmark results
32
+ repo: ~/my-project
33
+ file: src/auth.py
34
+
35
+ task raw tokens governed savings bar policy
36
+ ─────────────────────────────────────────────────────────────────────────────────
37
+ write_test 1,842 1,203 -34.7% ████████░░░░░░░░ focused_generative
38
+ explain_function 1,105 891 -19.4% ███░░░░░░░░░░░░░ focused_readonly
39
+ find_bugs 1,290 978 -24.2% ████░░░░░░░░░░░░ focused_readonly
40
+ ─────────────────────────────────────────────────────────────────────────────────
41
+ TOTAL 4,237 3,072 -27.5% ████░░░░░░░░░░░░
42
+
43
+ insights
44
+ → Token reduction: 27.5% (4,237 → 3,072 tokens)
45
+ → Most used policy: focused_readonly (2 tasks)
46
+ ```
47
+
48
+ ---
49
+
50
+ ## Authentication
51
+
52
+ Priority order (highest to lowest):
53
+
54
+ 1. `--api-key sk-ant-...` flag
55
+ 2. `ANTHROPIC_API_KEY` env var
56
+ 3. `~/.axor/config.toml` (set via `axor claude → /auth`)
57
+
58
+ ```bash
59
+ # Use env var
60
+ ANTHROPIC_API_KEY=sk-ant-... axor-bench
61
+
62
+ # Use flag (not saved)
63
+ axor-bench --api-key sk-ant-...
64
+
65
+ # Use saved key from axor-cli
66
+ axor claude # → /auth → saves to ~/.axor/config.toml
67
+ axor-bench # reads automatically
68
+ ```
69
+
70
+ ---
71
+
72
+ ## Suites
73
+
74
+ | Suite | Tasks | What it measures |
75
+ |-------|-------|-----------------|
76
+ | `quick` | 1 task | Fast sanity check (~30s) |
77
+ | `small` | 3 tasks | Single-turn focused tasks |
78
+ | `large` | 2 tasks | Multi-tool, multi-step tasks |
79
+ | `conversation` | 1 × 10 turns | Context growth over long sessions |
80
+ | `federation` | 1 task | Child agent spawning + isolation |
81
+ | `full` | all | Complete benchmark (~5-10 min) |
82
+
83
+ ```bash
84
+ axor-bench --suite small # fast
85
+ axor-bench --suite full # complete
86
+ axor-bench --suite conversation # test context compression
87
+ axor-bench --suite federation # test child agents
88
+ ```
89
+
90
+ ---
91
+
92
+ ## Options
93
+
94
+ ```
95
+ axor-bench [options]
96
+
97
+ --api-key KEY Anthropic API key
98
+ --repo PATH Repo to benchmark (default: current dir)
99
+ --file PATH Specific file to use as context
100
+ --suite SUITE quick | small | large | conversation | federation | full
101
+ --no-raw Skip raw Claude baseline (governed only)
102
+ --output FORMAT table (default) | json
103
+ ```
104
+
105
+ ---
106
+
107
+ ## What is measured
108
+
109
+ **Raw Claude** — direct Anthropic API call with no governance:
110
+ - Full conversation history passed every turn
111
+ - No context compression
112
+ - No policy selection
113
+ - No tool governance
114
+
115
+ **Governed (axor)** — same task via GovernedSession:
116
+ - Dynamic policy based on task (focused_readonly, moderate_mutative, etc.)
117
+ - Context shaped and compressed per turn
118
+ - Waste elimination (dedup, error collapse, prose summarization)
119
+ - Session-scoped cache (no re-reading same file twice)
120
+
121
+ **Token savings** = `(raw - governed) / raw × 100%`
122
+
123
+ Positive = governed uses fewer tokens (expected for most tasks).
124
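+ Negative = governed uses more (possible for very simple tasks where overhead > savings). These signs refer to the formula above; the sample results table in Quick Start instead shows the change relative to raw, so a 34.7% saving is printed there as `-34.7%`.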
125
+
126
+ ---
127
+
128
+ ## Requirements
129
+
130
+ - Python 3.11+
131
+ - `axor-core >= 0.1.0`
132
+ - `axor-claude >= 0.1.0`
133
+ - `anthropic >= 0.40.0`
134
+
135
+ ---
136
+
137
+ ## License
138
+
139
+ MIT
@@ -0,0 +1,161 @@
1
+ # API Key Module Refactoring Summary
2
+
3
+ ## Overview
4
+ Refactored `api_key.py` to improve readability, maintainability, and type safety while preserving the existing public interface.
5
+
6
+ ## Key Improvements
7
+
8
+ ### 1. **Enhanced Type Hints**
9
+ - Added `Final` type hints for constants to prevent accidental modification
10
+ - Added explicit return types to all functions
11
+ - Added parameter type hints where missing
12
+ - Used `dict[str, Any]` instead of generic dict
13
+
14
+ **Before:**
15
+ ```python
16
+ CONFIG_DIR = Path.home() / ".axor"
17
+ CONFIG_FILE = CONFIG_DIR / "config.toml"
18
+
19
+ _ENV_VARS = {
20
+ "claude": "ANTHROPIC_API_KEY",
21
+ "openai": "OPENAI_API_KEY",
22
+ }
23
+ ```
24
+
25
+ **After:**
26
+ ```python
27
+ CONFIG_DIR: Final[Path] = Path.home() / ".axor"
28
+ CONFIG_FILE: Final[Path] = CONFIG_DIR / "config.toml"
29
+
30
+ _ENV_VARS: Final[dict[str, str]] = {
31
+ "claude": "ANTHROPIC_API_KEY",
32
+ "openai": "OPENAI_API_KEY",
33
+ }
34
+ ```
35
+
36
+ ### 2. **Improved Code Organization**
37
+ - Moved all public functions to the top
38
+ - Grouped private helper functions at the bottom with clear section comment
39
+ - Consistent ordering: constants → public API → private helpers
40
+
41
+ ### 3. **Better Separation of Concerns**
42
+ - Extracted TOML serialization logic into `_serialize_to_toml()`
43
+ - Split complex `prompt_and_save()` into smaller, focused functions:
44
+ - `_print_prompt_header()` - Display prompt information
45
+ - `_prompt_for_key()` - Get API key from user
46
+ - `_prompt_to_save()` - Ask about saving to config
47
+ - `_save_key_to_config()` - Save and display result
48
+ - Created `_load_existing_config()` to deduplicate config loading logic
49
+
50
+ ### 4. **Enhanced Documentation**
51
+ - Added comprehensive docstrings to all functions (public and private)
52
+ - Included Args, Returns, and Raises sections where appropriate
53
+ - Added inline comments explaining non-obvious behavior
54
+ - Clarified the priority chain in `resolve_api_key()` docstring
55
+
56
+ ### 5. **Improved Error Handling**
57
+ - Better tracking of file descriptors in `_write_config()`
58
+ - Explicit cleanup of temp files on error
59
+ - Clear separation between expected failures (return None) and exceptional failures (raise)
60
+
61
+ **Before:**
62
+ ```python
63
+ def _write_config(data: dict[str, Any]) -> None:
64
+ fd, tmp = tempfile.mkstemp(dir=CONFIG_DIR, prefix=".axor_cfg_")
65
+ try:
66
+ with os.fdopen(fd, "w") as f:
67
+ f.write("\n".join(lines))
68
+ os.replace(tmp, CONFIG_FILE)
69
+ CONFIG_FILE.chmod(stat.S_IRUSR | stat.S_IWUSR)
70
+ except Exception:
71
+ if os.path.exists(tmp):
72
+ os.unlink(tmp)
73
+ raise
74
+ ```
75
+
76
+ **After:**
77
+ ```python
78
+ def _write_config(data: dict[str, Any]) -> None:
79
+ toml_content = _serialize_to_toml(data)
80
+
81
+ fd = -1
82
+ tmp_path = ""
83
+ try:
84
+ fd, tmp_path = tempfile.mkstemp(dir=CONFIG_DIR, prefix=".axor_cfg_")
85
+ with os.fdopen(fd, "w") as f:
86
+ f.write(toml_content)
87
+ fd = -1 # Mark as closed
88
+
89
+ os.replace(tmp_path, CONFIG_FILE)
90
+ CONFIG_FILE.chmod(stat.S_IRUSR | stat.S_IWUSR)
91
+ except Exception:
92
+ if fd != -1:
93
+ try:
94
+ os.close(fd)
95
+ except OSError:
96
+ pass
97
+ if tmp_path and os.path.exists(tmp_path):
98
+ os.unlink(tmp_path)
99
+ raise
100
+ ```
101
+
102
+ ### 6. **Named Constants**
103
+ - Introduced `_TOML_API_KEY_FIELD` constant for "api_key" field name
104
+ - Prevents typos and makes future changes easier
105
+
106
+ ### 7. **Pathlib Consistency**
107
+ - Used `Path.open()` instead of mixing `open()` with Path objects
108
+ - More idiomatic pathlib usage throughout
109
+
110
+ **Before:**
111
+ ```python
112
+ with open(CONFIG_FILE, "rb") as f:
113
+ config = tomllib.load(f)
114
+ ```
115
+
116
+ **After:**
117
+ ```python
118
+ with CONFIG_FILE.open("rb") as f:
119
+ config = tomllib.load(f)
120
+ ```
121
+
122
+ ### 8. **Quote Escaping in TOML**
123
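+ - Added escaping of double quotes in values so that a `"` inside a value no longer produces a TOML syntax error (only `"` is escaped by this change; backslashes are written as-is)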
124
+ - More robust serialization
125
+
126
+ **Before:**
127
+ ```python
128
+ lines.append(f'{key} = "{val}"')
129
+ ```
130
+
131
+ **After:**
132
+ ```python
133
+ escaped_val = str(val).replace('"', r'\"')
134
+ lines.append(f'{key} = "{escaped_val}"')
135
+ ```
136
+
137
+ ## Public Interface Preserved
138
+
139
+ All public functions maintain their exact signatures:
140
+ - `resolve_api_key(adapter: str, flag_key: str | None = None) -> str | None`
141
+ - `load_from_config(adapter: str) -> str | None`
142
+ - `save_to_config(adapter: str, api_key: str) -> None`
143
+ - `clear_from_config(adapter: str) -> bool`
144
+ - `prompt_and_save(adapter: str) -> str | None`
145
+
146
+ ## Testing
147
+
148
+ All existing functionality verified:
149
+ - ✓ Module imports successfully
150
+ - ✓ Flag-based key resolution
151
+ - ✓ Environment variable resolution
152
+ - ✓ Config file save/load
153
+ - ✓ Key clearing
154
+
155
+ ## Benefits
156
+
157
+ 1. **Maintainability**: Smaller, focused functions are easier to understand and modify
158
+ 2. **Testability**: Each helper function can be tested independently
159
+ 3. **Type Safety**: Better IDE support and early error detection
160
+ 4. **Readability**: Clear structure and comprehensive documentation
161
+ 5. **Robustness**: Improved error handling and edge case coverage