arrowspace_tuner 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arrowspace_tuner-0.2.0/.github/workflows/ci.yml +57 -0
- arrowspace_tuner-0.2.0/.gitignore +26 -0
- arrowspace_tuner-0.2.0/.python-version +1 -0
- arrowspace_tuner-0.2.0/CHANGELOG.md +26 -0
- arrowspace_tuner-0.2.0/CONTRIBUTING.md +56 -0
- arrowspace_tuner-0.2.0/LICENSE +13 -0
- arrowspace_tuner-0.2.0/PKG-INFO +154 -0
- arrowspace_tuner-0.2.0/README.md +119 -0
- arrowspace_tuner-0.2.0/notebooks/quickstart.ipynb +0 -0
- arrowspace_tuner-0.2.0/pyproject.toml +110 -0
- arrowspace_tuner-0.2.0/results/.gitkeep +0 -0
- arrowspace_tuner-0.2.0/scripts/test_eval.py +83 -0
- arrowspace_tuner-0.2.0/src/arrowspace_tuner/__init__.py +40 -0
- arrowspace_tuner-0.2.0/src/arrowspace_tuner/api.py +132 -0
- arrowspace_tuner-0.2.0/src/arrowspace_tuner/core/__init__.py +26 -0
- arrowspace_tuner-0.2.0/src/arrowspace_tuner/core/config.py +139 -0
- arrowspace_tuner-0.2.0/src/arrowspace_tuner/core/graph.py +159 -0
- arrowspace_tuner-0.2.0/src/arrowspace_tuner/core/objective.py +370 -0
- arrowspace_tuner-0.2.0/src/arrowspace_tuner/py.typed +0 -0
- arrowspace_tuner-0.2.0/src/arrowspace_tuner/reporting/__init__.py +13 -0
- arrowspace_tuner-0.2.0/src/arrowspace_tuner/reporting/reporter.py +188 -0
- arrowspace_tuner-0.2.0/src/arrowspace_tuner/tuner.py +382 -0
- arrowspace_tuner-0.2.0/tests/conftest.py +115 -0
- arrowspace_tuner-0.2.0/tests/test_objective.py +232 -0
- arrowspace_tuner-0.2.0/tests/test_tuner.py +292 -0
- arrowspace_tuner-0.2.0/uv.lock +3167 -0
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
lint:
|
|
11
|
+
name: ruff + mypy
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
strategy:
|
|
14
|
+
matrix:
|
|
15
|
+
python-version: ["3.12", "3.13"]
|
|
16
|
+
|
|
17
|
+
steps:
|
|
18
|
+
- uses: actions/checkout@v4
|
|
19
|
+
|
|
20
|
+
- name: Install uv
|
|
21
|
+
uses: astral-sh/setup-uv@v3
|
|
22
|
+
with:
|
|
23
|
+
version: "latest"
|
|
24
|
+
|
|
25
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
26
|
+
run: uv python install ${{ matrix.python-version }}
|
|
27
|
+
|
|
28
|
+
- name: Install dependencies (project itself not installed — lint only)
|
|
29
|
+
run: uv sync --extra dev --no-install-project
|
|
30
|
+
|
|
31
|
+
- name: Lint (ruff)
|
|
32
|
+
run: uv run ruff check src
|
|
33
|
+
|
|
34
|
+
- name: Type check (mypy)
|
|
35
|
+
run: uv run mypy src
|
|
36
|
+
|
|
37
|
+
test:
|
|
38
|
+
name: pytest
|
|
39
|
+
runs-on: ubuntu-latest
|
|
40
|
+
needs: lint
|
|
41
|
+
|
|
42
|
+
steps:
|
|
43
|
+
- uses: actions/checkout@v4
|
|
44
|
+
|
|
45
|
+
- name: Install uv
|
|
46
|
+
uses: astral-sh/setup-uv@v3
|
|
47
|
+
with:
|
|
48
|
+
version: "latest"
|
|
49
|
+
|
|
50
|
+
- name: Set up Python 3.12
|
|
51
|
+
run: uv python install 3.12
|
|
52
|
+
|
|
53
|
+
- name: Install project + dev dependencies
|
|
54
|
+
run: uv sync --extra dev
|
|
55
|
+
|
|
56
|
+
- name: Run tests
|
|
57
|
+
run: uv run pytest -v --cov=arrowspace_tuner --cov-report=term-missing
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# Python-generated files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[oc]
|
|
4
|
+
build/
|
|
5
|
+
dist/
|
|
6
|
+
wheels/
|
|
7
|
+
*.egg-info
|
|
8
|
+
|
|
9
|
+
# Virtual environments
|
|
10
|
+
.venv
|
|
11
|
+
|
|
12
|
+
# Test artefacts
|
|
13
|
+
.coverage
|
|
14
|
+
htmlcov/
|
|
15
|
+
.pytest_cache/
|
|
16
|
+
|
|
17
|
+
# Local experiment outputs — keep folder, ignore contents
|
|
18
|
+
results/*
|
|
19
|
+
!results/.gitkeep
|
|
20
|
+
|
|
21
|
+
# Local data
|
|
22
|
+
data/
|
|
23
|
+
|
|
24
|
+
.ruff_cache/
|
|
25
|
+
.mypy_cache/
|
|
26
|
+
ruff_errors.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.13
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to `arrowspace_tuner` are documented here.
|
|
4
|
+
Format follows [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
|
5
|
+
Versioning follows [Semantic Versioning](https://semver.org/).
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## [0.1.0] — 2026-04-29
|
|
10
|
+
|
|
11
|
+
Initial release.
|
|
12
|
+
|
|
13
|
+
### Added
|
|
14
|
+
|
|
15
|
+
- `EpsTuner` — main public class for hyperparameter discovery over `eps`, `k`, `tau`
|
|
16
|
+
- `arrowspace_tuner.optuna()` — one-liner convenience API: `aspace, gl = arrowspace.optuna(embeddings)`
|
|
17
|
+
- `StudyConfig` / `BuildParams` — typed dataclasses for power-user configuration
|
|
18
|
+
- Query-free spectral objective: weighted composite of MRR-Top0 proxy, Fiedler value, and lambda variance
|
|
19
|
+
- Optuna TPE sampler with pruning on degenerate graphs (NNZ ≤ N, disconnected, flat spectrum)
|
|
20
|
+
- `sample_n` subsampling: 33x speedup on 50k corpus with identical best params (validated)
|
|
21
|
+
- `storage` parameter for SQLite-backed persistence and parallel/resumed runs
|
|
22
|
+
- `tuner.save_report()` — saves `trials.csv`, `best_params.json`, and Plotly HTML plots
|
|
23
|
+
- `[report]` optional extra (pandas + plotly) — kept out of hard dependencies
|
|
24
|
+
- `py.typed` marker — PEP 561 compliant, full mypy strict mode
|
|
25
|
+
- Comprehensive test suite: `test_objective.py`, `test_tuner.py`, `conftest.py`
|
|
26
|
+
- CI workflow: pytest + ruff + mypy on every push and pull request
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# Contributing
|
|
2
|
+
|
|
3
|
+
Thank you for contributing to **arrowspace-tuner**!
|
|
4
|
+
|
|
5
|
+
## Commit Convention
|
|
6
|
+
|
|
7
|
+
We use [Conventional Commits](https://www.conventionalcommits.org/). Every commit message must follow this format:
|
|
8
|
+
|
|
9
|
+
```
|
|
10
|
+
<type>: <short summary>
|
|
11
|
+
|
|
12
|
+
[optional body]
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
### Types
|
|
16
|
+
|
|
17
|
+
| Type | When to use |
|
|
18
|
+
|---|---|
|
|
19
|
+
| `feat` | New feature or behaviour |
|
|
20
|
+
| `fix` | Bug fix |
|
|
21
|
+
| `test` | Adding or fixing tests |
|
|
22
|
+
| `refactor` | Code change with no behaviour change |
|
|
23
|
+
| `chore` | Tooling, CI, dependencies, repo hygiene |
|
|
24
|
+
| `docs` | Documentation only |
|
|
25
|
+
| `perf` | Performance improvement |
|
|
26
|
+
|
|
27
|
+
### Examples
|
|
28
|
+
|
|
29
|
+
```
|
|
30
|
+
feat: add early-stopping to EpsTuner.fit()
|
|
31
|
+
fix: catch BaseException around ArrowSpace .build() for Rust panics
|
|
32
|
+
test: add degenerate-corpus fixture for pruning paths
|
|
33
|
+
chore: update .gitignore, remove .coverage artefact
|
|
34
|
+
docs: add quickstart section to README
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
### Rules
|
|
38
|
+
|
|
39
|
+
- Summary line ≤ 72 characters
|
|
40
|
+
- Use the imperative mood: "add", not "added" or "adds"
|
|
41
|
+
- Body explains **why**, not what (the diff shows the what)
|
|
42
|
+
- Reference issues/PRs in the body: `Fixes #12`
|
|
43
|
+
|
|
44
|
+
## Branch Names
|
|
45
|
+
|
|
46
|
+
```
|
|
47
|
+
feat/<short-description>
|
|
48
|
+
fix/<short-description>
|
|
49
|
+
chore/<short-description>
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Pull Requests
|
|
53
|
+
|
|
54
|
+
- All PRs must pass CI (pytest + ruff + mypy) before merging
|
|
55
|
+
- Squash-merge into `main`
|
|
56
|
+
- PR title must follow the same conventional commit format
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
Copyright 2026 Tommaso Moriondo
|
|
2
|
+
|
|
3
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
you may not use this file except in compliance with the License.
|
|
5
|
+
You may obtain a copy of the License at
|
|
6
|
+
|
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
|
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
See the License for the specific language governing permissions and
|
|
13
|
+
limitations under the License.
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: arrowspace_tuner
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Hyperparameter discovery (eps auto-tuning) for ArrowSpace via Optuna.
|
|
5
|
+
Project-URL: Homepage, https://github.com/Genefold/arrowspace_tuner
|
|
6
|
+
Project-URL: Repository, https://github.com/Genefold/arrowspace_tuner.git
|
|
7
|
+
Author-email: Tommaso Moriondo <moriondotommaso@gmail.com>
|
|
8
|
+
License: Apache-2.0
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Keywords: arrowspace,graph-laplacian,hyperparameter-tuning,optuna,spectral-analysis,vector-search
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
18
|
+
Classifier: Typing :: Typed
|
|
19
|
+
Requires-Python: >=3.12
|
|
20
|
+
Requires-Dist: arrowspace>=0.26.0
|
|
21
|
+
Requires-Dist: numpy>=2.4.4
|
|
22
|
+
Requires-Dist: optuna>=4.8.0
|
|
23
|
+
Requires-Dist: scipy>=1.17.1
|
|
24
|
+
Provides-Extra: dev
|
|
25
|
+
Requires-Dist: mypy>=1.15; extra == 'dev'
|
|
26
|
+
Requires-Dist: pandas>=3.0.0; extra == 'dev'
|
|
27
|
+
Requires-Dist: plotly>=6.7.0; extra == 'dev'
|
|
28
|
+
Requires-Dist: pytest-cov>=5.0; extra == 'dev'
|
|
29
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
30
|
+
Requires-Dist: ruff>=0.9; extra == 'dev'
|
|
31
|
+
Provides-Extra: report
|
|
32
|
+
Requires-Dist: pandas>=3.0.0; extra == 'report'
|
|
33
|
+
Requires-Dist: plotly>=6.7.0; extra == 'report'
|
|
34
|
+
Description-Content-Type: text/markdown
|
|
35
|
+
|
|
36
|
+
# arrowspace_tuner
|
|
37
|
+
|
|
38
|
+
[CI](https://github.com/Genefold/arrowspace_tuner/actions/workflows/ci.yml)
|
|
39
|
+
[PyPI version](https://pypi.org/project/arrowspace-tuner/)
|
|
40
|
+
[Python versions](https://pypi.org/project/arrowspace-tuner/)
|
|
41
|
+
[License: Apache-2.0](LICENSE)
|
|
42
|
+
|
|
43
|
+
Hyperparameter discovery for [ArrowSpace](https://github.com/tuned-org-uk/arrowspace-rs) — automatically finds the best `eps`, `k`, and `tau` for your corpus using a query-free spectral objective.
|
|
44
|
+
|
|
45
|
+
## Why
|
|
46
|
+
|
|
47
|
+
ArrowSpace's retrieval quality depends on three graph-construction parameters:
|
|
48
|
+
|
|
49
|
+
| Parameter | What it controls |
|
|
50
|
+
|---|---|
|
|
51
|
+
| `eps` | Neighbourhood radius for graph edges |
|
|
52
|
+
| `k` | Number of nearest neighbours per node |
|
|
53
|
+
| `tau` | Search temperature (exploration vs. exploitation) |
|
|
54
|
+
|
|
55
|
+
Setting these by hand is tedious and corpus-dependent. `arrowspace_tuner` uses [Optuna](https://optuna.org/) and a label-free spectral MRR proxy to find them automatically in minutes.
|
|
56
|
+
|
|
57
|
+
## Install
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
# Core (no pandas/plotly)
|
|
61
|
+
pip install arrowspace-tuner
|
|
62
|
+
|
|
63
|
+
# With HTML/CSV reporting
|
|
64
|
+
pip install arrowspace-tuner[report]
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Quickstart
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
import numpy as np
|
|
71
|
+
import arrowspace_tuner as arrowspace
|
|
72
|
+
|
|
73
|
+
embeddings = np.load("corpus.npy") # shape (N, D) float64
|
|
74
|
+
|
|
75
|
+
# One-liner: auto-discover eps, k, tau — runs in ~15 min on 50k corpus
|
|
76
|
+
aspace, gl = arrowspace.optuna(embeddings)
|
|
77
|
+
|
|
78
|
+
# Search as normal
|
|
79
|
+
results = aspace.search(query_embedding, gl, tau=0.8)
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Power-user API
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
from arrowspace_tuner import EpsTuner
|
|
86
|
+
|
|
87
|
+
tuner = EpsTuner(
|
|
88
|
+
n_trials = 15,
|
|
89
|
+
sample_n = 5_000, # 33x faster: explore on 5k, final build on full corpus
|
|
90
|
+
eps_low = 0.8, # narrow bounds if you know your corpus geometry
|
|
91
|
+
eps_high = 2.5,
|
|
92
|
+
k_low = 15,
|
|
93
|
+
k_high = 40,
|
|
94
|
+
tau_low = 0.05,
|
|
95
|
+
tau_high = 0.5,
|
|
96
|
+
n_probe = 50,
|
|
97
|
+
storage = "sqlite:///tune.db", # resume interrupted runs
|
|
98
|
+
)
|
|
99
|
+
|
|
100
|
+
aspace, gl = tuner.fit(embeddings)
|
|
101
|
+
|
|
102
|
+
print(tuner.best_params) # {"eps": 1.615, "k": 38, "tau": 0.114}
|
|
103
|
+
print(tuner.best_score) # 2.138
|
|
104
|
+
print(tuner.best_fiedler) # 0.718 — graph connectivity health
|
|
105
|
+
print(tuner.best_mrr_proxy) # 2.896 — retrieval coherence proxy
|
|
106
|
+
|
|
107
|
+
# Save CSV + HTML plots (requires [report] extra)
|
|
108
|
+
tuner.save_report(out_dir="results")
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## Speed
|
|
112
|
+
|
|
113
|
+
The dominant cost is building the ArrowSpace graph on N vectors. With `sample_n`:
|
|
114
|
+
|
|
115
|
+
| Setting | Per trial | 15 trials | Notes |
|
|
116
|
+
|---|---|---|---|
|
|
117
|
+
| `sample_n=None` (full 50k corpus) | ~23 min | ~5.8h | baseline |
|
|
118
|
+
| `sample_n=5_000` | ~1.5 min | **~27 min** | **33x faster, same best params** |
|
|
119
|
+
|
|
120
|
+
The final build after the study always uses the full corpus.
|
|
121
|
+
|
|
122
|
+
## Objective
|
|
123
|
+
|
|
124
|
+
The objective is a weighted composite of three spectral signals — no ground-truth labels required:
|
|
125
|
+
|
|
126
|
+
```
|
|
127
|
+
score = 0.70 * mrr_top0_spectral # retrieval coherence
|
|
128
|
+
+ 0.20 * log1p(fiedler) # graph connectivity health
|
|
129
|
+
+ 0.10 * log1p(var_lambda) # spectral richness
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
## Parallel runs
|
|
133
|
+
|
|
134
|
+
Optuna + SQLite lets you run multiple workers simultaneously:
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
# Terminal 1
|
|
138
|
+
python tune.py  # your own script that calls EpsTuner(n_trials=15, storage="sqlite:///tune.db").fit(embeddings)
|
|
139
|
+
|
|
140
|
+
# Terminal 2 (simultaneously)
|
|
141
|
+
python tune.py  # your own script that calls EpsTuner(n_trials=15, storage="sqlite:///tune.db").fit(embeddings)
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
## Requirements
|
|
145
|
+
|
|
146
|
+
- Python ≥ 3.12
|
|
147
|
+
- `arrowspace >= 0.26.0`
|
|
148
|
+
- `optuna >= 4.8.0`
|
|
149
|
+
- `scipy >= 1.17.1`
|
|
150
|
+
- `numpy >= 2.4.4`
|
|
151
|
+
|
|
152
|
+
## License
|
|
153
|
+
|
|
154
|
+
Apache-2.0 — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
# arrowspace_tuner
|
|
2
|
+
|
|
3
|
+
[CI](https://github.com/Genefold/arrowspace_tuner/actions/workflows/ci.yml)
|
|
4
|
+
[PyPI version](https://pypi.org/project/arrowspace-tuner/)
|
|
5
|
+
[Python versions](https://pypi.org/project/arrowspace-tuner/)
|
|
6
|
+
[License: Apache-2.0](LICENSE)
|
|
7
|
+
|
|
8
|
+
Hyperparameter discovery for [ArrowSpace](https://github.com/tuned-org-uk/arrowspace-rs) — automatically finds the best `eps`, `k`, and `tau` for your corpus using a query-free spectral objective.
|
|
9
|
+
|
|
10
|
+
## Why
|
|
11
|
+
|
|
12
|
+
ArrowSpace's retrieval quality depends on three graph-construction parameters:
|
|
13
|
+
|
|
14
|
+
| Parameter | What it controls |
|
|
15
|
+
|---|---|
|
|
16
|
+
| `eps` | Neighbourhood radius for graph edges |
|
|
17
|
+
| `k` | Number of nearest neighbours per node |
|
|
18
|
+
| `tau` | Search temperature (exploration vs. exploitation) |
|
|
19
|
+
|
|
20
|
+
Setting these by hand is tedious and corpus-dependent. `arrowspace_tuner` uses [Optuna](https://optuna.org/) and a label-free spectral MRR proxy to find them automatically in minutes.
|
|
21
|
+
|
|
22
|
+
## Install
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
# Core (no pandas/plotly)
|
|
26
|
+
pip install arrowspace-tuner
|
|
27
|
+
|
|
28
|
+
# With HTML/CSV reporting
|
|
29
|
+
pip install arrowspace-tuner[report]
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Quickstart
|
|
33
|
+
|
|
34
|
+
```python
|
|
35
|
+
import numpy as np
|
|
36
|
+
import arrowspace_tuner as arrowspace
|
|
37
|
+
|
|
38
|
+
embeddings = np.load("corpus.npy") # shape (N, D) float64
|
|
39
|
+
|
|
40
|
+
# One-liner: auto-discover eps, k, tau — runs in ~15 min on 50k corpus
|
|
41
|
+
aspace, gl = arrowspace.optuna(embeddings)
|
|
42
|
+
|
|
43
|
+
# Search as normal
|
|
44
|
+
results = aspace.search(query_embedding, gl, tau=0.8)
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Power-user API
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
from arrowspace_tuner import EpsTuner
|
|
51
|
+
|
|
52
|
+
tuner = EpsTuner(
|
|
53
|
+
n_trials = 15,
|
|
54
|
+
sample_n = 5_000, # 33x faster: explore on 5k, final build on full corpus
|
|
55
|
+
eps_low = 0.8, # narrow bounds if you know your corpus geometry
|
|
56
|
+
eps_high = 2.5,
|
|
57
|
+
k_low = 15,
|
|
58
|
+
k_high = 40,
|
|
59
|
+
tau_low = 0.05,
|
|
60
|
+
tau_high = 0.5,
|
|
61
|
+
n_probe = 50,
|
|
62
|
+
storage = "sqlite:///tune.db", # resume interrupted runs
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
aspace, gl = tuner.fit(embeddings)
|
|
66
|
+
|
|
67
|
+
print(tuner.best_params) # {"eps": 1.615, "k": 38, "tau": 0.114}
|
|
68
|
+
print(tuner.best_score) # 2.138
|
|
69
|
+
print(tuner.best_fiedler) # 0.718 — graph connectivity health
|
|
70
|
+
print(tuner.best_mrr_proxy) # 2.896 — retrieval coherence proxy
|
|
71
|
+
|
|
72
|
+
# Save CSV + HTML plots (requires [report] extra)
|
|
73
|
+
tuner.save_report(out_dir="results")
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Speed
|
|
77
|
+
|
|
78
|
+
The dominant cost is building the ArrowSpace graph on N vectors. With `sample_n`:
|
|
79
|
+
|
|
80
|
+
| Setting | Per trial | 15 trials | Notes |
|
|
81
|
+
|---|---|---|---|
|
|
82
|
+
| `sample_n=None` (full 50k corpus) | ~23 min | ~5.8h | baseline |
|
|
83
|
+
| `sample_n=5_000` | ~1.5 min | **~27 min** | **33x faster, same best params** |
|
|
84
|
+
|
|
85
|
+
The final build after the study always uses the full corpus.
|
|
86
|
+
|
|
87
|
+
## Objective
|
|
88
|
+
|
|
89
|
+
The objective is a weighted composite of three spectral signals — no ground-truth labels required:
|
|
90
|
+
|
|
91
|
+
```
|
|
92
|
+
score = 0.70 * mrr_top0_spectral # retrieval coherence
|
|
93
|
+
+ 0.20 * log1p(fiedler) # graph connectivity health
|
|
94
|
+
+ 0.10 * log1p(var_lambda) # spectral richness
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## Parallel runs
|
|
98
|
+
|
|
99
|
+
Optuna + SQLite lets you run multiple workers simultaneously:
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
# Terminal 1
|
|
103
|
+
python tune.py  # your own script that calls EpsTuner(n_trials=15, storage="sqlite:///tune.db").fit(embeddings)
|
|
104
|
+
|
|
105
|
+
# Terminal 2 (simultaneously)
|
|
106
|
+
python tune.py  # your own script that calls EpsTuner(n_trials=15, storage="sqlite:///tune.db").fit(embeddings)
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## Requirements
|
|
110
|
+
|
|
111
|
+
- Python ≥ 3.12
|
|
112
|
+
- `arrowspace >= 0.26.0`
|
|
113
|
+
- `optuna >= 4.8.0`
|
|
114
|
+
- `scipy >= 1.17.1`
|
|
115
|
+
- `numpy >= 2.4.4`
|
|
116
|
+
|
|
117
|
+
## License
|
|
118
|
+
|
|
119
|
+
Apache-2.0 — see [LICENSE](LICENSE).
|
|
File without changes
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "arrowspace_tuner"
|
|
7
|
+
version = "0.2.0"
|
|
8
|
+
description = "Hyperparameter discovery (eps auto-tuning) for ArrowSpace via Optuna."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = { text = "Apache-2.0" }
|
|
11
|
+
authors = [
|
|
12
|
+
{ name = "Tommaso Moriondo", email = "moriondotommaso@gmail.com" },
|
|
13
|
+
]
|
|
14
|
+
requires-python = ">=3.12"
|
|
15
|
+
keywords = [
|
|
16
|
+
"vector-search",
|
|
17
|
+
"spectral-analysis",
|
|
18
|
+
"hyperparameter-tuning",
|
|
19
|
+
"optuna",
|
|
20
|
+
"arrowspace",
|
|
21
|
+
"graph-laplacian",
|
|
22
|
+
]
|
|
23
|
+
classifiers = [
|
|
24
|
+
"Development Status :: 3 - Alpha",
|
|
25
|
+
"Intended Audience :: Developers",
|
|
26
|
+
"Intended Audience :: Science/Research",
|
|
27
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
28
|
+
"Programming Language :: Python :: 3",
|
|
29
|
+
"Programming Language :: Python :: 3.12",
|
|
30
|
+
"Programming Language :: Python :: 3.13",
|
|
31
|
+
"Typing :: Typed",
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
# ── Hard dependencies ──────────────────────────────────────────────────────
|
|
35
|
+
# These are always required: the Rust wheel, Optuna, SciPy for Fiedler, NumPy.
|
|
36
|
+
# plotly and pandas are NOT here — they are opt-in via [report].
|
|
37
|
+
dependencies = [
|
|
38
|
+
"arrowspace>=0.26.0",
|
|
39
|
+
"numpy>=2.4.4",
|
|
40
|
+
"optuna>=4.8.0",
|
|
41
|
+
"scipy>=1.17.1",
|
|
42
|
+
]
|
|
43
|
+
|
|
44
|
+
# ── Optional extras ─────────────────────────────────────────────────────────
|
|
45
|
+
[project.optional-dependencies]
|
|
46
|
+
|
|
47
|
+
# pip install arrowspace_tuner[report]
|
|
48
|
+
# Needed for tuner.save_report() and all HTML/CSV output from reporter.py
|
|
49
|
+
report = [
|
|
50
|
+
"plotly>=6.7.0",
|
|
51
|
+
"pandas>=3.0.0",
|
|
52
|
+
]
|
|
53
|
+
|
|
54
|
+
# pip install arrowspace_tuner[dev]
|
|
55
|
+
# Full dev environment: testing + linting + type checking
|
|
56
|
+
dev = [
|
|
57
|
+
"pytest>=8.0",
|
|
58
|
+
"pytest-cov>=5.0",
|
|
59
|
+
"ruff>=0.9",
|
|
60
|
+
"mypy>=1.15",
|
|
61
|
+
"plotly>=6.7.0", # needed to test reporter.py
|
|
62
|
+
"pandas>=3.0.0",
|
|
63
|
+
]
|
|
64
|
+
|
|
65
|
+
[project.urls]
|
|
66
|
+
Homepage = "https://github.com/Genefold/arrowspace_tuner"
|
|
67
|
+
Repository = "https://github.com/Genefold/arrowspace_tuner.git"
|
|
68
|
+
|
|
69
|
+
# ── Hatchling config ───────────────────────────────────────────────────────────
|
|
70
|
+
[tool.hatch.build.targets.wheel]
|
|
71
|
+
packages = ["src/arrowspace_tuner"]
|
|
72
|
+
exclude = [
|
|
73
|
+
"tests/",
|
|
74
|
+
"notebooks/",
|
|
75
|
+
"docs/",
|
|
76
|
+
".github/",
|
|
77
|
+
"*.db",
|
|
78
|
+
"*.sqlite",
|
|
79
|
+
"*.ipynb",
|
|
80
|
+
".ruff_cache/",
|
|
81
|
+
".mypy_cache/",
|
|
82
|
+
".pytest_cache/",
|
|
83
|
+
"dist/",
|
|
84
|
+
"*.egg-info/",
|
|
85
|
+
]
|
|
86
|
+
|
|
87
|
+
# ── Ruff ──────────────────────────────────────────────────────────────────
|
|
88
|
+
[tool.ruff]
|
|
89
|
+
line-length = 100
|
|
90
|
+
target-version = "py312"
|
|
91
|
+
|
|
92
|
+
[tool.ruff.lint]
|
|
93
|
+
select = ["E", "F", "I", "UP", "ANN"]
|
|
94
|
+
ignore = ["ANN101"]
|
|
95
|
+
|
|
96
|
+
# ── Mypy ────────────────────────────────────────────────────────────────────
|
|
97
|
+
[tool.mypy]
|
|
98
|
+
python_version = "3.12"
|
|
99
|
+
strict = true
|
|
100
|
+
ignore_missing_imports = true # arrowspace has no stubs
|
|
101
|
+
|
|
102
|
+
# ── Pytest ─────────────────────────────────────────────────────────────────
|
|
103
|
+
[tool.pytest.ini_options]
|
|
104
|
+
testpaths = ["tests"]
|
|
105
|
+
addopts = "--cov=arrowspace_tuner --cov-report=term-missing"
|
|
106
|
+
|
|
107
|
+
[dependency-groups]
|
|
108
|
+
dev = [
|
|
109
|
+
"twine>=6.2.0",
|
|
110
|
+
]
|
|
File without changes
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
scripts/test_eval.py
|
|
4
|
+
====================
|
|
5
|
+
Run the arrowspace_tuner optimisation pipeline on the CVE .npy corpus.
|
|
6
|
+
|
|
7
|
+
uv run python scripts/test_eval.py \
|
|
8
|
+
--data data/cve_embs/cve1999-2025.npy \
|
|
9
|
+
--n 50000 \
|
|
10
|
+
--trials 20 \
|
|
11
|
+
--seed 42
|
|
12
|
+
"""
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import argparse
|
|
16
|
+
import logging
|
|
17
|
+
|
|
18
|
+
import numpy as np
|
|
19
|
+
|
|
20
|
+
from arrowspace_tuner.tuner import EpsTuner
|
|
21
|
+
|
|
22
|
+
# Configure the root logger when the script is loaded so the INFO-level
# progress messages emitted below are shown with a timestamped prefix.
logging.basicConfig(
    format="[%(asctime)s] %(levelname)s %(name)s | %(message)s",
    level=logging.INFO,
)

# Module-level logger shared by load_npy() and main().
log = logging.getLogger(__name__)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def load_npy(path: str, n: int, seed: int) -> np.ndarray:
    """Load an ``.npy`` array, keep a seeded random subset of rows, and L2-normalise it.

    Args:
        path: Location of the ``.npy`` file handed to ``np.load``.
        n: Requested number of rows; capped at the number of rows in the file.
        seed: Seed for ``np.random.default_rng`` so the subset is reproducible.

    Returns:
        A ``float64`` array with the selected rows in ascending index order,
        each divided by its L2 norm (computed along axis 1).
    """
    log.info("Loading %s …", path)
    full = np.load(path)
    log.info(" full shape : %s dtype=%s", full.shape, full.dtype)

    total = len(full)
    take = min(n, total)
    picked = np.random.default_rng(seed).choice(total, size=take, replace=False)
    picked.sort()  # ascending indices keep the rows in their original file order

    subset = full[picked].astype(np.float64)
    # Floor the norms at 1e-12 so an all-zero row does not divide by zero.
    row_norms = np.clip(np.linalg.norm(subset, axis=1, keepdims=True), 1e-12, None)
    subset = subset / row_norms

    log.info(" subsample : %s (L2-normalised)", subset.shape)
    return subset
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def main() -> None:
    """Script entry point: subsample the corpus, run the tuner, print and save the result.

    Command-line flags:
        --data    path to the ``.npy`` embeddings file
        --n       number of rows to subsample before tuning
        --trials  forwarded to ``EpsTuner`` as ``n_trials``
        --seed    used for both the subsample and the tuner
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--data", default="data/cve_embs/cve1999-2025.npy")
    for flag, fallback in (("--n", 5000), ("--trials", 20), ("--seed", 54)):
        cli.add_argument(flag, type=int, default=fallback)
    opts = cli.parse_args()

    corpus = load_npy(opts.data, opts.n, opts.seed)

    # sample_n stays None because load_npy() has already subsampled the corpus.
    tuner = EpsTuner(
        n_trials=opts.trials,
        sample_n=None,
        seed=opts.seed,
        study_name="cve_arrowspace_fstar",
        storage=None,
    )

    log.info("Starting | n=%d trials=%d seed=%d", len(corpus), opts.trials, opts.seed)

    space, laplacian = tuner.fit(corpus)

    print("\n=== Best result ===")
    print(f" F** : {tuner.best_score:.8f}")
    best = tuner.best_params
    print(f" eps : {best['eps']:.5f}")
    print(f" k : {best['k']}")
    print(f" tau : {best['tau']:.4f}")
    print(f" fiedler : {tuner.best_fiedler}")
    print(f" var_lambda : {tuner.best_var_lambda}")
    print(f" mrr_proxy : {tuner.best_mrr_proxy}")
    print(f"\n ArrowSpace : {space}")
    print(f" Graph : {laplacian}")

    tuner.save_report(out_dir="results")
    log.info("Report saved to results/")
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
if __name__ == "__main__":
|
|
83
|
+
main()
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""
|
|
2
|
+
arrowspace_tuner — hyperparameter discovery for ArrowSpace.
|
|
3
|
+
|
|
4
|
+
Quickstart
|
|
5
|
+
----------
|
|
6
|
+
import numpy as np
|
|
7
|
+
import arrowspace_tuner as arrowspace
|
|
8
|
+
|
|
9
|
+
embeddings = np.load("corpus.npy")
|
|
10
|
+
|
|
11
|
+
# one-liner: auto-discover eps, k, tau
|
|
12
|
+
aspace, gl = arrowspace.optuna(embeddings)
|
|
13
|
+
|
|
14
|
+
# power-user: full control + post-run inspection
|
|
15
|
+
from arrowspace_tuner import EpsTuner
|
|
16
|
+
|
|
17
|
+
tuner = EpsTuner(n_trials=100, sample_n=10_000, eps_low=0.5, eps_high=3.0)
|
|
18
|
+
aspace, gl = tuner.fit(embeddings)
|
|
19
|
+
print(tuner.best_params) # {"eps": 1.2, "k": 14, "tau": 0.8}
|
|
20
|
+
print(tuner.best_score)
|
|
21
|
+
tuner.save_report() # requires pip install arrowspace-tuner[report]
|
|
22
|
+
"""
|
|
23
|
+
from .api import optuna
|
|
24
|
+
|
|
25
|
+
# Power-user exports: config dataclasses for advanced customisation
|
|
26
|
+
from .core import BuildParams, StudyConfig
|
|
27
|
+
from .tuner import EpsTuner
|
|
28
|
+
|
|
29
|
+
# Keep in sync with `version` in pyproject.toml (this release is 0.2.0).
__version__ = "0.2.0"
|
|
30
|
+
|
|
31
|
+
# Names exported by ``from arrowspace_tuner import *``.
__all__ = [
    "optuna", "EpsTuner",  # primary public API
    "BuildParams", "StudyConfig",  # config — for power users
    "__version__",  # version
]
|