arrowspace_tuner 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,40 @@
1
+ """
2
+ arrowspace_tuner — hyperparameter discovery for ArrowSpace.
3
+
4
+ Quickstart
5
+ ----------
6
+ import numpy as np
7
+ import arrowspace_tuner as arrowspace
8
+
9
+ embeddings = np.load("corpus.npy")
10
+
11
+ # one-liner: auto-discover eps, k, tau
12
+ aspace, gl = arrowspace.optuna(embeddings)
13
+
14
+ # power-user: full control + post-run inspection
15
+ from arrowspace_tuner import EpsTuner
16
+
17
+ tuner = EpsTuner(n_trials=100, sample_n=10_000, eps_low=0.5, eps_high=3.0)
18
+ aspace, gl = tuner.fit(embeddings)
19
+ print(tuner.best_params) # {"eps": 1.2, "k": 14, "tau": 0.8}
20
+ print(tuner.best_score)
21
+ tuner.save_report() # requires pip install arrowspace-tuner[report]
22
+ """
23
+ from .api import optuna
24
+
25
+ # Power-user exports: config dataclasses for advanced customisation
26
+ from .core import BuildParams, StudyConfig
27
+ from .tuner import EpsTuner
28
+
29
+ __version__ = "0.1.0"
30
+
31
+ __all__ = [
32
+ # primary public API
33
+ "optuna",
34
+ "EpsTuner",
35
+ # config — for power users
36
+ "BuildParams",
37
+ "StudyConfig",
38
+ # version
39
+ "__version__",
40
+ ]
@@ -0,0 +1,132 @@
1
+ """
2
+ api.py — one-liner convenience function for hyperparameter discovery.
3
+
4
+ This module exists solely to satisfy the acceptance criteria:
5
+
6
+ aspace, gl = arrowspace.optuna(embeddings)
7
+
8
+ It is a thin shim over EpsTuner with sensible defaults.
9
+ For any non-trivial use case, instantiate EpsTuner directly.
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import numpy as np
14
+
15
+ from .core.config import _DEFAULT_N_TRIALS
16
+ from .tuner import EpsTuner
17
+
18
+
19
def optuna(
    embeddings: np.ndarray,
    *,
    n_trials: int = _DEFAULT_N_TRIALS,
    sample_n: int | None = 5_000,
    seed: int = 54,
    study_name: str = "arrowspace_fstar",
    storage: str | None = None,
    eps_low: float = 0.3,
    eps_high: float = 4.0,
    k_low: int = 3,
    k_high: int = 40,
    tau_low: float = 0.1,
    tau_high: float = 1.0,
    n_probe: int = 50,
) -> tuple[object, object]:
    """
    Auto-discover eps, k, and tau and return a ready-to-use (aspace, gl) pair.

    This is the simplest entry point to arrowspace_tuner. It runs an Optuna
    study with default settings and returns the ArrowSpace index built with
    the best hyperparameters found.

    Defaults are tuned for speed on large corpora (> 50k items):
    - sample_n=5_000 gives a 33x speedup over full-corpus trials with
      identical best params found (validated on a 50k CVE corpus).
    - n_probe=50 is sufficient to rank parameter regions reliably.
    - The final build after the study always uses the full corpus.

    Parameters
    ----------
    embeddings : np.ndarray
        Shape (N, D) float64 corpus embeddings.
    n_trials : int
        Number of Optuna trials. Default 15.
    sample_n : int | None
        Subsample size per trial. Default 5_000. None = full corpus.
        Recommended for large corpora (> 50k items).
    seed : int
        Random seed for reproducibility.
    study_name : str
        Optuna study identifier.
    storage : str | None
        Optuna storage URI for persistence. None = in-memory.
        Use "sqlite:///tune.db" to resume interrupted runs.
    eps_low, eps_high : float
        Log-scale search bounds for eps.
    k_low, k_high : int
        Search bounds for k.
    tau_low, tau_high : float
        Search bounds for tau.
    n_probe : int
        Number of anchor queries per trial for the MRR proxy. Default 50.

    Returns
    -------
    aspace : ArrowSpace
        ArrowSpace index built with the best hyperparameters found.
    gl : GraphLaplacian
        Corresponding graph Laplacian.

    Examples
    --------
    Minimal usage — matches the acceptance criteria exactly:

        import numpy as np
        import arrowspace_tuner as arrowspace

        embeddings = np.load("corpus.npy")
        aspace, gl = arrowspace.optuna(embeddings)

        results = aspace.search(query_embedding, gl, tau=0.8)

    With a custom search range:

        aspace, gl = arrowspace.optuna(
            embeddings,
            n_trials=30,
            sample_n=10_000,
            eps_low=0.5,
            eps_high=3.0,
        )

    Inspecting the study after the fact:

        from arrowspace_tuner import EpsTuner

        tuner = EpsTuner(n_trials=15, sample_n=5_000)
        aspace, gl = tuner.fit(embeddings)
        print(tuner.best_params)
        tuner.save_report()

    Resuming an interrupted run:

        aspace, gl = arrowspace.optuna(
            embeddings,
            storage="sqlite:///tune.db",
        )
    """
    # Forward every knob verbatim to EpsTuner, which owns the actual
    # study loop. PEP 8: no spaces around "=" in keyword arguments.
    tuner = EpsTuner(
        n_trials=n_trials,
        sample_n=sample_n,
        seed=seed,
        study_name=study_name,
        storage=storage,
        eps_low=eps_low,
        eps_high=eps_high,
        k_low=k_low,
        k_high=k_high,
        tau_low=tau_low,
        tau_high=tau_high,
        n_probe=n_probe,
    )
    return tuner.fit(embeddings)
@@ -0,0 +1,26 @@
1
+ """
2
+ arrowspace_tuner.core — internal building blocks.
3
+
4
+ This subpackage is not part of the public API.
5
+ Import from arrowspace_tuner directly:
6
+
7
+ from arrowspace_tuner import EpsTuner, optuna
8
+ from arrowspace_tuner import StudyConfig, BuildParams # for power users
9
+ """
10
+ from .config import BuildParams, StudyConfig
11
+ from .graph import fiedler_normalized, gl_to_scipy
12
+ from .objective import build_and_score, make_objective
13
+
14
+ __version__ = "0.1.0"
15
+
16
+ __all__ = [
17
+ # config
18
+ "BuildParams",
19
+ "StudyConfig",
20
+ # graph
21
+ "fiedler_normalized",
22
+ "gl_to_scipy",
23
+ # objective
24
+ "build_and_score",
25
+ "make_objective",
26
+ ]
@@ -0,0 +1,139 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Any
5
+
6
# Single source of truth for the default number of trials.
# Referenced by StudyConfig, EpsTuner.__init__, and api.optuna().
# Private (leading underscore): import it rather than hard-coding 15.
_DEFAULT_N_TRIALS: int = 15
9
+
10
+
11
@dataclass
class BuildParams:
    """
    Per-trial build parameters handed to ArrowSpaceBuilder.

    Attributes
    ----------
    eps : float
        Neighbourhood radius for graph construction — the primary
        hyperparameter under optimisation.
    k : int
        Nearest-neighbour count used when building the graph.
    topk : int
        Result count returned by search. Set to k // 2 automatically
        during optimisation; may be overridden for the final build.
    p : float
        Minkowski distance exponent (2.0 = Euclidean).
    sigma : float | None
        Optional Gaussian kernel bandwidth; None selects auto.
    max_clusters : int
        Cap on the number of clusters fed to the builder.
    cluster_radius : float
        Squared-L2 threshold governing cluster creation.
    sampling_rate : float
        Fraction of embeddings used in each trial build (1.0 = all).
    """

    eps: float = 0.8
    k: int = 10
    topk: int = 5
    p: float = 2.0
    sigma: float | None = None
    max_clusters: int = 5000
    cluster_radius: float = 0.42
    sampling_rate: float = 1.0

    def to_dict(self) -> dict[str, Any]:
        """Return graph_params dict expected by ArrowSpaceBuilder.build()."""
        # Only the five graph-construction knobs travel to the builder;
        # clustering/sampling fields are consumed elsewhere.
        graph_keys = ("eps", "k", "topk", "p", "sigma")
        return {key: getattr(self, key) for key in graph_keys}
56
+
57
+
58
@dataclass
class StudyConfig:
    """
    Configuration for the Optuna study loop.

    Attributes
    ----------
    n_trials : int
        Number of Optuna trials to run. Default: 15.
    sample_n : int | None
        Subsample this many embeddings per trial for speed.
        None = use all embeddings every trial.
        Recommended: 5_000 for corpora > 50k items (33x speedup,
        identical best params found vs full-corpus run).
    seed : int
        Random seed for reproducibility.
    study_name : str
        Optuna study identifier. Used as folder name in reporter output.
    storage : str | None
        Optuna storage URL (e.g. "sqlite:///optuna.db"). None = in-memory.
    n_jobs : int
        Number of parallel workers for study.optimize(). Default: 1 (serial).
        Set to -1 to use all available CPU cores, or any positive integer.

        Threading safety note: Optuna n_jobs > 1 runs each trial in a
        separate thread sharing the same Python process. The objective
        closure itself is stateless (captures read-only numpy arrays), so
        it is thread-safe. However, parallelism is only safe if the
        underlying ArrowSpace Rust extension is thread-safe under concurrent
        .build() calls. Verify this before setting n_jobs > 1 in production.

        Reproducibility note: with n_jobs > 1 and TPESampler the trial
        execution order is non-deterministic, so best_params may differ
        across runs even with the same seed. Use n_jobs=1 for reproducible
        comparisons.

    Search space — graph structure
    ------------------------------
    eps_low, eps_high : float
        Log-scale bounds for eps search.
    k_low, k_high : int
        Bounds for k (nearest neighbours) search.

    Search space — retrieval
    ------------------------
    tau_low, tau_high : float
        Bounds for tau search. tau controls the ArrowSpace search
        temperature passed to search_batch(). Optimising tau alongside
        eps and k ensures the graph is evaluated at its best retrieval
        operating point, not an arbitrary fixed tau.

    MRR proxy
    ---------
    n_probe : int
        Number of corpus items used as query anchors per trial when
        computing the spectral MRR-Top0 proxy. Scales search_batch cost
        linearly — 50 probes gives ~14% MRR standard error, which is
        more than adequate for ranking trials. Use 200 only for a final
        high-accuracy evaluation where trial speed is not a concern.

    Clustering
    ----------
    max_clusters : int
        Cap on the number of clusters used during trial builds.
        Note: intentionally lower than BuildParams.max_clusters (5000) —
        TODO confirm this asymmetry is deliberate.
    cluster_radius : float
        Squared-L2 threshold for cluster creation during trial builds.
    """

    n_trials: int = _DEFAULT_N_TRIALS
    sample_n: int | None = None
    seed: int = 54
    study_name: str = "arrowspace_tuner"
    storage: str | None = None
    n_jobs: int = 1

    # Search space — graph
    eps_low: float = 0.3
    eps_high: float = 4.0
    k_low: int = 3
    k_high: int = 40

    # Search space — retrieval
    tau_low: float = 0.1
    tau_high: float = 1.0

    # MRR proxy — 50 gives ~14% s.e., adequate for trial ranking (was 200)
    n_probe: int = 50
    max_clusters: int = 50
    cluster_radius: float = 0.5
@@ -0,0 +1,159 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from collections.abc import Sequence
5
+ from typing import Protocol
6
+
7
+ import numpy as np
8
+ import scipy.sparse as sp
9
+ import scipy.sparse.linalg as spla
10
+
11
# Module-level logger (stdlib convention): handler/level configuration is
# left to the embedding application.
logger = logging.getLogger(__name__)
12
+
13
+
14
class PyGraphLaplacian(Protocol):
    """
    Structural stand-in for the PyGraphLaplacian type exposed by the
    ArrowSpace Rust extension (arrowspace._arrowspace.PyGraphLaplacian).

    Defined here as a typing.Protocol so that mypy can validate
    call-sites without importing the compiled extension at
    type-check time — the Rust wheel may be absent from the
    type-checking environment.
    """

    def to_csr(self) -> tuple[Sequence[float], Sequence[int], Sequence[int]]:
        """Return the (data, indices, indptr) triplet for CSR assembly."""
        ...

    def shape(self) -> tuple[int, int]:
        """Return the (nrows, ncols) dimensions of the Laplacian."""
        ...
31
+
32
+
33
def gl_to_scipy(gl: PyGraphLaplacian) -> sp.csr_matrix:
    """
    Convert a PyGraphLaplacian (from the ArrowSpace Rust extension) to a
    SciPy CSR sparse matrix.

    Parameters
    ----------
    gl : PyGraphLaplacian
        The graph Laplacian returned by ArrowSpaceBuilder.build().

    Returns
    -------
    sp.csr_matrix
        The Laplacian as a SciPy sparse matrix, ready for eigendecomposition.
    """
    # to_csr() returns the 3-tuple (data, indices, indptr) — the matrix
    # shape comes from the separate gl.shape() accessor (see the
    # PyGraphLaplacian protocol). The previous comment claiming a
    # 4-tuple including shape was stale and misleading.
    data, indices, indptr = gl.to_csr()
    shape = gl.shape()
    return sp.csr_matrix(
        (
            np.asarray(data, dtype=np.float64),
            np.asarray(indices, dtype=np.int32),
            np.asarray(indptr, dtype=np.int32),
        ),
        shape=shape,
    )
54
+
55
+
56
def fiedler_normalized_from_csr(L: sp.csr_matrix, nnz: int) -> float:
    """
    Compute the normalised Fiedler value (λ₂) from a pre-built SciPy CSR
    Laplacian matrix.

    This is the hot path called from build_and_score. The caller is
    responsible for building L and computing nnz from a single gl.to_csr()
    call, avoiding redundant FFI roundtrips (#10).

    Eigenvalue strategy
    -------------------
    N ≤ 5_000 : dense path via np.linalg.eigvalsh.
        Always converges, zero ARPACK overhead, fastest at this scale.
        Covers the sample_n=5_000 default path entirely.
    N > 5_000 : shift-invert ARPACK (sigma=0.0, which="LM").
        Finds the largest eigenvalues of L^{-1}, equivalent to the
        smallest eigenvalues of L. 5–20× faster than which="SM" and
        far more numerically stable.
        tol=1e-4 is sufficient because the Fiedler value feeds into
        log1p() — 4 significant digits is more than adequate.

    Parameters
    ----------
    L : sp.csr_matrix
        Pre-built normalised Laplacian (caller's responsibility).
    nnz : int
        Number of non-zero entries (already computed by caller).

    Returns
    -------
    float
        λ₂ ∈ [0, 2]. The spectrum of the symmetric normalised Laplacian
        D^{-1/2} L D^{-1/2} lies in [0, 2], not [0, 1] as previously
        documented (e.g. K3 yields λ₂ = 1.5). Returns 0.0 on
        degenerate/disconnected graphs and on any numerical failure.
    """
    try:
        n = L.shape[0]

        # Degenerate guard: fewer edges than nodes → nearly empty graph
        if nnz <= n:
            logger.warning(
                "Degenerate graph NNZ=%d <= N=%d — returning 0.0", nnz, n
            )
            return 0.0

        # Normalise: L_norm = D^{-1/2} L D^{-1/2}
        diag = np.array(L.diagonal(), dtype=np.float64)
        safe_diag = np.where(diag > 1e-12, diag, 1e-12)
        d_inv_sqrt = sp.diags(1.0 / np.sqrt(safe_diag))
        L_norm = d_inv_sqrt @ L @ d_inv_sqrt

        # ── eigenvalue computation ──────────────────────────────────────────
        if n <= 5_000:
            all_vals = np.linalg.eigvalsh(L_norm.toarray())
            vals = all_vals[:2]
        else:
            vals = spla.eigsh(
                L_norm,
                k=2,
                sigma=0.0,
                which="LM",
                return_eigenvectors=False,
                tol=1e-4,
                maxiter=500,
            )

        fiedler = max(0.0, float(sorted(np.real(vals))[1]))

        logger.debug(
            "fiedler_normalized: λ₂=%.6f NNZ=%d N=%d path=%s",
            fiedler, nnz, n, "dense" if n <= 5_000 else "shift-invert",
        )
        return fiedler

    except Exception as exc:
        logger.warning("fiedler_normalized failed: %s", exc, exc_info=True)
        return 0.0
132
+
133
+
134
def fiedler_normalized(gl: PyGraphLaplacian) -> float:
    """
    Public wrapper: compute the normalised Fiedler value from a raw
    PyGraphLaplacian. Calls gl.to_csr() once internally.

    Prefer fiedler_normalized_from_csr() in hot paths where the CSR
    matrix has already been materialised to avoid a redundant FFI call.

    Parameters
    ----------
    gl : PyGraphLaplacian
        The graph Laplacian returned by ArrowSpaceBuilder.build().

    Returns
    -------
    float
        λ₂ ∈ [0, 2] (the symmetric normalised Laplacian spectrum);
        0.0 on degenerate graphs or numerical failure.
    """
    # Delegate to gl_to_scipy instead of duplicating its CSR-assembly
    # code here: one FFI roundtrip, one source of truth for dtypes.
    L = gl_to_scipy(gl)
    # csr_matrix.nnz is the stored-entry count, i.e. len(data) — identical
    # to what the previous inlined version computed.
    return fiedler_normalized_from_csr(L, L.nnz)