forest-clustering 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Vladislav Kozlov
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,146 @@
1
+ Metadata-Version: 2.4
2
+ Name: forest-clustering
3
+ Version: 0.1.0
4
+ Summary: Random-partition similarity clustering for mixed-type tabular data
5
+ Author-email: Vladislav Kozlov <vlad.kneu@gmail.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/4eshireCat/forest_clustering
8
+ Project-URL: Repository, https://github.com/4eshireCat/forest_clustering
9
+ Project-URL: Documentation, https://github.com/4eshireCat/forest_clustering/blob/main/ALGORITHM.md
10
+ Project-URL: Issues, https://github.com/4eshireCat/forest_clustering/issues
11
+ Project-URL: Changelog, https://github.com/4eshireCat/forest_clustering/blob/main/CHANGELOG.md
12
+ Keywords: clustering,unsupervised learning,random forests,tabular data,mixed data,similarity embedding
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: Intended Audience :: Developers
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
21
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
22
+ Classifier: Operating System :: OS Independent
23
+ Requires-Python: >=3.10
24
+ Description-Content-Type: text/markdown
25
+ License-File: LICENSE
26
+ Requires-Dist: numpy>=1.24
27
+ Requires-Dist: scipy>=1.10
28
+ Requires-Dist: scikit-learn>=1.3
29
+ Requires-Dist: pandas>=2.0
30
+ Requires-Dist: joblib>=1.3
31
+ Provides-Extra: dev
32
+ Requires-Dist: pytest; extra == "dev"
33
+ Requires-Dist: pytest-cov; extra == "dev"
34
+ Requires-Dist: build; extra == "dev"
35
+ Requires-Dist: twine; extra == "dev"
36
+ Dynamic: license-file
37
+
38
+ # forest-clustering
39
+
40
+ **Random-partition similarity clustering for mixed-type tabular data.**
41
+
42
+ `forest-clustering` builds a compact integer embedding from random feature partitions (one cell ID per sample per iteration) and then applies any sklearn-compatible clustering algorithm to that embedding. It handles numerical, categorical, and mixed data natively, with built-in correlation-based feature weighting and outlier-robust cut-point generation.
43
+
44
+ ## How it works
45
+
46
+ Each of the *L* iterations:
47
+ 1. Samples *m* features (weighted by inverse correlation-group size).
48
+ 2. Draws *K−1* random cut-points per numerical feature (uniform or quantile-based).
49
+ 3. Assigns each sample a mixed-radix *cell ID* based on which bin it falls in for each selected feature.
50
+
51
+ The result is an *n × L* integer embedding. Two samples that consistently land in the same cell are similar, so the normalized Hamming distance between their embedding rows (the fraction of iterations in which their cell IDs differ) approximates their dissimilarity.
52
+
53
+ ```
54
+ Hamming(i, j) = (1/L) · Σ_l [ E[i,l] ≠ E[j,l] ] ∈ [0, 1]
55
+ ```
56
+
57
+ See [ALGORITHM.md](ALGORITHM.md) for a detailed description with diagrams.
58
+
59
+ ## Installation
60
+
61
+ ```bash
62
+ pip install forest-clustering
63
+ ```
64
+
65
+ Python ≥ 3.10 required.
66
+
67
+ ## Quick start
68
+
69
+ ```python
70
+ from forest_clustering import ForestClusterer
71
+ from sklearn.cluster import KMeans
72
+
73
+ fc = ForestClusterer(
74
+ n_iterations=200,
75
+ n_bins=3,
76
+ quantile_cuts=True, # robust to outliers
77
+ corr_threshold=0.9, # down-weight correlated features
78
+ clusterer=KMeans(n_clusters=5, n_init="auto", random_state=0),
79
+ random_state=42,
80
+ )
81
+
82
+ labels = fc.fit_predict(df) # works with DataFrame or ndarray
83
+ ```
84
+
85
+ ## Parameters
86
+
87
+ | Parameter | Default | Description |
88
+ |---|---|---|
89
+ | `n_iterations` | 200 | Number of random partitioning iterations *L*. More → more stable. |
90
+ | `n_bins` | 3 | Number of bins per feature per iteration *K*. |
91
+ | `n_features` | `"sqrt"` | Features selected per iteration: int, float fraction, `"sqrt"`, `"log2"`. |
92
+ | `quantile_cuts` | `False` | Sample cut-points from empirical quantiles (robust to outliers). |
93
+ | `corr_threshold` | 0.7 | Spearman \|r\| threshold for grouping correlated features. `None` disables. |
94
+ | `corr_sample_size` | 10 000 | Rows used to estimate feature correlations. |
95
+ | `clusterer` | `DBSCAN(metric="hamming")` | Any sklearn-compatible `fit_predict` estimator. |
96
+ | `feature_types` | `None` | Override type detection: `{col: "numerical"\|"categorical"}`. |
97
+ | `cat_threshold` | 10 | Numerical columns with ≤ this many unique values → treated as categorical. |
98
+ | `n_jobs` | −1 | Parallelism for embedding computation (joblib). |
99
+ | `random_state` | `None` | Seed for reproducibility. |
100
+
101
+ ## Downstream clustering algorithms
102
+
103
+ | Algorithm | When to use |
104
+ |---|---|
105
+ | `KMeans(n_clusters=K)` on embedding | Known K, any n — fast and stable |
106
+ | `AgglomerativeClustering(metric="precomputed")` | n ≤ 50 K, non-spherical clusters |
107
+ | `DBSCAN(metric="hamming")` on embedding | Unknown K, need outlier detection |
108
+ | `HDBSCAN(metric="precomputed")` | Variable-density clusters (pass `.astype(float64)`) |
109
+ | `MiniBatchKMeans` on embedding | n > 100 K |
110
+
111
+ ```python
112
+ from sklearn.cluster import AgglomerativeClustering
113
+
114
+ fc = ForestClusterer(
115
+ n_iterations=300,
116
+ clusterer=AgglomerativeClustering(n_clusters=3, metric="precomputed", linkage="average"),
117
+ )
118
+ labels = fc.fit_predict(X)
119
+ ```
120
+
121
+ ## Utilities
122
+
123
+ ```python
124
+ # Get the raw n×L embedding
125
+ E = fc.get_embedding()
126
+
127
+ # Pairwise Hamming distance matrix (chunked for large n)
128
+ D = fc.pairwise_distance()
129
+
130
+ # Transform new data using fitted partition specs
131
+ E_new = fc.transform(X_new)
132
+ ```
133
+
134
+ ## Hyperparameter guidelines
135
+
136
+ | Goal | Recommendation |
137
+ |---|---|
138
+ | Fast prototype | `n_iterations=50`, `n_bins=3` |
139
+ | Balanced quality/speed | `n_iterations=200`, `n_bins=3` (default) |
140
+ | High stability | `n_iterations=500`, `n_bins=4` |
141
+ | Outlier-heavy data | `quantile_cuts=True` |
142
+ | Many correlated features | `corr_threshold=0.8–0.9` |
143
+
144
+ ## License
145
+
146
+ MIT
@@ -0,0 +1,109 @@
1
+ # forest-clustering
2
+
3
+ **Random-partition similarity clustering for mixed-type tabular data.**
4
+
5
+ `forest-clustering` builds a compact integer embedding from random feature partitions (one cell ID per sample per iteration) and then applies any sklearn-compatible clustering algorithm to that embedding. It handles numerical, categorical, and mixed data natively, with built-in correlation-based feature weighting and outlier-robust cut-point generation.
6
+
7
+ ## How it works
8
+
9
+ Each of the *L* iterations:
10
+ 1. Samples *m* features (weighted by inverse correlation-group size).
11
+ 2. Draws *K−1* random cut-points per numerical feature (uniform or quantile-based).
12
+ 3. Assigns each sample a mixed-radix *cell ID* based on which bin it falls in for each selected feature.
13
+
14
+ The result is an *n × L* integer embedding. Two samples that consistently land in the same cell are similar, so the normalized Hamming distance between their embedding rows (the fraction of iterations in which their cell IDs differ) approximates their dissimilarity.
15
+
16
+ ```
17
+ Hamming(i, j) = (1/L) · Σ_l [ E[i,l] ≠ E[j,l] ] ∈ [0, 1]
18
+ ```
19
+
20
+ See [ALGORITHM.md](ALGORITHM.md) for a detailed description with diagrams.
21
+
22
+ ## Installation
23
+
24
+ ```bash
25
+ pip install forest-clustering
26
+ ```
27
+
28
+ Python ≥ 3.10 required.
29
+
30
+ ## Quick start
31
+
32
+ ```python
33
+ from forest_clustering import ForestClusterer
34
+ from sklearn.cluster import KMeans
35
+
36
+ fc = ForestClusterer(
37
+ n_iterations=200,
38
+ n_bins=3,
39
+ quantile_cuts=True, # robust to outliers
40
+ corr_threshold=0.9, # down-weight correlated features
41
+ clusterer=KMeans(n_clusters=5, n_init="auto", random_state=0),
42
+ random_state=42,
43
+ )
44
+
45
+ labels = fc.fit_predict(df) # works with DataFrame or ndarray
46
+ ```
47
+
48
+ ## Parameters
49
+
50
+ | Parameter | Default | Description |
51
+ |---|---|---|
52
+ | `n_iterations` | 200 | Number of random partitioning iterations *L*. More → more stable. |
53
+ | `n_bins` | 3 | Number of bins per feature per iteration *K*. |
54
+ | `n_features` | `"sqrt"` | Features selected per iteration: int, float fraction, `"sqrt"`, `"log2"`. |
55
+ | `quantile_cuts` | `False` | Sample cut-points from empirical quantiles (robust to outliers). |
56
+ | `corr_threshold` | 0.7 | Spearman \|r\| threshold for grouping correlated features. `None` disables. |
57
+ | `corr_sample_size` | 10 000 | Rows used to estimate feature correlations. |
58
+ | `clusterer` | `DBSCAN(metric="hamming")` | Any sklearn-compatible `fit_predict` estimator. |
59
+ | `feature_types` | `None` | Override type detection: `{col: "numerical"\|"categorical"}`. |
60
+ | `cat_threshold` | 10 | Numerical columns with ≤ this many unique values → treated as categorical. |
61
+ | `n_jobs` | −1 | Parallelism for embedding computation (joblib). |
62
+ | `random_state` | `None` | Seed for reproducibility. |
63
+
64
+ ## Downstream clustering algorithms
65
+
66
+ | Algorithm | When to use |
67
+ |---|---|
68
+ | `KMeans(n_clusters=K)` on embedding | Known K, any n — fast and stable |
69
+ | `AgglomerativeClustering(metric="precomputed")` | n ≤ 50 K, non-spherical clusters |
70
+ | `DBSCAN(metric="hamming")` on embedding | Unknown K, need outlier detection |
71
+ | `HDBSCAN(metric="precomputed")` | Variable-density clusters (pass `.astype(float64)`) |
72
+ | `MiniBatchKMeans` on embedding | n > 100 K |
73
+
74
+ ```python
75
+ from sklearn.cluster import AgglomerativeClustering
76
+
77
+ fc = ForestClusterer(
78
+ n_iterations=300,
79
+ clusterer=AgglomerativeClustering(n_clusters=3, metric="precomputed", linkage="average"),
80
+ )
81
+ labels = fc.fit_predict(X)
82
+ ```
83
+
84
+ ## Utilities
85
+
86
+ ```python
87
+ # Get the raw n×L embedding
88
+ E = fc.get_embedding()
89
+
90
+ # Pairwise Hamming distance matrix (chunked for large n)
91
+ D = fc.pairwise_distance()
92
+
93
+ # Transform new data using fitted partition specs
94
+ E_new = fc.transform(X_new)
95
+ ```
96
+
97
+ ## Hyperparameter guidelines
98
+
99
+ | Goal | Recommendation |
100
+ |---|---|
101
+ | Fast prototype | `n_iterations=50`, `n_bins=3` |
102
+ | Balanced quality/speed | `n_iterations=200`, `n_bins=3` (default) |
103
+ | High stability | `n_iterations=500`, `n_bins=4` |
104
+ | Outlier-heavy data | `quantile_cuts=True` |
105
+ | Many correlated features | `corr_threshold=0.8–0.9` |
106
+
107
+ ## License
108
+
109
+ MIT
@@ -0,0 +1,12 @@
1
+ from .clusterer import ForestClusterer
2
+ from .distance import pairwise_hamming, pairwise_hamming_chunked, cross_hamming
3
+
4
+ __version__ = "0.1.0"
5
+
6
+ __all__ = [
7
+ "ForestClusterer",
8
+ "pairwise_hamming",
9
+ "pairwise_hamming_chunked",
10
+ "cross_hamming",
11
+ "__version__",
12
+ ]
@@ -0,0 +1,191 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from sklearn.base import BaseEstimator, ClusterMixin
4
+ from sklearn.cluster import DBSCAN
5
+ from sklearn.utils.validation import check_is_fitted
6
+
7
+ from .feature_encoder import DataEncoder
8
+ from .correlation import compute_feature_weights
9
+ from .partitioner import build_col_stats, build_iteration_specs, compute_embedding
10
+ from .distance import pairwise_hamming, pairwise_hamming_chunked, cross_hamming
11
+
12
+
13
def _resolve_n_features(n_features, d: int) -> int:
    """Resolve the ``n_features`` setting to a per-iteration feature count.

    Parameters
    ----------
    n_features : int | float | "sqrt" | "log2"
        ``"sqrt"`` -> ceil(sqrt(d)); ``"log2"`` -> ceil(log2(max(d, 2)));
        a float (Python or NumPy) in (0, 1] -> that fraction of ``d``,
        rounded up; anything else is converted with ``int()``.
    d : int
        Number of available features.

    Returns
    -------
    int
        The resolved feature count, never smaller than 1.

    Raises
    ------
    ValueError
        If ``n_features`` is a string that is neither ``"sqrt"``, ``"log2"``
        nor parseable as an integer.
    """
    if isinstance(n_features, str):
        if n_features == "sqrt":
            return max(1, int(np.ceil(np.sqrt(d))))
        if n_features == "log2":
            # max(d, 2) keeps log2 away from 0 / -inf for d < 2.
            return max(1, int(np.ceil(np.log2(max(d, 2)))))
        try:
            # Numeric strings such as "5" keep working as before.
            return max(1, int(n_features))
        except ValueError as err:
            raise ValueError(
                "n_features must be an int, a float in (0, 1], 'sqrt' or "
                f"'log2'; got {n_features!r}"
            ) from err

    # np.float32 & co. are not subclasses of ``float``; without the explicit
    # np.floating check a fraction like np.float32(0.5) would be truncated to 0.
    is_fraction_type = isinstance(n_features, (float, np.floating))
    if is_fraction_type and 0 < n_features <= 1.0:
        return max(1, int(np.ceil(n_features * d)))

    return max(1, int(n_features))
21
+
22
+
23
class ForestClusterer(BaseEstimator, ClusterMixin):
    """Clustering via random-partition similarity embeddings.

    Parameters
    ----------
    n_iterations : int
        Number of random partitioning iterations (L). More → more stable embeddings.
    n_features : int | float | "sqrt" | "log2"
        Features selected per iteration. Float = fraction, "sqrt" = ceil(sqrt(d)).
    n_bins : int
        Number of bins per feature per iteration (K).
    clusterer : sklearn-compatible estimator or None
        Downstream clustering algorithm. Must support fit_predict().
        If its ``metric`` attribute is "precomputed", it receives the pairwise
        Hamming distance matrix (float64); otherwise it receives the (n, L)
        embedding directly.
        Default: DBSCAN(metric="hamming").
    corr_threshold : float or None
        Spearman |corr| threshold for grouping correlated features (1/G weighting).
        None disables correlation-based weighting.
    corr_sample_size : int
        Number of rows to sample when computing feature correlations.
    feature_types : dict or None
        Override detected feature types: {col_name_or_idx: "numerical"|"categorical"}.
    cat_threshold : int
        Numerical columns with ≤ this many unique values are treated as categorical.
    quantile_cuts : bool
        If True, cut-points for numerical features are sampled from empirical quantiles
        instead of uniform [min, max].
    n_jobs : int
        Parallelism for embedding computation (passed to joblib).
    random_state : int or None
        Seed for reproducibility.

    Attributes
    ----------
    encoder_ : DataEncoder
        Encoder fitted on the training data (set by ``fit``).
    feature_weights_ : ndarray of shape (d,)
        Per-feature sampling weights; all ones when ``corr_threshold`` is None
        or there is at most one encoded feature (set by ``fit``).
    col_stats_ : object
        Result of ``build_col_stats`` on the encoded training data (set by ``fit``).
    specs_ : object
        Result of ``build_iteration_specs`` (set by ``fit``).
    embedding_ : ndarray
        Training embedding returned by ``compute_embedding`` (set by ``fit``).
    clusterer_ : estimator
        The estimator instance actually run by ``fit_predict`` — the user's
        ``clusterer`` or the default DBSCAN (set by ``fit_predict``).
    labels_ : ndarray
        Labels returned by the downstream clusterer (set by ``fit_predict``).
    """

    def __init__(
        self,
        n_iterations: int = 200,
        n_features="sqrt",
        n_bins: int = 3,
        clusterer=None,
        corr_threshold: float | None = 0.7,
        corr_sample_size: int = 10_000,
        feature_types: dict | None = None,
        cat_threshold: int = 10,
        quantile_cuts: bool = False,
        n_jobs: int = -1,
        random_state: int | None = None,
    ):
        # sklearn convention: __init__ only stores the parameters verbatim.
        self.n_iterations = n_iterations
        self.n_features = n_features
        self.n_bins = n_bins
        self.clusterer = clusterer
        self.corr_threshold = corr_threshold
        self.corr_sample_size = corr_sample_size
        self.feature_types = feature_types
        self.cat_threshold = cat_threshold
        self.quantile_cuts = quantile_cuts
        self.n_jobs = n_jobs
        self.random_state = random_state

    # ------------------------------------------------------------------
    # Core sklearn interface
    # ------------------------------------------------------------------

    def fit(self, X, y=None):
        """Encode ``X``, build the partition specs and the training embedding.

        ``y`` is accepted for API compatibility and ignored. The downstream
        clusterer is NOT run here; use ``fit_predict`` to obtain labels.

        Returns
        -------
        self
        """
        # One generator drives every random step, so the order of the calls
        # below determines the results for a given ``random_state``.
        rng = np.random.default_rng(self.random_state)

        self.encoder_ = DataEncoder(
            feature_types_override=self.feature_types,
            cat_threshold=self.cat_threshold,
        )
        X_enc = self.encoder_.fit_transform(X)
        n, d = X_enc.shape

        n_feat = _resolve_n_features(self.n_features, d)

        # Feature weights from correlation
        if self.corr_threshold is not None and d > 1:
            self.feature_weights_ = compute_feature_weights(
                X_enc,
                threshold=self.corr_threshold,
                sample_size=self.corr_sample_size,
                rng=rng,
            )
        else:
            self.feature_weights_ = np.ones(d)

        # Column statistics for cut-point generation
        self.col_stats_ = build_col_stats(
            X_enc,
            self.encoder_.feature_types_,
            quantile_cuts=self.quantile_cuts,
            rng=rng,
        )

        # Build all iteration specs
        self.specs_ = build_iteration_specs(
            n_iterations=self.n_iterations,
            col_stats=self.col_stats_,
            n_features_per_iter=n_feat,
            n_bins=self.n_bins,
            feature_weights=self.feature_weights_,
            rng=rng,
        )

        # Compute training embedding
        self.embedding_ = compute_embedding(X_enc, self.specs_, n_jobs=self.n_jobs)
        return self

    def fit_predict(self, X, y=None) -> np.ndarray:
        """Fit on ``X`` and return the downstream clusterer's labels.

        The labels are also stored as ``labels_``, following the sklearn
        clustering convention.
        """
        self.fit(X)
        self.labels_ = self._run_clusterer(self.embedding_)
        return self.labels_

    # ------------------------------------------------------------------
    # Transform / distance
    # ------------------------------------------------------------------

    def transform(self, X) -> np.ndarray:
        """Apply fitted partition specs to new data. Returns (n, L) embedding."""
        check_is_fitted(self, "specs_")
        X_enc = self.encoder_.transform(X)
        return compute_embedding(X_enc, self.specs_, n_jobs=self.n_jobs)

    def get_embedding(self) -> np.ndarray:
        """Return the training embedding computed by ``fit``."""
        check_is_fitted(self, "embedding_")
        return self.embedding_

    def pairwise_distance(
        self,
        X=None,
        Y=None,
        chunk_size: int = 2_000,
    ) -> np.ndarray:
        """Hamming distance matrix from embeddings.

        X=None → use training embedding.
        Y=None → square matrix D[i,j] = d(X[i], X[j]).
        Y provided → rectangular matrix D[i,j] = d(X[i], Y[j])
        (with X=None the rows are the training samples).

        ``chunk_size`` only affects the square case: inputs with more rows
        than ``chunk_size`` are processed in row chunks of that size.
        """
        check_is_fitted(self, "embedding_")

        E_X = self.embedding_ if X is None else self.transform(X)

        if Y is not None:
            E_Y = self.transform(Y)
            return cross_hamming(E_X, E_Y)

        return self._square_hamming(E_X, chunk_size)

    # ------------------------------------------------------------------
    # Internal
    # ------------------------------------------------------------------

    @staticmethod
    def _square_hamming(E: np.ndarray, chunk_size: int) -> np.ndarray:
        """Square Hamming matrix of ``E``, chunked when ``E`` has > chunk_size rows."""
        if E.shape[0] <= chunk_size:
            return pairwise_hamming(E)
        return pairwise_hamming_chunked(E, chunk_size=chunk_size)

    def _run_clusterer(self, E: np.ndarray) -> np.ndarray:
        """Run the downstream clusterer on embedding ``E`` and return its labels."""
        clf = self.clusterer
        if clf is None:
            clf = DBSCAN(metric="hamming", n_jobs=self.n_jobs)
        # Keep a handle on the estimator that was actually used so the default
        # DBSCAN (and its fitted attributes) can be inspected afterwards.
        self.clusterer_ = clf

        metric = getattr(clf, "metric", None)
        if metric == "precomputed":
            # Distances are derived from the embedding that was passed in
            # (not implicitly from self.embedding_), so E is always honoured.
            D = self._square_hamming(E, 2_000).astype(np.float64)
            return clf.fit_predict(D)

        # Any other estimator receives the embedding itself.
        return clf.fit_predict(E)
@@ -0,0 +1,63 @@
1
+ import numpy as np
2
+ from scipy import stats
3
+ from scipy.sparse import csr_matrix
4
+ from scipy.sparse.csgraph import connected_components
5
+
6
+
7
+ def compute_feature_weights(
8
+ X: np.ndarray,
9
+ threshold: float = 0.7,
10
+ sample_size: int = 10_000,
11
+ rng: np.random.Generator | None = None,
12
+ ) -> np.ndarray:
13
+ """Return per-feature weight array of shape (d,).
14
+
15
+ Features belonging to a strongly-correlated group of size G get weight 1/G.
16
+ Correlation is estimated on a random sample via Spearman rank correlation.
17
+ """
18
+ if rng is None:
19
+ rng = np.random.default_rng()
20
+
21
+ n, d = X.shape
22
+ if d < 2:
23
+ return np.ones(d)
24
+
25
+ # Sample rows
26
+ idx = rng.choice(n, size=min(sample_size, n), replace=False)
27
+ X_s = X[idx].astype(np.float64)
28
+
29
+ # Drop columns that are constant in the sample (Spearman undefined)
30
+ stds = X_s.std(axis=0)
31
+ variable_mask = stds > 0
32
+ if variable_mask.sum() < 2:
33
+ return np.ones(d)
34
+
35
+ X_var = X_s[:, variable_mask]
36
+ var_indices = np.where(variable_mask)[0]
37
+
38
+ # Spearman correlation matrix on variable columns
39
+ result = stats.spearmanr(X_var, nan_policy="omit")
40
+ if X_var.shape[1] == 2:
41
+ corr_val = float(result.statistic) if np.isscalar(result.statistic) else float(result.statistic[0, 1])
42
+ corr = np.array([[1.0, corr_val], [corr_val, 1.0]])
43
+ else:
44
+ corr = np.array(result.statistic)
45
+
46
+ corr = np.abs(np.nan_to_num(corr))
47
+
48
+ # Adjacency: connected if |corr| > threshold (no self-loops)
49
+ adj = (corr > threshold).astype(np.uint8)
50
+ np.fill_diagonal(adj, 0)
51
+
52
+ # Connected components on variable features
53
+ _, labels = connected_components(csr_matrix(adj), directed=False)
54
+
55
+ weights = np.ones(d)
56
+ for comp_id in np.unique(labels):
57
+ members_local = np.where(labels == comp_id)[0]
58
+ if len(members_local) < 2:
59
+ continue
60
+ members_global = var_indices[members_local]
61
+ weights[members_global] = 1.0 / len(members_local)
62
+
63
+ return weights
@@ -0,0 +1,26 @@
1
+ import numpy as np
2
+ from scipy.spatial.distance import cdist
3
+
4
+
5
def pairwise_hamming(E: np.ndarray) -> np.ndarray:
    """Compute the full square Hamming distance matrix of an embedding.

    E: (n, L) embedding (one row per sample), e.g. as produced by
       compute_embedding.
    Returns: (n, n) float32 matrix; entry [i, j] is the fraction of the L
             positions in which rows i and j differ.
    """
    distances = cdist(E, E, metric="hamming")
    return distances.astype(np.float32)
12
+
13
+
14
def pairwise_hamming_chunked(E: np.ndarray, chunk_size: int = 2_000) -> np.ndarray:
    """Square Hamming distance matrix, computed in row chunks.

    Only ``chunk_size`` rows of float64 distances are materialised at a time;
    the full (n, n) float32 result matrix is still allocated.

    Parameters
    ----------
    E : ndarray of shape (n, L)
        Embedding, one row per sample.
    chunk_size : int
        Number of rows processed per ``cdist`` call. Must be >= 1.

    Returns
    -------
    ndarray of shape (n, n), dtype float32

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    # A non-positive step would either crash range() (0) or skip the loop
    # entirely (negative) and hand back the uninitialised np.empty buffer.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    n = E.shape[0]
    D = np.empty((n, n), dtype=np.float32)
    for start in range(0, n, chunk_size):
        stop = min(start + chunk_size, n)
        # Assignment into the float32 buffer performs the down-cast.
        D[start:stop] = cdist(E[start:stop], E, metric="hamming")
    return D
22
+
23
+
24
def cross_hamming(E_X: np.ndarray, E_Y: np.ndarray) -> np.ndarray:
    """Hamming distances between every row of ``E_X`` and every row of ``E_Y``.

    Returns an (n_X, n_Y) float32 matrix whose entry [i, j] is the fraction
    of positions in which E_X[i] and E_Y[j] differ.
    """
    distances = cdist(E_X, E_Y, metric="hamming")
    return distances.astype(np.float32)