tdaphantom 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tdaphantom/__init__.py +2 -0
- tdaphantom/hypothesis_tests/__init__.py +0 -0
- tdaphantom/hypothesis_tests/bottleneck_distance_tests/__init__.py +0 -0
- tdaphantom/hypothesis_tests/bottleneck_distance_tests/bottleneck_distance_test.py +190 -0
- tdaphantom/hypothesis_tests/universal_null_tests/__init__.py +0 -0
- tdaphantom/hypothesis_tests/universal_null_tests/universal_null_hypothesis_test.py +236 -0
- tdaphantom/metrics/__init__.py +0 -0
- tdaphantom/metrics/metrics.py +79 -0
- tdaphantom/tdaphantom.py +606 -0
- tdaphantom-1.0.0.dist-info/METADATA +140 -0
- tdaphantom-1.0.0.dist-info/RECORD +14 -0
- tdaphantom-1.0.0.dist-info/WHEEL +5 -0
- tdaphantom-1.0.0.dist-info/licenses/LICENSE +21 -0
- tdaphantom-1.0.0.dist-info/top_level.txt +1 -0
tdaphantom/__init__.py
ADDED
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from typing import List, Optional
|
|
3
|
+
import math
|
|
4
|
+
import random
|
|
5
|
+
import gudhi
|
|
6
|
+
from scipy.spatial.distance import cdist
|
|
7
|
+
from tdaphantom.metrics.metrics import w_infinity, hausdorff_dist_matrix, hausdorff
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class BNTest:
    """
    Bottleneck-distance significance test for one persistence diagram.

    A confidence radius ``c_n`` is estimated from the point cloud; bars whose
    persistence exceeds ``2 * c_n`` are flagged as significant.
    """

    def __init__(
        self,
        point_cloud: np.ndarray,
        is_distance_matrix: bool = False,
        dgm: np.ndarray = None,
        k: int = 1,
        alpha: float = 0.05,
        complex: str = "VR",
        max_depth: int = 50,  # low and slow
        method: str = "bottleneck:subsample"
    ):
        """
        The bottleneck hypothesis testing from
        'Confidence sets for persistence diagrams'
        by Fasy et al.

        point_cloud : np.ndarray
            The sample, or a square distance matrix when
            ``is_distance_matrix`` is True.
        is_distance_matrix : bool
            Whether ``point_cloud`` already holds pairwise distances.
        dgm : np.ndarray
            Diagram to test; columns are [birth, death].
        k : int
            Stored on the instance; not otherwise used by this class.
        alpha : float
            Significance level used for the ``1 - alpha`` quantile.
        complex : str
            Stored on the instance (currently only VR is supported).
        max_depth : int
            Upper bound on the number of subsamples drawn.
        method : str
            Key into ``self.method_calls`` selecting how c_n is estimated.
        """
        self.method = method
        # Dispatch table: method name -> bound method returning (c_n, p_values).
        # "bottleneck" is accepted as an alias for "bottleneck:subsample".
        self.method_calls = {
            "bottleneck": self.subsample,
            "bottleneck:subsample": self.subsample,
            "bottleneck:concentration": self.concentration,
            "bottleneck:shells": self.shells,
            "bottleneck:density": self.density,
        }
        self.pc = point_cloud
        self.dgm = dgm
        self.is_distance_matrix = is_distance_matrix
        self.k = k
        self.complex = complex  # currently only VR is supported
        self.max_depth = max_depth
        self.alpha = alpha

    def _n_subsamples(self, n: int, b: int, subsample_percentage: float) -> int:
        """Number of subsamples to draw: a share of C(n, b), capped at max_depth."""
        try:
            return min(int(subsample_percentage * math.comb(n, b)), self.max_depth)
        except OverflowError:
            # C(n, b) was too large to convert to float; the cap applies anyway.
            return self.max_depth

    def _subsampling_method_via_persistence(self, subsample_percentage: float = 0.3) -> np.ndarray:
        """
        DEPRECATED - DO NOT USE
        E[W_infinity(hat(P), P)] != E[W_infinity(hat(P), subsample_hat(P))]

        Subsamples rows of the diagram itself (40% of the bars per draw) and
        returns the bottleneck distance of each subsample to the full diagram.
        """
        n = len(self.dgm)
        b = int(0.4 * n)
        N = self._n_subsamples(n, b, subsample_percentage)

        T_j_array = np.zeros(N)
        for i in range(N):
            idx = np.random.choice(n, size=b, replace=False)
            subsample = self.dgm[idx]
            # w_infinity is the module-level helper imported from
            # tdaphantom.metrics.metrics; it is not a method of this class.
            T_j_array[i] = w_infinity(subsample, self.dgm)
        return T_j_array

    def _subsampling_method(self, subsample_percentage: float = 0.8) -> np.ndarray:
        """
        Fasy et al. 4.1 subsampling
        b = subsample size = O(n / log(n))
        N = number of subsamples (theory uses n choose b, but we will use a subset)
        A bar with persistence > C_b is significant at level alpha.
        By the bottleneck stability theorem, W_inf(PH(S_n), PH(P)) <= C_b
        with probability >= 1 - alpha.

        From the paper:
        P(H(S_n, M) > C_n) <= alpha + O((b/n)^(1/4))

        The bias term O((b/n)^(1/4)) -> 0 as b/n -> 0, so theory requires b << n.
        The paper uses b = O(n / log(n)) for the theoretical guarantee.
        In practice, larger b gives smaller c_n and more power but looser theory guarantees.

        Returns the array of directed Hausdorff distances h(S_n, S_b*), one
        per subsample. Raises ValueError when fewer than two points are given.
        """
        n = len(self.pc)
        if n < 2:
            # log(1) == 0 would make the subsample size undefined.
            raise ValueError(
                f"Subsampling needs at least 2 points, got {n}."
            )
        b = min(int(3.5 * (n / np.log(n))), int(0.8 * n))
        N = self._n_subsamples(n, b, subsample_percentage)

        if self.is_distance_matrix:
            D = np.asarray(self.pc)
        else:
            D = cdist(self.pc, self.pc)

        T_j_array = np.zeros(N)
        for i in range(N):
            idx = np.random.choice(n, size=b, replace=False)
            # h(S_n, S_b*) = max_{i in S_n} min_{j in S_b*} D[i,j]
            T_j_array[i] = float(D[:, idx].min(axis=1).max())

        return T_j_array

    def subsample(self):
        """
        Calls subsampling method to calculate c_n and p_values

        c_n is the (1 - alpha) quantile of the subsample statistics; the
        p-value of a bar is the share of statistics that are >= pers / 2.
        """
        pers = self.dgm[:, 1] - self.dgm[:, 0]
        T_j_array = self._subsampling_method()
        c_n = float(np.quantile(T_j_array, 1.0 - self.alpha))
        p_values = np.array([
            float(np.mean(T_j_array >= p / 2)) for p in pers
        ])
        return c_n, p_values

    def concentration_of_measure_method(self):
        """
        Fasy et al. 4.2 concentration of measure

        From the paper:
        P(H(S_n, M) > hat(t_n)) <= alpha + O((log(n)/n)^(1/(2+d)))

        Not implemented yet: returns None.
        """
        ...

    def concentration(self):
        """
        Calls concentration method to calculate c_n
        """
        c_n = self.concentration_of_measure_method()
        return c_n, None

    def shells_method(self):
        """
        Fasy et al. 4.3 method of shells

        From the paper:
        P(H(S_{2,n}, M) > hat(t_{1,n})) <= alpha + O(r_n)

        Not implemented yet: returns None.
        """
        ...

    def shells(self):
        """
        Calls shells method to calculate c_n
        """
        c_n = self.shells_method()
        return c_n, None

    def density_method(self):
        """
        Fasy et al. 4.4 Density estimation

        From the paper:
        P(||hat{p}_h - p_h||_infinity > Z_alpha / sqrt(nh^D) ) <= alpha + O(log(n)/nh^D)^((4+D)/(4+2D))

        Not implemented yet: returns None.
        """
        ...

    def density(self):
        """
        Calls density method to calculate c_n
        """
        c_n = self.density_method()
        return c_n, None

    # The original release spelled these names "denisty"; keep them callable.
    denisty_method = density_method
    denisty = density

    def results(self) -> dict:
        """
        Returns a dict with two entries.

        "results_array": (n_bars, 5) float array, one row per bar.
        Cols: birth, death, pers, p_value, significant

        "threshold": 2 * c_n, the persistence above which a bar is significant.

        Raises ValueError when no diagram was supplied or the method name is
        unknown, and NotImplementedError when the selected method is a stub.
        """
        if self.dgm is None:
            raise ValueError(
                "No persistence diagram supplied; pass dgm when constructing BNTest."
            )
        if self.method not in self.method_calls:
            raise ValueError(
                f"Unknown method {self.method!r}. "
                f"Allowed methods: {list(self.method_calls)}."
            )

        c_n, p_values = self.method_calls[self.method]()
        if c_n is None:
            raise NotImplementedError(
                f"Method {self.method!r} is not implemented yet."
            )

        births = self.dgm[:, 0]
        deaths = self.dgm[:, 1]
        pers = deaths - births

        if p_values is None:
            # Method produced a threshold only; keep the column layout stable.
            p_values = np.full(len(pers), np.nan)

        rejected = pers > 2 * c_n

        return {
            "results_array": np.column_stack([
                births,
                deaths,
                pers,
                p_values,
                rejected.astype(float),
            ]),
            "threshold": 2 * c_n  # used for diagram
        }
|
|
File without changes
|
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from typing import List
|
|
3
|
+
import warnings
|
|
4
|
+
|
|
5
|
+
EULER_MASCHERONI = 0.57721566490153
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class UNTest:
    """
    Universal-null significance test for one persistence diagram.

    Each bar's death/birth ratio is log-log transformed, normalised, and
    turned into a p-value via ``exp(-exp(l))``.
    """

    def __init__(
        self,
        dgm: np.ndarray,
        k: int,
        alpha: float = 0.05,
        complex: str = "VR",
        correction_strategy: str = "BH",
        max_depth: int = 1000,
        max_threshold=None,
        method: str = "universal_null:median"
    ):
        """
        Implementation of the universal null hypothesis test from
        'A universal null-distribution for topological data analysis'
        by Omer Bobrowski & Primoz Skraba

        dgm : np.ndarray
            Diagram to test; columns are [birth, death]. Copied on entry.
        k : int
            Stored on the instance; not otherwise used by this class.
        alpha : float
            Significance level.
        complex : str
            "VR" (A = 1) or "C" (A = 0.5); anything else raises ValueError.
        correction_strategy : str or None
            "Bonferroni", "BH" or None.
        max_depth : int
            Iteration cap for the infinite-cycle threshold search.
        max_threshold : float, optional
            Stand-in death value for infinite bars. When omitted, ten times
            the largest finite death is used (10.0 if there is none) and a
            UserWarning is issued.
        method : str
            "universal_null:median" or "universal_null:mean"; the bare
            "universal_null" is accepted as an alias for the median variant.
            Anything else raises ValueError.
        """
        self.dgm = np.copy(dgm)
        self.k = k
        self.complex = complex  # currently only VR is supported
        self.max_depth = max_depth
        self.alpha = alpha
        self.correction_strategy = correction_strategy
        self.method = method

        default_max = 10.0  # max epsilon for ripser for example

        if max_threshold is not None:
            self.max_threshold = max_threshold
        else:
            finite_deaths = self.dgm[np.isfinite(self.dgm[:, 1]), 1]
            self.max_threshold = (
                float(np.max(finite_deaths)) * 10
                if len(finite_deaths) > 0 else default_max
            )
            warnings.warn(
                "max_threshold not supplied — falling back to max finite death * 10. "
                "Pass max_eps from your Rips filtration for correct results.",
                UserWarning,
            )

        self.A = None
        if self.complex == "VR":
            self.A = 1  # for Vietoris-Rips complex
        elif self.complex == "C":
            self.A = 0.5
        else:
            raise ValueError(
                f"Unknown complex type {self.complex}. Use 'VR' or 'C'.")

        estimators = {
            "universal_null": np.median,
            "universal_null:median": np.median,
            "universal_null:mean": np.mean,
        }
        if self.method not in estimators:
            # Without this check L_hat would be missing and the failure would
            # only surface later as an AttributeError.
            raise ValueError(
                f"Unknown method {self.method!r}. Use one of {list(estimators)}."
            )

        births = self.dgm[:, 0]
        deaths = self.dgm[:, 1]
        # Select usable bars before dividing so zero births (e.g. H0 bars)
        # never reach the division.
        usable = np.isfinite(deaths) & (births > 0)
        pi_values = deaths[usable] / births[usable]
        log_log_pi = np.log(np.log(pi_values[pi_values > 1.0]))

        if log_log_pi.size == 0:
            # No bar can be normalised; every p-value will come out as NaN.
            self.L_hat = float("nan")
        else:
            self.L_hat = float(estimators[self.method](log_log_pi))

    def _correct_alpha(self) -> float:
        """Per-test significance level implied by the correction strategy."""
        if self.correction_strategy == "Bonferroni":
            # max(..., 1) keeps an empty diagram from dividing by zero.
            return self.alpha / max(len(self.dgm), 1)
        elif self.correction_strategy == "BH":
            return self.alpha
        elif self.correction_strategy is None:
            return self.alpha
        else:
            raise ValueError(
                f"Unknown multiple testing correction strategy "
                f"{self.correction_strategy}. Use 'Bonferroni' or 'BH'."
            )

    def _pi_min(self, x: float) -> float:
        """
        minimum death/birth ratio such that the p-value for this diagram is under x
        """
        if x <= 0.0:
            return np.inf
        if x >= 1.0:
            return 1.0

        # Inverts p = exp(-exp(A * log(log(pi)) + B)) for pi.
        B = -EULER_MASCHERONI - self.A * self.L_hat
        l_thresh = np.log(-np.log(x))
        return float(np.exp(np.exp((l_thresh - B) / self.A)))

    def _find_threshold_for_infinite_cycles(self, t_0: float) -> float:
        """
        This algorithm gives us the threshold we need to use when calculating p_values
        for infinite cycles
        We choose the earliest-born infinite cycle (min(I)),
        while we could have chosen the latest-born (max(I)), or any intermediate value.
        This choice represents trade-off between the number of iterations needed and the overestimation of τ.
        Choosing the earliest born cycle results in the smallest threshold, but with potentially more iterations, while choosing the last cycle will
        have fewer iterations with a possible overestimation of the threshold.
        """
        tau = t_0

        for _ in range(self.max_depth):
            D = self.dgm[self.dgm[:, 0] <= tau]
            if len(D) == 0:
                break

            threshold = self._pi_min(self.alpha / len(D))
            # Bars still alive at tau are treated as infinite.
            inf_births = D[D[:, 1] >= tau, 0]
            inf_births = inf_births[inf_births > 0]
            if len(inf_births) == 0:
                break

            I_births = inf_births[tau / inf_births < threshold]
            if len(I_births) == 0:
                break

            tau = float(np.min(I_births) * threshold)

        return tau

    def _calculate_l(self) -> np.ndarray:
        """
        Normalises the persistence diagram values so the noise values follow
        the Lgumbel(0,1) distribution

        l_value is defined piecewise
        for finite death it is defined using a log(log()) transform and normalised
        for infinite death values the death is replaced by the threshold found
        by _find_threshold_for_infinite_cycles(max_threshold)

        Bars with birth <= 0 or death/birth <= 1 get NaN.
        """
        births = self.dgm[:, 0]
        finite = np.isfinite(self.dgm[:, 1])
        deaths = np.where(finite, self.dgm[:, 1], self.max_threshold)
        tau = self._find_threshold_for_infinite_cycles(self.max_threshold)

        # Zero births give inf/NaN ratios here; they are masked out by `valid`.
        with np.errstate(divide="ignore", invalid="ignore"):
            pi_values = deaths / births
            pi_values[~finite] = tau / births[~finite]
            valid = (births > 0) & (pi_values > 1.0)

        log_log_pi = np.full(len(self.dgm), np.nan)
        log_log_pi[valid] = np.log(np.log(pi_values[valid]))

        l_values = self.A * log_log_pi - EULER_MASCHERONI - self.A * self.L_hat
        return l_values

    def _calculate_p_values_for_persistence_diagram(self) -> np.ndarray:
        """
        Finds the p value for each cycle.
        l_values for infinite cycles are handled by _calculate_l

        Also caches the result on self.p_values.
        """
        l_values = self._calculate_l()
        p_values = np.exp(-np.exp(l_values))
        self.p_values = p_values
        return p_values

    def calculate_significance_for_persistence_diagram(self) -> np.ndarray:
        """
        This actually applies your corrected alpha to check which p are significant

        Returns a boolean array, True where the null is rejected. Raises
        ValueError for an unknown correction strategy.
        """
        p_values = self._calculate_p_values_for_persistence_diagram()

        if self.correction_strategy == "Bonferroni":
            rejected = p_values < self._correct_alpha()

        elif self.correction_strategy == "BH":
            # sort p-values, find largest k where p_(k) <= k/m * alpha
            m = len(p_values)
            order = np.argsort(p_values)
            sorted_p = p_values[order]
            bh_line = (np.arange(1, m + 1) / m) * self.alpha
            below = np.where(sorted_p <= bh_line)[0]
            rejected = np.zeros(m, dtype=bool)
            if len(below) > 0:
                rejected[order[:below[-1] + 1]] = True
        elif self.correction_strategy is None:
            rejected = p_values < self.alpha
        else:
            raise ValueError(
                f"Unknown multiple testing correction strategy "
                f"{self.correction_strategy}. Use 'Bonferroni' or 'BH'."
            )

        return rejected

    def results(self) -> dict:
        """
        Returns a dict with two entries.

        "results_array": (n_bars, 5) float array, one row per bar.
        Cols: birth, death, pi, p_value, significant

        "threshold": the death/birth ratio matching the alpha level that was
        actually applied (np.inf when nothing was rejected under BH).
        """
        rejected = self.calculate_significance_for_persistence_diagram()
        p_values = self.p_values

        births = self.dgm[:, 0]
        deaths = self.dgm[:, 1]
        with np.errstate(divide="ignore", invalid="ignore"):
            pi_values = np.where(
                np.isfinite(deaths),
                deaths / births,
                self.max_threshold / births,
            )

        if self.correction_strategy == "Bonferroni":
            alpha_thresh = self._correct_alpha()
        elif self.correction_strategy == "BH":
            # alpha_thresh = k/m * alpha where k = number of rejections
            # this is the BH threshold that was actually applied
            m = len(p_values)
            k = int(rejected.sum())
            if k > 0:
                alpha_thresh = (k / m) * self.alpha
            else:
                alpha_thresh = 0.0  # nothing rejected, threshold is below all bars
        else:
            # Only None reaches here; unknown strategies already raised above.
            alpha_thresh = self.alpha

        threshold = self._pi_min(alpha_thresh)

        return {
            "results_array": np.column_stack([
                births,
                deaths,
                pi_values,
                p_values,
                rejected.astype(float),
            ]),
            "threshold": threshold,
        }
|
|
File without changes
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
import gudhi
|
|
2
|
+
import numpy as np
|
|
3
|
+
import random
|
|
4
|
+
from scipy.spatial.distance import cdist
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def w_infinity(dgm_1: np.ndarray, dgm_2: np.ndarray) -> float:
    """
    Approximate bottleneck distance between two persistence diagrams.

    Both diagrams are converted to plain lists and handed to
    gudhi.bottleneck_distance with e=0.01.
    """
    first, second = dgm_1.tolist(), dgm_2.tolist()
    return gudhi.bottleneck_distance(first, second, e=0.01)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def hausdorff_directed(A: np.ndarray, B: np.ndarray) -> float:
    """
    Directed Hausdorff distance h(A, B) = max over a of min over b of |a - b|.

    Early-break scan following Algorithm 2 of "An Efficient Algorithm for
    Calculating the Exact Hausdorff Distance" by Taha & Hanbury. Both point
    sets are visited in random order.
    """
    shuffler = np.random.default_rng()
    sources = shuffler.permutation(A)
    targets = shuffler.permutation(B)

    running_max = 0.0
    for p in sources:
        nearest = np.inf
        for q in targets:
            gap = float(np.linalg.norm(p - q))
            if gap < running_max:
                # p already has a neighbour closer than the current maximum,
                # so it cannot raise it. Recording the gap (the paper omits
                # this) keeps the update below from firing.
                nearest = gap
                break
            nearest = min(nearest, gap)
        running_max = max(running_max, nearest)

    return running_max
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def hausdorff(A: np.ndarray, B: np.ndarray) -> float:
    """
    Symmetric Hausdorff distance: the larger of h(A, B) and h(B, A).
    """
    forward = hausdorff_directed(A, B)
    backward = hausdorff_directed(B, A)
    return max(forward, backward)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def hausdorff_directed_dist_matrix(
    A: np.ndarray, B: np.ndarray, D: np.ndarray = None
) -> float:
    """
    Directed Hausdorff distance h(A, B) read from a precomputed distance matrix.

    Taken from "An Efficient Algorithm for Calculating the Exact Hausdorff
    Distance" by Abdel Aziz Taha and Allan Hanbury.

    This does not need to compute euclidean distance as this is done for us.

    A : np.ndarray
        Row/column indices into D of the points in the first set.
    B : np.ndarray
        Row/column indices into D of the points in the second set.
    D : np.ndarray
        Pairwise distance matrix; D[i, j] is the distance between points i
        and j. Required — the default only exists so the original
        two-argument signature still parses.

    Raises ValueError when D is not supplied.

    NOTE(review): the original body referenced undefined names (A_idx, B_idx)
    and always failed; index arrays plus a separate matrix is the reading that
    matches both the body and the symmetric wrapper — confirm with the author.
    """
    if D is None:
        raise ValueError(
            "A precomputed distance matrix D is required: "
            "hausdorff_directed_dist_matrix(A_idx, B_idx, D)."
        )

    rng = np.random.default_rng()
    A_idx = rng.permutation(np.asarray(A, dtype=int))
    B_idx = rng.permutation(np.asarray(B, dtype=int))

    c_max = 0.0
    for i in A_idx:
        c_min = np.inf
        for j in B_idx:
            d = float(D[i, j])
            if d < c_max:
                # Point i cannot raise the maximum; stop scanning early.
                c_min = d
                break
            if d < c_min:
                c_min = d
        if c_min > c_max:
            c_max = c_min

    return c_max


def hausdorff_dist_matrix(
    A_idx: np.ndarray, B_idx: np.ndarray, D: np.ndarray = None
) -> float:
    """
    Symmetric Hausdorff distance H(A,B) = max(h(A,B), h(B,A))

    A_idx, B_idx index the two point sets inside the precomputed distance
    matrix D. Raises ValueError when D is not supplied.
    """
    return max(
        hausdorff_directed_dist_matrix(A_idx, B_idx, D),
        hausdorff_directed_dist_matrix(B_idx, A_idx, D),
    )
|
tdaphantom/tdaphantom.py
ADDED
|
@@ -0,0 +1,606 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import warnings
|
|
3
|
+
from tdaphantom.hypothesis_tests.universal_null_tests.universal_null_hypothesis_test import UNTest
|
|
4
|
+
from tdaphantom.hypothesis_tests.bottleneck_distance_tests.bottleneck_distance_test import BNTest
|
|
5
|
+
import matplotlib.pyplot as plt
|
|
6
|
+
import gudhi
|
|
7
|
+
from ripser import ripser
|
|
8
|
+
import pickle
|
|
9
|
+
import os
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Phantom:
|
|
13
|
+
"""
|
|
14
|
+
Persistence diagram container.
|
|
15
|
+
|
|
16
|
+
point_cloud : np.ndarray
|
|
17
|
+
Either an (n, d) point cloud or an (n, n) distance matrix.
|
|
18
|
+
|
|
19
|
+
is_distance_matrix : bool
|
|
20
|
+
Set True when point_cloud is a distance matrix.
|
|
21
|
+
|
|
22
|
+
dgms : dict[int, np.ndarray]
|
|
23
|
+
Persistence diagrams keyed by homological dimension.
|
|
24
|
+
Each value has shape (n, 2) — columns are [birth, death].
|
|
25
|
+
Populated by calculate_dgms_from_point_cloud.
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
def __init__(
|
|
29
|
+
self,
|
|
30
|
+
point_cloud: np.ndarray,
|
|
31
|
+
is_distance_matrix: bool = False,
|
|
32
|
+
):
|
|
33
|
+
self.pc: np.ndarray = point_cloud
|
|
34
|
+
self.is_dist: bool = is_distance_matrix
|
|
35
|
+
self.dgms: dict[int, np.ndarray] = {}
|
|
36
|
+
self._cached_results: dict = {}
|
|
37
|
+
self.k: int = None
|
|
38
|
+
|
|
39
|
+
self.allowed_methods: list[str] = [
|
|
40
|
+
"universal_null",
|
|
41
|
+
"universal_null:median",
|
|
42
|
+
"universal_null:mean",
|
|
43
|
+
"bottleneck",
|
|
44
|
+
"bottleneck:subsample",
|
|
45
|
+
# "bottleneck:shells",
|
|
46
|
+
# "bottleneck:density",
|
|
47
|
+
# "bottleneck:concentration",
|
|
48
|
+
]
|
|
49
|
+
self.allowed_methods_descriptions: dict[str, str] = {
|
|
50
|
+
"universal_null": "Alias for universal_null:median.",
|
|
51
|
+
"universal_null:median": "Assumes noise follows a Gumbel distribution (Bobrowski & Skraba); uses median normalisation.",
|
|
52
|
+
"universal_null:mean": "Assumes noise follows a Gumbel distribution (Bobrowski & Skraba); uses mean normalisation.",
|
|
53
|
+
"bottleneck": "Alias for bottleneck:subsample. All bottleneck methods aim to bound the bottleneck distance between your samples diagram and the ideal hypothetical diagram using confidence intervals. This is how the hypothesis test is designed.",
|
|
54
|
+
"bottleneck:subsample": "Bootstrap confidence band via subsampling (Fasy et al.).",
|
|
55
|
+
# "bottleneck:shells": "Bottleneck test using shell decomposition.",
|
|
56
|
+
# "bottleneck:density": "Bottleneck test using density estimation.",
|
|
57
|
+
# "bottleneck:concentration": "Bottleneck test using concentration inequalities.",
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
def __repr__(self) -> str:
|
|
61
|
+
sizes = {k: len(v) for k, v in self.dgms.items()}
|
|
62
|
+
return (
|
|
63
|
+
f"Phantom("
|
|
64
|
+
f"n_points={len(self.pc)}, "
|
|
65
|
+
f"is_distance_matrix={self.is_dist}, "
|
|
66
|
+
f"computed_dims={list(self.dgms.keys())}, "
|
|
67
|
+
f"diagram_sizes={sizes})"
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
def calculate_dgms_from_point_cloud(
    self,
    point_cloud: np.ndarray = None,
    is_distance_matrix: bool = None,
    max_dim: int = None,
    max_eps: float = None,
    k: int = None,
) -> dict[int, np.ndarray]:
    """
    Compute Vietoris-Rips persistence diagrams with gudhi for every
    homological dimension 0 … max_dim and store them in self.dgms.

    Writing calculate_dgms_from_point_cloud(k=2) yields exactly the
    diagrams needed for a subsequent hypothesis_test(k=2).

    point_cloud : np.ndarray, optional
        Overrides self.pc when provided.
    is_distance_matrix : bool, optional
        Overrides self.is_dist when provided.
    max_dim : int, optional
        Highest homological dimension to compute.
        Defaults to k if given, otherwise 1.
    max_eps : float, optional
        Maximum edge length for the Rips filtration. Defaults to the
        largest entry of the distance matrix, or the diameter of the
        point cloud. The value used is recorded as self.max_eps.
    k : int, optional
        Convenience alias: sets max_dim when max_dim is not
        explicitly supplied.

    Returns:
        dict[int, np.ndarray]
            Persistence diagrams keyed by homological dimension, with
            bars whose death is not greater than their birth removed.

    NOTE(review): the complex is built with sparse=0.3, which per the gudhi
    documentation requests a sparse (approximate) Rips complex — confirm
    this is intended.
    """
    cloud = self.pc if point_cloud is None else point_cloud
    as_matrix = self.is_dist if is_distance_matrix is None else is_distance_matrix
    if max_dim is None:
        max_dim = 1 if k is None else k

    # An unbounded filtration is deliberately avoided: with np.inf gudhi
    # would enumerate every simplex up to max_dim+1 and exhaust memory on
    # any moderately sized input, so the data's own extent is used instead.
    if max_eps is None:
        if as_matrix:
            max_eps = float(np.max(cloud))
        else:
            # The broadcasted difference below needs O(n²) memory.
            if len(cloud) > 5000:
                warnings.warn(
                    f"Computing the diameter of {len(cloud)} points "
                    f"requires an O(n²) distance matrix. Consider passing "
                    f"max_eps explicitly to avoid this.",
                    UserWarning,
                    stacklevel=2,
                )
            pairwise = np.linalg.norm(
                cloud[:, None, :] - cloud[None, :, :], axis=-1
            )
            max_eps = float(pairwise.max())

    self.max_eps = max_eps

    # Only the name of the data keyword differs between the two input kinds.
    data_keyword = "distance_matrix" if as_matrix else "points"
    rips = gudhi.RipsComplex(
        **{data_keyword: cloud.tolist()},
        max_edge_length=max_eps,
        sparse=0.3,
    )

    # Simplices one dimension above max_dim are needed to compute
    # homology up to degree max_dim.
    tree = rips.create_simplex_tree(max_dimension=max_dim + 1)
    tree.compute_persistence()

    self.dgms = {}
    for dim in range(max_dim + 1):
        bars = tree.persistence_intervals_in_dimension(dim)
        if len(bars) == 0:
            self.dgms[dim] = np.empty((0, 2))
            continue
        bars = np.array(bars, dtype=float)
        # Drop degenerate bars (numerical artefacts).
        self.dgms[dim] = bars[bars[:, 1] > bars[:, 0]]

    return self.dgms
|
|
164
|
+
|
|
165
|
+
def calculate_dgms_from_point_cloud_ripser(
    self,
    point_cloud: np.ndarray = None,
    is_distance_matrix: bool = None,
    max_dim: int = None,
    max_eps: float = None,
    k: int = None,
) -> dict[int, np.ndarray]:
    """
    Ripser backend for computing persistence diagrams. Equivalent
    interface to calculate_dgms_from_point_cloud but uses ripser
    instead of gudhi's RipsComplex.

    point_cloud : np.ndarray, optional
        Overrides self.pc when provided.
    is_distance_matrix : bool, optional
        Overrides self.is_dist when provided.
    max_dim : int, optional
        Highest homological dimension to compute.
        Defaults to k if given, otherwise 1.
    max_eps : float, optional
        Maximum edge length / filtration threshold, forwarded to ripser
        as ``thresh``. Defaults to np.inf (no truncation). The value used
        is recorded as self.max_eps.
    k : int, optional
        sets max_dim when max_dim is not
        explicitly supplied.

    Returns
        dict[int, np.ndarray]
            Persistence diagrams keyed by homological dimension,
            stored in ``self.dgms``, with bars whose death is not
            greater than their birth removed.
    """
    if point_cloud is None:
        point_cloud = self.pc
    if is_distance_matrix is None:
        is_distance_matrix = self.is_dist
    if max_dim is None:
        max_dim = k if k is not None else 1
    if max_eps is None:
        max_eps = np.inf  # same as ripser's own default threshold

    result = ripser(
        point_cloud,
        maxdim=max_dim,
        # Honour the documented max_eps; previously it was recorded on the
        # instance but never applied to the filtration.
        thresh=max_eps,
        distance_matrix=is_distance_matrix,
    )

    self.dgms = {}
    for dim, dgm in enumerate(result["dgms"]):
        if len(dgm) == 0:
            self.dgms[dim] = np.empty((0, 2))
        else:
            dgm = np.array(dgm, dtype=float)
            # Remove degenerate bars (numerical artefacts)
            self.dgms[dim] = dgm[dgm[:, 1] > dgm[:, 0]]

    self.max_eps = max_eps
    return self.dgms
|
|
229
|
+
|
|
230
|
+
def display_dgms(
|
|
231
|
+
self,
|
|
232
|
+
dgms: np.ndarray = None,
|
|
233
|
+
plot: str = "both",
|
|
234
|
+
) -> None:
|
|
235
|
+
"""
|
|
236
|
+
Plot the computed persistence diagrams.
|
|
237
|
+
|
|
238
|
+
Each homological dimension is drawn in a distinct colour. Call
|
|
239
|
+
calculate_dgms_from_point_cloud before this method or pass in a diagram.
|
|
240
|
+
|
|
241
|
+
plot : str
|
|
242
|
+
"diagram" — persistence diagram only (birth vs death).
|
|
243
|
+
"barcode" — barcode only (one horizontal bar per feature).
|
|
244
|
+
"both" — diagram and barcode side by side (default).
|
|
245
|
+
"""
|
|
246
|
+
if not self.dgms:
|
|
247
|
+
if not dgms:
|
|
248
|
+
raise ValueError(
|
|
249
|
+
"No persistence diagrams found. "
|
|
250
|
+
"Call calculate_dgms_from_point_cloud first."
|
|
251
|
+
"Or pass in your own diagram with the form {0: dgm_0, 1: dgm_2, ...}"
|
|
252
|
+
)
|
|
253
|
+
if plot not in ("diagram", "barcode", "both"):
|
|
254
|
+
raise ValueError(
|
|
255
|
+
f"plot must be 'diagram', 'barcode', or 'both', got {plot!r}."
|
|
256
|
+
)
|
|
257
|
+
|
|
258
|
+
dims = sorted(self.dgms.keys())
|
|
259
|
+
colours = plt.cm.tab10.colors
|
|
260
|
+
|
|
261
|
+
n_cols = 2 if plot == "both" else 1
|
|
262
|
+
fig, axes = plt.subplots(
|
|
263
|
+
1, n_cols, figsize=(6 * n_cols, 5), squeeze=False)
|
|
264
|
+
|
|
265
|
+
all_finite = np.concatenate(
|
|
266
|
+
[dgm[np.isfinite(dgm[:, 1])] for dgm in self.dgms.values()
|
|
267
|
+
if len(dgm) > 0],
|
|
268
|
+
axis=0,
|
|
269
|
+
) if any(len(d) > 0 for d in self.dgms.values()) else np.empty((0, 2))
|
|
270
|
+
|
|
271
|
+
lim = all_finite[:, 1].max() * 1.05 if len(all_finite) else 1.0
|
|
272
|
+
|
|
273
|
+
if plot in ("diagram", "both"):
|
|
274
|
+
ax = axes[0, 0]
|
|
275
|
+
ax.plot([0, lim], [0, lim], "k--", lw=0.8,
|
|
276
|
+
alpha=0.4, label="diagonal")
|
|
277
|
+
|
|
278
|
+
for dim in dims:
|
|
279
|
+
dgm = self.dgms[dim]
|
|
280
|
+
if len(dgm) == 0:
|
|
281
|
+
continue
|
|
282
|
+
births = dgm[:, 0]
|
|
283
|
+
deaths = dgm[:, 1].copy()
|
|
284
|
+
|
|
285
|
+
inf_mask = ~np.isfinite(deaths)
|
|
286
|
+
deaths[inf_mask] = lim
|
|
287
|
+
colour = colours[dim % len(colours)]
|
|
288
|
+
ax.scatter(
|
|
289
|
+
births, deaths,
|
|
290
|
+
s=10, alpha=0.8, color=colour,
|
|
291
|
+
label=f"H_{dim} ({len(dgm)})",
|
|
292
|
+
zorder=3,
|
|
293
|
+
)
|
|
294
|
+
|
|
295
|
+
if inf_mask.any():
|
|
296
|
+
ax.scatter(
|
|
297
|
+
births[inf_mask], deaths[inf_mask],
|
|
298
|
+
s=30, marker="^", color=colour, zorder=4,
|
|
299
|
+
)
|
|
300
|
+
|
|
301
|
+
ax.set_xlabel("birth")
|
|
302
|
+
ax.set_ylabel("death")
|
|
303
|
+
ax.set_title("Persistence diagram")
|
|
304
|
+
ax.set_aspect("equal")
|
|
305
|
+
ax.set_xlim(0, lim)
|
|
306
|
+
ax.set_ylim(0, lim)
|
|
307
|
+
ax.legend(fontsize=8)
|
|
308
|
+
|
|
309
|
+
if plot in ("barcode", "both"):
|
|
310
|
+
ax = axes[0, 1 if plot == "both" else 0]
|
|
311
|
+
|
|
312
|
+
rank = 0
|
|
313
|
+
tick_positions = []
|
|
314
|
+
tick_labels = []
|
|
315
|
+
|
|
316
|
+
for dim in dims:
|
|
317
|
+
dgm = self.dgms[dim]
|
|
318
|
+
if len(dgm) == 0:
|
|
319
|
+
continue
|
|
320
|
+
colour = colours[dim % len(colours)]
|
|
321
|
+
|
|
322
|
+
pers = dgm[:, 1] - dgm[:, 0]
|
|
323
|
+
order = np.argsort(pers)[::-1]
|
|
324
|
+
dim_start = rank
|
|
325
|
+
|
|
326
|
+
for idx in order:
|
|
327
|
+
birth = dgm[idx, 0]
|
|
328
|
+
death = dgm[idx, 1] if np.isfinite(dgm[idx, 1]) else lim
|
|
329
|
+
ax.hlines(rank, birth, death, colors=colour,
|
|
330
|
+
linewidth=1.5, alpha=0.8)
|
|
331
|
+
rank += 1
|
|
332
|
+
|
|
333
|
+
mid = (dim_start + rank - 1) / 2
|
|
334
|
+
tick_positions.append(mid)
|
|
335
|
+
tick_labels.append(f"H_{dim}")
|
|
336
|
+
|
|
337
|
+
ax.set_xlabel("filtration value epsilon")
|
|
338
|
+
ax.set_yticks(tick_positions)
|
|
339
|
+
ax.set_yticklabels(tick_labels)
|
|
340
|
+
ax.set_title("Barcode")
|
|
341
|
+
ax.invert_yaxis()
|
|
342
|
+
|
|
343
|
+
plt.tight_layout()
|
|
344
|
+
plt.show()
|
|
345
|
+
|
|
346
|
+
def hypothesis_test(
    self,
    alpha: float = 0.05,
    methods: list[str] = None,
    correction_method: str = "BH",
    k: int = 1,
) -> dict:
    """
    Run significance tests on the persistence diagram for dimension k.

    A persistence diagram for dimension k must already be stored in
    self.dgms, i.e. the diagrams must have been computed up to at least
    homological dimension k before this method is called.

    Parameters
    ----------
    alpha : float
        Significance level in (0, 1). Default 0.05.
    methods : list[str] or str, optional
        One or more of self.allowed_methods. A single method name may be
        passed as a plain string. Defaults to
        ["universal_null", "bottleneck"]. "universal_null" is treated as
        "universal_null:median" and "bottleneck" as "bottleneck:subsample".
    correction_method : str
        Multiple-testing correction strategy passed to the universal-null
        test objects.
    k : int
        Homological dimension to test. Default 1.

    Returns
    -------
    dict
        Keyed by the canonical method name (e.g. "universal_null:median");
        each value is the dict returned by the corresponding test's
        .results() method. The same dict is cached on the instance as
        self._cached_results.

    Raises
    ------
    TypeError
        If k is not a non-negative int, or alpha is not a float.
    ValueError
        If alpha is outside (0, 1), a method name is not allowed, or no
        diagram is stored for dimension k.
    """
    if not isinstance(k, int) or k < 0:
        raise TypeError(
            f"k must be a non-negative integer homological dimension, got {k!r}."
        )
    if not isinstance(alpha, float):
        raise TypeError(
            f"alpha must be a float, got {type(alpha).__name__}."
        )
    if alpha <= 0 or alpha >= 1:
        raise ValueError(
            f"alpha must be strictly between 0 and 1, got {alpha}."
        )

    if methods is None:
        methods = ["universal_null", "bottleneck"]
    elif isinstance(methods, str):
        # A bare string would otherwise be iterated character by character
        # and rejected with a confusing "unknown method" message.
        methods = [methods]
    else:
        # Materialise once so one-shot iterables survive repeated lookups.
        methods = list(methods)

    invalid = [m for m in methods if m not in self.allowed_methods]
    if invalid:
        raise ValueError(
            f"Unknown method(s): {invalid}. "
            f"Allowed methods: {self.allowed_methods}."
        )

    if k not in self.dgms:
        raise ValueError(
            f"No persistence diagram found for dimension k={k}. "
            f"Call calculate_dgms_from_point_cloud(k={k}) first."
        )

    self.k = k
    dgm_k = self.dgms[k]
    results = {}

    # (canonical name, accepted spellings); tuple order fixes the key order
    # of the returned dict.
    universal_null_variants = (
        ("universal_null:median", ("universal_null", "universal_null:median")),
        ("universal_null:mean", ("universal_null:mean",)),
    )
    bottleneck_variants = (
        ("bottleneck:subsample", ("bottleneck", "bottleneck:subsample")),
        ("bottleneck:shells", ("bottleneck:shells",)),
        ("bottleneck:density", ("bottleneck:density",)),
        ("bottleneck:concentration", ("bottleneck:concentration",)),
    )

    for canonical, spellings in universal_null_variants:
        if not any(name in methods for name in spellings):
            continue
        test = UNTest(
            dgm=dgm_k,
            k=k,
            alpha=alpha,
            correction_strategy=correction_method,
            method=canonical,
            max_threshold=self.max_eps,
        )
        results[canonical] = test.results()

    for canonical, spellings in bottleneck_variants:
        if not any(name in methods for name in spellings):
            continue
        test = BNTest(
            point_cloud=self.pc,
            dgm=dgm_k,
            # Forward the requested dimension; without it BNTest silently
            # falls back to its own default of k=1.
            k=k,
            alpha=alpha,
            method=canonical,
            is_distance_matrix=self.is_dist,
        )
        results[canonical] = test.results()

    self._cached_results = results
    return results
|
|
455
|
+
|
|
456
|
+
def display_results(
    self,
    results: dict = None,
    method: str = "all",
    plot: str = "both",
) -> None:
    """
    Visualise hypothesis test results.

    One row of axes is drawn per method. Each row shows a significance
    persistence diagram, a significance barcode, or both side by side.
    Column 0 of each results array is read as birth, column 1 as death
    and column 4 as the significance flag.

    Parameters
    ----------
    results : dict, optional
        Output of hypothesis_test. Uses the most recent cached run
        when omitted.
    method : str
        Which method to plot, or "all" for every method in results.
    plot : str
        "diagram", "barcode", or "both".

    Raises
    ------
    ValueError
        If there is nothing to display, results is not a dict, plot is
        not a recognised option, or method is not a key of results.
    """
    if results is None:
        # getattr: an instance that never ran hypothesis_test may not
        # carry the cache attribute at all.
        cached = getattr(self, "_cached_results", None)
        if not cached:
            raise ValueError(
                "No results to display. Pass a results dict or call "
                "hypothesis_test first."
            )
        results = cached

    if not isinstance(results, dict):
        raise ValueError(
            "results must be a dict returned by hypothesis_test.")

    if plot not in ("diagram", "barcode", "both"):
        raise ValueError(
            f"plot must be 'diagram', 'barcode', or 'both', got {plot!r}."
        )

    available = list(results.keys())
    if method == "all":
        methods_to_plot = available
    else:
        if method not in available:
            raise ValueError(
                f"method {method!r} not found in results. Available: {available}."
            )
        methods_to_plot = [method]

    if not methods_to_plot:
        # Fail with a clear message instead of asking for a 0-row figure.
        raise ValueError("results is empty; there is nothing to display.")

    n_cols = 2 if plot == "both" else 1
    n_rows = len(methods_to_plot)

    fig, axes = plt.subplots(
        n_rows, n_cols,
        figsize=(6 * n_cols, 5 * n_rows),
        squeeze=False,
    )
    fig.suptitle(f"H_{self.k} persistence results", fontsize=14)

    for row, mname in enumerate(methods_to_plot):
        res = results[mname]
        results_array = res["results_array"]
        thr = res.get("threshold", np.nan)
        if thr is None:
            # Treat an explicit None like a missing threshold.
            thr = np.nan

        births = results_array[:, 0]
        deaths = results_array[:, 1]
        pers = deaths - births
        sig = results_array[:, 4].astype(bool)

        # The axis limit is shared by both plot types, so compute it once
        # per method rather than only inside the diagram branch.
        finite_mask = np.isfinite(deaths)
        finite_deaths = deaths[finite_mask]
        lim = finite_deaths.max() * 1.05 if len(finite_deaths) else 1.0
        if not lim > 0:
            lim = 1.0
        # Non-finite deaths cannot be drawn; pin them to the axis limit.
        plot_deaths = np.where(finite_mask, deaths, lim)

        if plot in ("diagram", "both"):
            ax = axes[row, 0]
            xs = np.linspace(0, lim, 300)

            ax.plot([0, lim], [0, lim], "k--", lw=0.8,
                    alpha=0.4, label="diagonal")

            if not np.isnan(thr):
                if np.isinf(thr):
                    # inf threshold = everything is noise: shade the entire upper triangle
                    ax.fill_between(xs, xs, lim,
                                    color="steelblue", alpha=0.07, label="noise band (all)")
                elif "universal_null" in mname:
                    # Threshold is a multiplicative ratio: death = (pi*)(birth)
                    ax.plot(xs, thr * xs, color="steelblue", lw=1.2,
                            linestyle="--", alpha=0.7, label=f"pi_min = {thr:.2f}")
                    ax.fill_between(xs, xs, thr * xs,
                                    color="steelblue", alpha=0.07, label="noise band")
                else:
                    # Threshold is an additive offset: death = birth + 2c_n
                    ax.plot(xs, xs + thr, color="steelblue", lw=1.2,
                            linestyle="--", alpha=0.7, label=f"2c_n = {thr:.3f}")
                    ax.fill_between(xs, xs, xs + thr,
                                    color="steelblue", alpha=0.07, label="noise band")

            ax.scatter(births[~sig], plot_deaths[~sig], s=8, alpha=0.4,
                       color="steelblue", label="noise")
            ax.scatter(births[sig], plot_deaths[sig], s=9, alpha=0.9,
                       color="crimson", label=f"significant ({sig.sum()})", zorder=5)

            ax.set_xlabel("birth")
            ax.set_ylabel("death")
            ax.set_title(
                f"{mname} — significance persistence diagram ({sig.sum()} significant)")
            ax.set_aspect("equal")
            ax.set_xlim(0, lim)
            ax.set_ylim(0, lim)
            ax.legend(fontsize=8)

        if plot in ("barcode", "both"):
            # The barcode takes the last column: 1 for "both", otherwise 0.
            ax = axes[row, n_cols - 1]
            # Longest bars first; ranking uses the raw persistence.
            order = np.argsort(pers)[::-1]

            for rank, idx in enumerate(order):
                is_sig = sig[idx]
                ax.hlines(
                    rank, births[idx], plot_deaths[idx],
                    colors="crimson" if is_sig else "steelblue",
                    linewidth=3.5 if is_sig else 1.0,
                    alpha=0.9 if is_sig else 0.25,
                )

            ax.set_xlabel("filtration value epsilon")
            ax.set_ylabel("bar rank")
            ax.set_title(
                f"{mname} — significance barcode ({sig.sum()} significant)")
            ax.invert_yaxis()

    plt.tight_layout()
    plt.show()
|
|
582
|
+
|
|
583
|
+
def save(self, path: str) -> None:
    """
    Pickle this instance to ``path``.

    The directory that will contain the file is created first if it does
    not exist yet. A confirmation line is printed once the file is written.
    """
    parent_dir = os.path.dirname(os.path.abspath(path))
    os.makedirs(parent_dir, exist_ok=True)
    with open(path, "wb") as handle:
        pickle.dump(self, handle)
    print(f"Saved Phantom to {path}")
|
|
591
|
+
|
|
592
|
+
@classmethod
def load(cls, path: str) -> "Phantom":
    """
    Restore a pickled instance from ``path``.

    Raises FileNotFoundError when ``path`` does not exist and TypeError
    when the unpickled object is not an instance of ``cls``.

    NOTE: unpickling can execute arbitrary code, and the type check only
    runs after the object has been loaded. Only load files from a trusted
    source.
    """
    if os.path.exists(path):
        with open(path, "rb") as handle:
            restored = pickle.load(handle)
        if isinstance(restored, cls):
            print(f"Loaded Phantom from {path}")
            return restored
        raise TypeError(
            f"Loaded object is {type(restored).__name__}, expected Phantom."
        )
    raise FileNotFoundError(f"No Phantom file found at {path!r}.")
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tdaphantom
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Statistical hypothesis testing for persistence diagrams and barcodes
|
|
5
|
+
Author: W. Moriarty
|
|
6
|
+
License: MIT
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Classifier: Topic :: Scientific/Engineering :: Mathematics
|
|
11
|
+
Requires-Python: >=3.10
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
License-File: LICENSE
|
|
14
|
+
Requires-Dist: numpy>=1.26
|
|
15
|
+
Requires-Dist: matplotlib>=3.7
|
|
16
|
+
Requires-Dist: gudhi>=3.11.0
|
|
17
|
+
Requires-Dist: ripser>=0.6.14
|
|
18
|
+
Dynamic: author
|
|
19
|
+
Dynamic: classifier
|
|
20
|
+
Dynamic: description
|
|
21
|
+
Dynamic: description-content-type
|
|
22
|
+
Dynamic: license
|
|
23
|
+
Dynamic: license-file
|
|
24
|
+
Dynamic: requires-dist
|
|
25
|
+
Dynamic: requires-python
|
|
26
|
+
Dynamic: summary
|
|
27
|
+
|
|
28
|
+
# TDA-PHANTOM
|
|
29
|
+
Topological data analysis -
|
|
30
|
+
Persistent Homology Analysis via Null Testing On Manifolds (TDA-PHANTOM)
|
|
31
|
+
is a tool for statistically analysing significance of persistence diagrams and barcodes.
|
|
32
|
+
|
|
33
|
+
This project implements hypothesis tests from:
|
|
34
|
+
|
|
35
|
+
- *Confidence Sets for Persistence Diagrams*
|
|
36
|
+
Fasy et al. (2014)
|
|
37
|
+
DOI: https://doi.org/10.1214/14-AOS1252
|
|
38
|
+
|
|
39
|
+
- *A Universal Null-Distribution for Topological Data Analysis*
|
|
40
|
+
Bobrowski and Skraba (2023)
|
|
41
|
+
DOI: https://doi.org/10.1038/s41598-023-37842-2
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
## Installation
|
|
45
|
+
|
|
46
|
+
Via [PyPI](https://pypi.org/project/tdaphantom/):
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
pip install tdaphantom
|
|
50
|
+
```
|
|
51
|
+
Or you can clone this repository and install it manually:
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
python setup.py install
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## Overview
|
|
58
|
+
|
|
59
|
+
This tool can build a Vietoris-Rips complex from either a point cloud or distance matrix.
|
|
60
|
+
|
|
61
|
+
It can then be used to visualise the persistence diagram for that complex, and run various hypothesis tests for it.
|
|
62
|
+
|
|
63
|
+
The results of these hypothesis tests can be analysed via the returned results array, or visualised in a significance persistence diagram.
|
|
64
|
+
|
|
65
|
+
## Example Usage
|
|
66
|
+
|
|
67
|
+
### Generate data
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
def _make_circle(n=2000, noise=0.03, seed=1):
|
|
71
|
+
rng = np.random.default_rng(seed)
|
|
72
|
+
theta = rng.uniform(0, 2 * np.pi, n)
|
|
73
|
+
pts = np.stack([np.cos(theta), np.sin(theta)], axis=1)
|
|
74
|
+
return pts + rng.normal(0, noise, pts.shape)
|
|
75
|
+
|
|
76
|
+
pc_circle = _make_circle()
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
### Init Phantom class
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
phantom_circle = Phantom(pc_circle)
|
|
83
|
+
```
|
|
84
|
+
### Calculate persistence diagram
|
|
85
|
+
Here we go up to homological dimension 1
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
phantom_circle.calculate_dgms_from_point_cloud_ripser(k=1)
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
### Display persistence diagram
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
phantom_circle.display_dgms()
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+

|
|
99
|
+
|
|
100
|
+
### Run hypothesis test
|
|
101
|
+
```python
|
|
102
|
+
alpha = 0.01
|
|
103
|
+
correction = None
|
|
104
|
+
methods = ["universal_null"]
|
|
105
|
+
|
|
106
|
+
phantom_circle.hypothesis_test(alpha, correction_method=correction, methods=methods, k=1)
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
### Display significance persistence diagram
|
|
110
|
+
|
|
111
|
+
```python
|
|
112
|
+
phantom_circle.display_results()
|
|
113
|
+
```
|
|
114
|
+

|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
## Basic usage
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
## Available methods
|
|
121
|
+
|
|
122
|
+
## Universal null median
|
|
123
|
+
### Usage
|
|
124
|
+
### Theory
|
|
125
|
+
|
|
126
|
+
## Universal null mean
|
|
127
|
+
### Usage
|
|
128
|
+
### Theory
|
|
129
|
+
|
|
130
|
+
## Bottleneck subsampling
|
|
131
|
+
### Usage
|
|
132
|
+
### Theory
|
|
133
|
+
|
|
134
|
+
## TODO
|
|
135
|
+
|
|
136
|
+
* Add bottleneck shells
|
|
137
|
+
* Add bottleneck density
|
|
138
|
+
* Add bottleneck concentration
|
|
139
|
+
* Add more integration tests
|
|
140
|
+
* Add more unit tests
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
tdaphantom/__init__.py,sha256=iJDuo6aPdl6Qem3GwpjEdn9EX_XKjPwY781CetJLSjI,33
|
|
2
|
+
tdaphantom/tdaphantom.py,sha256=eFKmHGkwMiGN9RCPYglmiO-oksrvGME4HJXVfg4TXos,22860
|
|
3
|
+
tdaphantom/hypothesis_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
+
tdaphantom/hypothesis_tests/bottleneck_distance_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
|
+
tdaphantom/hypothesis_tests/bottleneck_distance_tests/bottleneck_distance_test.py,sha256=a8PWjB5GxtqpmOaXI9jNsleBJe0Cib088INF4hAaszI,5789
|
|
6
|
+
tdaphantom/hypothesis_tests/universal_null_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
|
+
tdaphantom/hypothesis_tests/universal_null_tests/universal_null_hypothesis_test.py,sha256=ii-tTrhOUr93NWhciy825KnFB0DMnQg3l7VMKAgIuCM,8427
|
|
8
|
+
tdaphantom/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
9
|
+
tdaphantom/metrics/metrics.py,sha256=YqnB2lEVnw0XR-0Lf0KOtIFydRsvzHGZAyaLq2865Do,2112
|
|
10
|
+
tdaphantom-1.0.0.dist-info/licenses/LICENSE,sha256=Ard-MXyIkSN7P8HKKqWGjY0pe9DVSWds26caXlVraI8,1065
|
|
11
|
+
tdaphantom-1.0.0.dist-info/METADATA,sha256=eRDFEFGSmmDXdZupp_lBlDXMILTk72RDSc5UYLaVlNc,3238
|
|
12
|
+
tdaphantom-1.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
13
|
+
tdaphantom-1.0.0.dist-info/top_level.txt,sha256=e3T5mrvKbsHzwq_0yhGFQUQXt5Gn1adeLHjG0ZcqzxQ,11
|
|
14
|
+
tdaphantom-1.0.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Moriarty
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
tdaphantom
|