synthetic-graph-benchmarks 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- synthetic_graph_benchmarks/__init__.py +23 -0
- synthetic_graph_benchmarks/benchmarks.py +85 -0
- synthetic_graph_benchmarks/dataset.py +47 -0
- synthetic_graph_benchmarks/dist_helper.py +222 -0
- synthetic_graph_benchmarks/spectre_utils.py +1230 -0
- synthetic_graph_benchmarks/utils.py +56 -0
- synthetic_graph_benchmarks-0.1.0.dist-info/METADATA +227 -0
- synthetic_graph_benchmarks-0.1.0.dist-info/RECORD +11 -0
- synthetic_graph_benchmarks-0.1.0.dist-info/WHEEL +4 -0
- synthetic_graph_benchmarks-0.1.0.dist-info/entry_points.txt +2 -0
- synthetic_graph_benchmarks-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,23 @@
|
|
1
|
+
from synthetic_graph_benchmarks.benchmarks import (
|
2
|
+
benchmark_sbm_results,
|
3
|
+
benchmark_planar_results,
|
4
|
+
benchmark_tree_results,
|
5
|
+
)
|
6
|
+
|
7
|
+
# Package version string; main() appends it to the banner it prints.
__version__ = "0.1.0"
|
8
|
+
|
9
|
+
def main():
    """Print the package banner and the names of the benchmark helpers.

    Main entry point for the CLI. It only writes to stdout and returns None.
    """
    banner_lines = [
        "Synthetic Graph Benchmarks v" + __version__,
        "For usage examples, see: https://github.com/peteole/synthetic_graph_benchmarks",
        "Available benchmark functions:",
    ]
    for banner_line in banner_lines:
        print(banner_line)

    # One bullet per benchmark function, in the order they are imported above.
    for func_name in (
        "benchmark_sbm_results",
        "benchmark_planar_results",
        "benchmark_tree_results",
    ):
        print(f" - {func_name}()")
|
17
|
+
|
18
|
+
# Names exported by `from synthetic_graph_benchmarks import *`.
# main() is intentionally or accidentally absent from this list; it stays
# reachable as an attribute of the package either way.
__all__ = [
    "benchmark_sbm_results",
    "benchmark_planar_results",
    "benchmark_tree_results",
    "__version__",
]
|
@@ -0,0 +1,85 @@
|
|
1
|
+
from typing import TypedDict
|
2
|
+
from synthetic_graph_benchmarks.dataset import Dataset
|
3
|
+
from synthetic_graph_benchmarks.spectre_utils import (
|
4
|
+
SBMSamplingMetrics,
|
5
|
+
PlanarSamplingMetrics,
|
6
|
+
TreeSamplingMetrics,
|
7
|
+
)
|
8
|
+
import networkx as nx
|
9
|
+
|
10
|
+
class PlanarBenchmarkResults(TypedDict):
    """Typed view of the mapping returned by ``benchmark_planar_results``.

    NOTE(review): the values are produced by ``PlanarSamplingMetrics.forward``
    in ``spectre_utils``, which is not visible from this module; the grouping
    comments below are inferred from the key names and should be confirmed.
    """

    # Presumably per-statistic distances between generated and reference graphs.
    degree: float
    wavelet: float
    spectre: float
    clustering: float
    orbit: float
    # Presumably the fraction of generated graphs that are planar.
    planar_acc: float
    # Presumably each statistic divided by the value obtained for the
    # reference ("test") metrics, plus the mean of those ratios.
    degree_ratio: float
    clustering_ratio: float
    orbit_ratio: float
    spectre_ratio: float
    wavelet_ratio: float
    average_ratio: float
|
23
|
+
|
24
|
+
class SBMBenchmarkResults(TypedDict):
    """Typed view of the mapping returned by ``benchmark_sbm_results``.

    NOTE(review): the values are produced by ``SBMSamplingMetrics.forward`` in
    ``spectre_utils``, which is not visible from this module; the grouping
    comments below are inferred from the key names and should be confirmed.
    """

    # Presumably per-statistic distances between generated and reference graphs.
    degree: float
    wavelet: float
    spectre: float
    clustering: float
    orbit: float
    # Presumably the fraction of generated graphs accepted as valid SBM graphs.
    sbm_acc: float
    # Presumably uniqueness / novelty fractions of the generated sample.
    sampling_frac_unique: float
    sampling_frac_unique_non_iso: float
    # The "unic" spelling is part of the key name; do not "fix" it here without
    # also changing whatever produces the key.
    sampling_frac_unic_non_iso_valid: float
    sampling_frac_non_iso: float
    # Presumably each statistic divided by the value obtained for the
    # reference ("test") metrics, plus the mean of those ratios.
    degree_ratio: float
    clustering_ratio: float
    orbit_ratio: float
    spectre_ratio: float
    wavelet_ratio: float
    average_ratio: float
|
41
|
+
|
42
|
+
class TreeBenchmarkResults(TypedDict):
    """Typed view of the mapping returned by ``benchmark_tree_results``.

    NOTE(review): the values are produced by ``TreeSamplingMetrics.forward`` in
    ``spectre_utils``, which is not visible from this module; the grouping
    comments below are inferred from the key names and should be confirmed.
    """

    # Presumably per-statistic distances between generated and reference graphs.
    degree: float
    wavelet: float
    spectre: float
    clustering: float
    orbit: float
    # NOTE(review): a "planar_acc" key on the tree result looks copied from the
    # planar class - confirm which accuracy key TreeSamplingMetrics emits.
    planar_acc: float
    # Presumably uniqueness / novelty fractions of the generated sample.
    sampling_frac_unique: float
    sampling_frac_unique_non_iso: float
    # The "unic" spelling is part of the key name; do not "fix" it here without
    # also changing whatever produces the key.
    sampling_frac_unic_non_iso_valid: float
    sampling_frac_non_iso: float
    # Unlike the planar and SBM results, no clustering/orbit ratios are declared.
    degree_ratio: float
    spectre_ratio: float
    wavelet_ratio: float
    average_ratio: float
|
57
|
+
|
58
|
+
|
59
|
+
|
60
|
+
def benchmark_sbm_results(generated_graphs: list[nx.Graph]) -> SBMBenchmarkResults:
    """Score ``generated_graphs`` with the SBM sampling metrics.

    The SBM dataset is loaded, the metrics are first evaluated on the
    dataset's training graphs, and that result is then supplied as the
    ``"test"`` reference when evaluating ``generated_graphs``.

    Args:
        generated_graphs: Graphs produced by the model under evaluation.

    Returns:
        Whatever ``SBMSamplingMetrics.forward`` returns for the generated
        graphs (declared here as ``SBMBenchmarkResults``).
    """
    dataset = Dataset.load_sbm()
    sampler = SBMSamplingMetrics(dataset)
    # Reference values come from the training split, not from a test split.
    reference = sampler.forward(dataset.train_graphs, test=True)
    scores = sampler.forward(
        generated_graphs,
        ref_metrics={"test": reference},
        test=True,
    )
    return scores  # type: ignore
|
68
|
+
|
69
|
+
def benchmark_planar_results(generated_graphs: list[nx.Graph]) -> PlanarBenchmarkResults:
    """Score ``generated_graphs`` with the Planar sampling metrics.

    The Planar dataset is loaded, the metrics are first evaluated on the
    dataset's training graphs, and that result is then supplied as the
    ``"test"`` reference when evaluating ``generated_graphs``.

    Args:
        generated_graphs: Graphs produced by the model under evaluation.

    Returns:
        Whatever ``PlanarSamplingMetrics.forward`` returns for the generated
        graphs (declared here as ``PlanarBenchmarkResults``).
    """
    dataset = Dataset.load_planar()
    sampler = PlanarSamplingMetrics(dataset)
    # Reference values come from the training split, not from a test split.
    reference = sampler.forward(dataset.train_graphs, test=True)
    scores = sampler.forward(
        generated_graphs,
        ref_metrics={"test": reference},
        test=True,
    )
    return scores  # type: ignore
|
77
|
+
|
78
|
+
def benchmark_tree_results(generated_graphs: list[nx.Graph]) -> TreeBenchmarkResults:
    """Score ``generated_graphs`` with the Tree sampling metrics.

    The Tree dataset is loaded, the metrics are first evaluated on the
    dataset's training graphs, and that result is then supplied as the
    ``"test"`` reference when evaluating ``generated_graphs``.

    Args:
        generated_graphs: Graphs produced by the model under evaluation.

    Returns:
        Whatever ``TreeSamplingMetrics.forward`` returns for the generated
        graphs (declared here as ``TreeBenchmarkResults``).
    """
    dataset = Dataset.load_tree()
    sampler = TreeSamplingMetrics(dataset)
    # Reference values come from the training split, not from a test split.
    reference = sampler.forward(dataset.train_graphs, test=True)
    scores = sampler.forward(
        generated_graphs,
        ref_metrics={"test": reference},
        test=True,
    )
    return scores  # type: ignore
|
@@ -0,0 +1,47 @@
|
|
1
|
+
|
2
|
+
from dataclasses import dataclass
|
3
|
+
|
4
|
+
import networkx
|
5
|
+
import pickle
|
6
|
+
|
7
|
+
from synthetic_graph_benchmarks.utils import download_file
|
8
|
+
|
9
|
+
|
10
|
+
@dataclass
class Dataset:
    """Container for the train, validation and (optional) test graph splits."""

    train_graphs: list[networkx.Graph]
    val_graphs: list[networkx.Graph]
    # Stays None when the loaded mapping has no 'test' entry.
    test_graphs: list[networkx.Graph] | None = None

    @classmethod
    def load_from_pickle_url(cls, url: str):
        """Download a pickled dataset and wrap it in a ``Dataset``.

        The file is fetched with ``download_file(url, "data")`` and must
        unpickle to a mapping with ``'train'`` and ``'val'`` entries; a
        ``'test'`` entry is optional.

        Args:
            url (str): The URL of the pickle file containing the dataset.

        Returns:
            Dataset: An instance of the Dataset class with loaded graphs.

        Raises:
            KeyError: If the mapping lacks ``'train'`` or ``'val'``.
        """
        local_path = download_file(url, "data")
        # SECURITY: unpickling executes code chosen by whoever controls the
        # file. Only call this with URLs whose content is trusted.
        with open(local_path, "rb") as handle:
            payload = pickle.load(handle)
        return cls(
            train_graphs=payload["train"],
            val_graphs=payload["val"],
            test_graphs=payload.get("test"),
        )

    @classmethod
    def load_sbm(cls):
        """Load the SBM dataset from its fixed GitHub URL."""
        sbm_url = "https://raw.githubusercontent.com/AndreasBergmeister/graph-generation/main/data/sbm.pkl"
        return cls.load_from_pickle_url(sbm_url)

    @classmethod
    def load_planar(cls):
        """Load the Planar dataset from its fixed GitHub URL."""
        planar_url = "https://raw.githubusercontent.com/AndreasBergmeister/graph-generation/main/data/planar.pkl"
        return cls.load_from_pickle_url(planar_url)

    @classmethod
    def load_tree(cls):
        """Load the Tree dataset from its fixed GitHub URL."""
        tree_url = "https://raw.githubusercontent.com/AndreasBergmeister/graph-generation/main/data/tree.pkl"
        return cls.load_from_pickle_url(tree_url)
|
@@ -0,0 +1,222 @@
|
|
1
|
+
###############################################################################
|
2
|
+
#
|
3
|
+
# Adapted from https://github.com/lrjconan/GRAN/ which in turn is adapted from https://github.com/JiaxuanYou/graph-generation
|
4
|
+
#
|
5
|
+
###############################################################################
|
6
|
+
import numpy as np
|
7
|
+
import concurrent.futures
|
8
|
+
from functools import partial
|
9
|
+
from scipy.linalg import toeplitz
|
10
|
+
from scipy.stats import wasserstein_distance
|
11
|
+
from scipy.optimize import linprog
|
12
|
+
|
13
|
+
|
14
|
+
def _compute_emd_with_distance_matrix(x, y, distance_mat):
    """Compute the earth mover's distance for a custom ground-distance matrix.

    The transport problem is solved as a linear program with
    ``scipy.optimize.linprog``; it is intended as a scipy-only stand-in for
    ``pyemd.emd``.

    Args:
        x: 1-D numpy array of non-negative weights (source histogram).
        y: 1-D numpy array of non-negative weights (target histogram).
        distance_mat: 2-D array; entry ``[i, j]`` is the cost of moving one
            unit of mass from bin ``i`` of ``x`` to bin ``j`` of ``y``. Only
            the top-left ``len(x) x len(y)`` corner is used.

    Returns:
        The minimal transport cost. If the solver fails or raises, the
        result of ``scipy.stats.wasserstein_distance`` on the bin indices is
        returned instead.

    Raises:
        ValueError: Propagated from the fallback when it cannot handle the
            weights either (e.g. one histogram has zero total mass while the
            other does not).
    """
    x = x.astype(float)
    y = y.astype(float)

    # Normalise each histogram to unit mass; an all-zero histogram is left
    # untouched to avoid dividing by zero.
    x_mass = np.sum(x)
    if x_mass > 0:
        x = x / x_mass
    y_mass = np.sum(y)
    if y_mass > 0:
        y = y / y_mass

    n = len(x)
    m = len(y)

    # Flow variable f[i, j] lives at flat index i * m + j (row-major), which
    # matches the flattened cost matrix.
    c = distance_mat[:n, :m].flatten()

    # Supply rows: sum_j f[i, j] == x[i]  -> a block of m ones per source bin.
    A_eq_supply = np.kron(np.eye(n), np.ones((1, m)))
    # Demand rows: sum_i f[i, j] == y[j]  -> an m x m identity per source bin.
    A_eq_demand = np.kron(np.ones((1, n)), np.eye(m))

    A_eq = np.vstack([A_eq_supply, A_eq_demand])
    b_eq = np.hstack([x, y])

    # Best effort by design: any solver error falls through to the fallback
    # below instead of aborting the whole metric computation.
    try:
        # A single (0, None) pair applies the non-negativity bound to every
        # flow variable.
        result = linprog(c, A_eq=A_eq, b_eq=b_eq, bounds=(0, None), method="highs")
    except Exception:
        result = None

    if result is not None and result.success:
        return result.fun

    # Fallback: 1-D Wasserstein distance over the bin indices. Note that this
    # ignores distance_mat, so any custom scaling of the ground distance is
    # not reflected in the fallback value.
    return wasserstein_distance(np.arange(n), np.arange(m), x, y)
|
66
|
+
|
67
|
+
|
68
|
+
def emd(x, y, distance_scaling=1.0):
    """Earth mover's distance between two 1-D histograms.

    The shorter histogram is zero-padded to the length of the longer one, and
    the ground distance between bins ``i`` and ``j`` is
    ``|i - j| / distance_scaling``.

    Args:
        x, y: 1-D numpy arrays of histogram values.
        distance_scaling: Divisor applied to the bin-index distance.

    Returns:
        The value computed by ``_compute_emd_with_distance_matrix``.
    """
    n_bins = max(len(x), len(y))
    # toeplitz(range(n)) has |i - j| at position [i, j].
    ground_dist = toeplitz(range(n_bins)).astype(float) / distance_scaling

    x = x.astype(float)
    y = y.astype(float)
    # Zero-pad whichever histogram is shorter so both cover n_bins bins.
    if len(x) < len(y):
        x = np.concatenate((x, np.zeros(n_bins - len(x))))
    elif len(y) < len(x):
        y = np.concatenate((y, np.zeros(n_bins - len(y))))

    return _compute_emd_with_distance_matrix(x, y, ground_dist)
|
83
|
+
|
84
|
+
|
85
|
+
def l2(x, y):
    """Return the Euclidean (2-norm) distance between ``x`` and ``y``."""
    difference = x - y
    return np.linalg.norm(difference, ord=2)
|
88
|
+
|
89
|
+
|
90
|
+
def emd_with_sigma(x, y, sigma=1.0, distance_scaling=1.0):
    """Absolute earth mover's distance between two 1-D histograms.

    The shorter histogram is zero-padded to the length of the longer one, and
    the ground distance between bins ``i`` and ``j`` is
    ``|i - j| / distance_scaling``.

    Args:
        x, y: 1D pmf of two distributions with the same support
        sigma: accepted so the signature matches the Gaussian kernels, but
            not used by this function.
        distance_scaling: Divisor applied to the bin-index distance.

    Returns:
        ``abs()`` of the value computed by
        ``_compute_emd_with_distance_matrix``.
    """
    n_bins = max(len(x), len(y))
    # toeplitz(range(n)) has |i - j| at position [i, j].
    ground_dist = toeplitz(range(n_bins)).astype(float) / distance_scaling

    x = x.astype(float)
    y = y.astype(float)
    # Zero-pad whichever histogram is shorter so both cover n_bins bins.
    if len(x) < len(y):
        x = np.concatenate((x, np.zeros(n_bins - len(x))))
    elif len(y) < len(x):
        y = np.concatenate((y, np.zeros(n_bins - len(y))))

    transport_cost = _compute_emd_with_distance_matrix(x, y, ground_dist)
    return np.abs(transport_cost)
|
109
|
+
|
110
|
+
|
111
|
+
def gaussian_emd(x, y, sigma=1.0, distance_scaling=1.0):
    """Gaussian kernel whose squared distance term is the squared EMD.

    The shorter histogram is zero-padded to the length of the longer one, and
    the ground distance between bins ``i`` and ``j`` is
    ``|i - j| / distance_scaling``.

    Args:
        x, y: 1D pmf of two distributions with the same support
        sigma: standard deviation
        distance_scaling: Divisor applied to the bin-index distance.

    Returns:
        ``exp(-emd**2 / (2 * sigma**2))``.
    """
    n_bins = max(len(x), len(y))
    # toeplitz(range(n)) has |i - j| at position [i, j].
    ground_dist = toeplitz(range(n_bins)).astype(float) / distance_scaling

    x = x.astype(float)
    y = y.astype(float)
    # Zero-pad whichever histogram is shorter so both cover n_bins bins.
    if len(x) < len(y):
        x = np.concatenate((x, np.zeros(n_bins - len(x))))
    elif len(y) < len(x):
        y = np.concatenate((y, np.zeros(n_bins - len(y))))

    transport_cost = _compute_emd_with_distance_matrix(x, y, ground_dist)
    exponent = -transport_cost * transport_cost / (2 * sigma * sigma)
    return np.exp(exponent)
|
131
|
+
|
132
|
+
|
133
|
+
def gaussian(x, y, sigma=1.0):
    """Gaussian (RBF) kernel on the Euclidean distance of two histograms.

    The shorter histogram is zero-padded to the length of the longer one.

    Args:
        x, y: 1-D numpy arrays of histogram values.
        sigma: standard deviation of the kernel.

    Returns:
        ``exp(-||x - y||_2**2 / (2 * sigma**2))``.
    """
    n_bins = max(len(x), len(y))

    x = x.astype(float)
    y = y.astype(float)
    # Zero-pad whichever histogram is shorter so both cover n_bins bins.
    if len(x) < len(y):
        x = np.concatenate((x, np.zeros(n_bins - len(x))))
    elif len(y) < len(x):
        y = np.concatenate((y, np.zeros(n_bins - len(y))))

    euclidean = np.linalg.norm(x - y, 2)
    exponent = -euclidean * euclidean / (2 * sigma * sigma)
    return np.exp(exponent)
|
145
|
+
|
146
|
+
|
147
|
+
def gaussian_tv(x, y, sigma=1.0):
    """Gaussian kernel on the total-variation distance of two histograms.

    The shorter histogram is zero-padded to the length of the longer one; the
    distance is half the sum of absolute bin differences.

    Args:
        x, y: 1-D numpy arrays of histogram values.
        sigma: standard deviation of the kernel.

    Returns:
        ``exp(-tv**2 / (2 * sigma**2))`` with ``tv = sum(|x - y|) / 2``.
    """
    n_bins = max(len(x), len(y))

    x = x.astype(float)
    y = y.astype(float)
    # Zero-pad whichever histogram is shorter so both cover n_bins bins.
    if len(x) < len(y):
        x = np.concatenate((x, np.zeros(n_bins - len(x))))
    elif len(y) < len(x):
        y = np.concatenate((y, np.zeros(n_bins - len(y))))

    total_variation = np.abs(x - y).sum() / 2.0
    exponent = -total_variation * total_variation / (2 * sigma * sigma)
    return np.exp(exponent)
|
159
|
+
|
160
|
+
|
161
|
+
def kernel_parallel_unpacked(x, samples2, kernel):
    """Sum ``kernel(x, s2)`` over every ``s2`` in ``samples2``.

    Returns 0 when ``samples2`` is empty.
    """
    return sum(kernel(x, other) for other in samples2)
|
166
|
+
|
167
|
+
|
168
|
+
def kernel_parallel_worker(t):
    """Call ``kernel_parallel_unpacked`` with a packed ``(x, samples2, kernel)``.

    Exists so that a single-argument callable can be handed to
    ``executor.map``.
    """
    x, samples2, kernel = t
    return kernel_parallel_unpacked(x, samples2, kernel)
|
170
|
+
|
171
|
+
|
172
|
+
def disc(samples1, samples2, kernel, is_parallel=True, *args, **kwargs):
    """Discrepancy between 2 samples.

    Computes the mean of ``kernel(s1, s2, *args, **kwargs)`` over all pairs
    ``(s1, s2)`` drawn from ``samples1 x samples2``.

    Args:
        samples1, samples2: Sized iterables of samples.
        kernel: Callable taking two samples followed by any extra arguments.
        is_parallel: When true, the rows (one per ``s1``) are evaluated on a
            ``ThreadPoolExecutor``; otherwise a plain nested loop is used.
        *args, **kwargs: Forwarded to ``kernel`` after the two samples.

    Returns:
        The mean kernel value, or the sentinel ``1e6`` when either sample
        collection is empty.
    """

    def _row_total(s1):
        # Sum of the kernel between one s1 and every s2. Extra arguments go
        # AFTER (s1, s2) so the parallel path agrees with the serial one
        # (binding them with functools.partial would put them first).
        total = 0
        for s2 in samples2:
            total += kernel(s1, s2, *args, **kwargs)
        return total

    d = 0
    if not is_parallel:
        for s1 in samples1:
            d += _row_total(s1)
    else:
        with concurrent.futures.ThreadPoolExecutor() as executor:
            # executor.map yields results in input order, so the summation
            # order matches the serial path.
            for row_total in executor.map(_row_total, samples1):
                d += row_total

    n_pairs = len(samples1) * len(samples2)
    if n_pairs > 0:
        d /= n_pairs
    else:
        # No pairs to average over: return a large sentinel value.
        d = 1e6
    return d
|
192
|
+
|
193
|
+
|
194
|
+
def compute_mmd(samples1, samples2, kernel, is_hist=True, *args, **kwargs):
    """MMD between two samples.

    Evaluates ``disc(s1, s1) + disc(s2, s2) - 2 * disc(s1, s2)`` and returns
    its absolute value.

    Args:
        samples1, samples2: Sized iterables of samples (numpy arrays when
            ``is_hist`` is true).
        kernel: Kernel callable handed to ``disc``.
        is_hist: When true, every sample is first divided by
            ``sum(sample) + 1e-6`` to turn the histogram into a pmf.
        *args, **kwargs: Forwarded to ``disc``. Because ``disc`` declares
            ``is_parallel`` as its fourth positional parameter, the first
            extra positional argument binds to ``is_parallel`` rather than
            reaching the kernel; pass kernel options by keyword.

    Returns:
        The non-negative MMD estimate.
    """
    # normalize histograms into pmf (the 1e-6 guards against empty histograms)
    if is_hist:
        samples1 = [s1 / (np.sum(s1) + 1e-6) for s1 in samples1]
        samples2 = [s2 / (np.sum(s2) + 1e-6) for s2 in samples2]

    mmd = (
        disc(samples1, samples1, kernel, *args, **kwargs)
        + disc(samples2, samples2, kernel, *args, **kwargs)
        - 2 * disc(samples1, samples2, kernel, *args, **kwargs)
    )

    # Report the magnitude so the returned value is never negative.
    return np.abs(mmd)
|
214
|
+
|
215
|
+
|
216
|
+
def compute_emd(samples1, samples2, kernel, is_hist=True, *args, **kwargs):
    """EMD between average of two samples.

    Args:
        samples1, samples2: Collections of samples.
        kernel: Kernel callable handed to ``disc``.
        is_hist: When true, each collection is replaced by a one-element list
            holding ``np.mean`` of the collection.
        *args, **kwargs: Forwarded to ``disc`` (the first extra positional
            argument binds to ``disc``'s ``is_parallel`` parameter).

    Returns:
        A tuple ``(discrepancy, [first1, first2])`` where ``first1`` and
        ``first2`` are the first elements of the (possibly averaged)
        collections.
    """
    if is_hist:
        # NOTE(review): np.mean is called without an axis, so each collection
        # collapses to a single scalar rather than a per-bin average
        # histogram - confirm this is intended.
        samples1 = [np.mean(samples1)]
        samples2 = [np.mean(samples2)]

    discrepancy = disc(samples1, samples2, kernel, *args, **kwargs)
    representatives = [samples1[0], samples2[0]]
    return discrepancy, representatives
|