synthetic_graph_benchmarks-0.1.0-py3-none-any.whl

This diff represents the content of a publicly available package version as released to one of the supported registries. It is provided for informational purposes only and reflects the package exactly as it appears in its public registry.
@@ -0,0 +1,23 @@
+from synthetic_graph_benchmarks.benchmarks import (
+    benchmark_sbm_results,
+    benchmark_planar_results,
+    benchmark_tree_results,
+)
+
+__version__ = "0.1.0"
+
+def main():
+    """Main entry point for the CLI."""
+    print("Synthetic Graph Benchmarks v" + __version__)
+    print("For usage examples, see: https://github.com/peteole/synthetic_graph_benchmarks")
+    print("Available benchmark functions:")
+    print(" - benchmark_sbm_results()")
+    print(" - benchmark_planar_results()")
+    print(" - benchmark_tree_results()")
+
+__all__ = [
+    "benchmark_sbm_results",
+    "benchmark_planar_results",
+    "benchmark_tree_results",
+    "__version__",
+]
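For orientation, here is a minimal usage sketch of the public API this package `__init__` exports. It is a sketch under assumptions: the package is installed from PyPI, networkx >= 3.2 is available for `random_labeled_tree`, and the random trees merely stand in for samples from a real generative model.

    # Quick-start sketch: score a toy "generator" that emits random trees.
    import networkx as nx
    from synthetic_graph_benchmarks import benchmark_tree_results, __version__

    print(__version__)  # "0.1.0"
    generated = [nx.random_labeled_tree(64) for _ in range(40)]
    results = benchmark_tree_results(generated)
    print(results["average_ratio"])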
@@ -0,0 +1,85 @@
+from typing import TypedDict
+from synthetic_graph_benchmarks.dataset import Dataset
+from synthetic_graph_benchmarks.spectre_utils import (
+    SBMSamplingMetrics,
+    PlanarSamplingMetrics,
+    TreeSamplingMetrics,
+)
+import networkx as nx
+
+class PlanarBenchmarkResults(TypedDict):
+    degree: float
+    wavelet: float
+    spectre: float
+    clustering: float
+    orbit: float
+    planar_acc: float
+    degree_ratio: float
+    clustering_ratio: float
+    orbit_ratio: float
+    spectre_ratio: float
+    wavelet_ratio: float
+    average_ratio: float
+
+class SBMBenchmarkResults(TypedDict):
+    degree: float
+    wavelet: float
+    spectre: float
+    clustering: float
+    orbit: float
+    sbm_acc: float
+    sampling_frac_unique: float
+    sampling_frac_unique_non_iso: float
+    sampling_frac_unic_non_iso_valid: float
+    sampling_frac_non_iso: float
+    degree_ratio: float
+    clustering_ratio: float
+    orbit_ratio: float
+    spectre_ratio: float
+    wavelet_ratio: float
+    average_ratio: float
+
+class TreeBenchmarkResults(TypedDict):
+    degree: float
+    wavelet: float
+    spectre: float
+    clustering: float
+    orbit: float
+    planar_acc: float
+    sampling_frac_unique: float
+    sampling_frac_unique_non_iso: float
+    sampling_frac_unic_non_iso_valid: float
+    sampling_frac_non_iso: float
+    degree_ratio: float
+    spectre_ratio: float
+    wavelet_ratio: float
+    average_ratio: float
+
+
+
+def benchmark_sbm_results(generated_graphs: list[nx.Graph]) -> SBMBenchmarkResults:
+    """Benchmark the results of generated graphs against the SBM dataset."""
+    ds = Dataset.load_sbm()
+    metrics = SBMSamplingMetrics(ds)
+    test_metrics = metrics.forward(ds.train_graphs, test=True)
+    return metrics.forward(
+        generated_graphs, ref_metrics={"test": test_metrics}, test=True
+    ) # type: ignore
+
+def benchmark_planar_results(generated_graphs: list[nx.Graph]) -> PlanarBenchmarkResults:
+    """Benchmark the results of generated graphs against the Planar dataset."""
+    ds = Dataset.load_planar()
+    metrics = PlanarSamplingMetrics(ds)
+    test_metrics = metrics.forward(ds.train_graphs, test=True)
+    return metrics.forward(
+        generated_graphs, ref_metrics={"test": test_metrics}, test=True
+    ) # type: ignore
+
+def benchmark_tree_results(generated_graphs: list[nx.Graph]) -> TreeBenchmarkResults:
+    """Benchmark the results of generated graphs against the Tree dataset."""
+    ds = Dataset.load_tree()
+    metrics = TreeSamplingMetrics(ds)
+    test_metrics = metrics.forward(ds.train_graphs, test=True)
+    return metrics.forward(
+        generated_graphs, ref_metrics={"test": test_metrics}, test=True
+    ) # type: ignore
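All three wrappers follow the same pattern: load the reference dataset, compute metrics on its training graphs once, and pass those as `ref_metrics` when scoring the generated graphs. A hedged sketch of consuming the returned TypedDict (key names per the definitions above; the grid graphs are placeholders chosen only because they are planar, not real model output):

    import networkx as nx
    from synthetic_graph_benchmarks.benchmarks import benchmark_planar_results

    # Placeholder samples; a real evaluation would use graphs drawn from a model.
    samples = [nx.grid_2d_graph(8, 8) for _ in range(20)]
    res = benchmark_planar_results(samples)
    for key in ("degree", "orbit", "planar_acc", "average_ratio"):
        print(key, res[key])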
@@ -0,0 +1,47 @@
+
+from dataclasses import dataclass
+
+import networkx
+import pickle
+
+from synthetic_graph_benchmarks.utils import download_file
+
+
+@dataclass
+class Dataset:
+    """A simple dataset class to hold train, validation, and test graphs."""
+    train_graphs: list[networkx.Graph]
+    val_graphs: list[networkx.Graph]
+    test_graphs: list[networkx.Graph] | None = None
+
+
+    @classmethod
+    def load_from_pickle_url(cls, url: str):
+        """
+        Load a dataset from a pickle file available at the given URL.
+
+        Args:
+            url (str): The URL of the pickle file containing the dataset.
+
+        Returns:
+            Dataset: An instance of the Dataset class with loaded graphs.
+        """
+        res = download_file(url, "data")
+        with open(res, "rb") as f:
+            data = pickle.load(f)
+        return cls(
+            train_graphs=data['train'],
+            val_graphs=data['val'],
+            test_graphs=data.get('test', None)
+        )
+    @classmethod
+    def load_sbm(cls):
+        return cls.load_from_pickle_url("https://raw.githubusercontent.com/AndreasBergmeister/graph-generation/main/data/sbm.pkl")
+
+    @classmethod
+    def load_planar(cls):
+        return cls.load_from_pickle_url("https://raw.githubusercontent.com/AndreasBergmeister/graph-generation/main/data/planar.pkl")
+
+    @classmethod
+    def load_tree(cls):
+        return cls.load_from_pickle_url("https://raw.githubusercontent.com/AndreasBergmeister/graph-generation/main/data/tree.pkl")
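The loaders can also be used standalone, e.g. to inspect the reference splits before benchmarking. A minimal sketch, assuming only that `download_file` caches the pickle under a local `data` directory as the code above suggests; note that `test_graphs` may be None for pickles without a test split:

    from synthetic_graph_benchmarks.dataset import Dataset

    ds = Dataset.load_planar()
    print(len(ds.train_graphs), len(ds.val_graphs))
    if ds.test_graphs is not None:
        print(ds.test_graphs[0].number_of_nodes())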
@@ -0,0 +1,222 @@
+###############################################################################
+#
+# Adapted from https://github.com/lrjconan/GRAN/ which in turn is adapted from https://github.com/JiaxuanYou/graph-generation
+#
+###############################################################################
+import numpy as np
+import concurrent.futures
+from functools import partial
+from scipy.linalg import toeplitz
+from scipy.stats import wasserstein_distance
+from scipy.optimize import linprog
+
+
+def _compute_emd_with_distance_matrix(x, y, distance_mat):
+    """
+    Compute EMD using linear programming when a custom distance matrix is provided.
+    This is equivalent to pyemd.emd but implemented using scipy.
+    """
+    x = x.astype(float)
+    y = y.astype(float)
+
+    # Ensure distributions are normalized
+    if np.sum(x) > 0:
+        x = x / np.sum(x)
+    if np.sum(y) > 0:
+        y = y / np.sum(y)
+
+    n = len(x)
+    m = len(y)
+
+    # Create the cost vector (flattened distance matrix)
+    c = distance_mat[:n, :m].flatten()
+
+    # Create equality constraints for supply (sum over columns = x[i])
+    A_eq_supply = np.zeros((n, n * m))
+    for i in range(n):
+        for j in range(m):
+            A_eq_supply[i, i * m + j] = 1
+    b_eq_supply = x
+
+    # Create equality constraints for demand (sum over rows = y[j])
+    A_eq_demand = np.zeros((m, n * m))
+    for j in range(m):
+        for i in range(n):
+            A_eq_demand[j, i * m + j] = 1
+    b_eq_demand = y
+
+    # Combine constraints
+    A_eq = np.vstack([A_eq_supply, A_eq_demand])
+    b_eq = np.hstack([b_eq_supply, b_eq_demand])
+
+    # Bounds: all variables >= 0
+    bounds = [(0, None) for _ in range(n * m)]
+
+    # Solve the linear program
+    try:
+        result = linprog(c, A_eq=A_eq, b_eq=b_eq, bounds=bounds, method="highs")
+        if result.success:
+            return result.fun
+        else:
+            # Fallback to simpler Wasserstein distance
+            return wasserstein_distance(np.arange(n), np.arange(m), x, y)
+    except Exception:
+        # Fallback to simpler Wasserstein distance
+        return wasserstein_distance(np.arange(n), np.arange(m), x, y)
+
+
+def emd(x, y, distance_scaling=1.0):
+    support_size = max(len(x), len(y))
+    d_mat = toeplitz(range(support_size)).astype(float)
+    distance_mat = d_mat / distance_scaling
+
+    # convert histogram values x and y to float, and make them equal len
+    x = x.astype(float)
+    y = y.astype(float)
+    if len(x) < len(y):
+        x = np.hstack((x, [0.0] * (support_size - len(x))))
+    elif len(y) < len(x):
+        y = np.hstack((y, [0.0] * (support_size - len(y))))
+
+    emd_result = _compute_emd_with_distance_matrix(x, y, distance_mat)
+    return emd_result
+
+
+def l2(x, y):
+    dist = np.linalg.norm(x - y, 2)
+    return dist
+
+
+def emd_with_sigma(x, y, sigma=1.0, distance_scaling=1.0):
+    """EMD
+    Args:
+        x, y: 1D pmf of two distributions with the same support
+        sigma: standard deviation
+    """
+    support_size = max(len(x), len(y))
+    d_mat = toeplitz(range(support_size)).astype(float)
+    distance_mat = d_mat / distance_scaling
+
+    # convert histogram values x and y to float, and make them equal len
+    x = x.astype(float)
+    y = y.astype(float)
+    if len(x) < len(y):
+        x = np.hstack((x, [0.0] * (support_size - len(x))))
+    elif len(y) < len(x):
+        y = np.hstack((y, [0.0] * (support_size - len(y))))
+
+    return np.abs(_compute_emd_with_distance_matrix(x, y, distance_mat))
+
+
+def gaussian_emd(x, y, sigma=1.0, distance_scaling=1.0):
+    """Gaussian kernel with squared distance in exponential term replaced by EMD
+    Args:
+        x, y: 1D pmf of two distributions with the same support
+        sigma: standard deviation
+    """
+    support_size = max(len(x), len(y))
+    d_mat = toeplitz(range(support_size)).astype(float)
+    distance_mat = d_mat / distance_scaling
+
+    # convert histogram values x and y to float, and make them equal len
+    x = x.astype(float)
+    y = y.astype(float)
+    if len(x) < len(y):
+        x = np.hstack((x, [0.0] * (support_size - len(x))))
+    elif len(y) < len(x):
+        y = np.hstack((y, [0.0] * (support_size - len(y))))
+
+    emd_result = _compute_emd_with_distance_matrix(x, y, distance_mat)
+    return np.exp(-emd_result * emd_result / (2 * sigma * sigma))
+
+
+def gaussian(x, y, sigma=1.0):
+    support_size = max(len(x), len(y))
+    # convert histogram values x and y to float, and make them equal len
+    x = x.astype(float)
+    y = y.astype(float)
+    if len(x) < len(y):
+        x = np.hstack((x, [0.0] * (support_size - len(x))))
+    elif len(y) < len(x):
+        y = np.hstack((y, [0.0] * (support_size - len(y))))
+
+    dist = np.linalg.norm(x - y, 2)
+    return np.exp(-dist * dist / (2 * sigma * sigma))
+
+
+def gaussian_tv(x, y, sigma=1.0):
+    support_size = max(len(x), len(y))
+    # convert histogram values x and y to float, and make them equal len
+    x = x.astype(float)
+    y = y.astype(float)
+    if len(x) < len(y):
+        x = np.hstack((x, [0.0] * (support_size - len(x))))
+    elif len(y) < len(x):
+        y = np.hstack((y, [0.0] * (support_size - len(y))))
+
+    dist = np.abs(x - y).sum() / 2.0
+    return np.exp(-dist * dist / (2 * sigma * sigma))
+
+
+def kernel_parallel_unpacked(x, samples2, kernel):
+    d = 0
+    for s2 in samples2:
+        d += kernel(x, s2)
+    return d
+
+
+def kernel_parallel_worker(t):
+    return kernel_parallel_unpacked(*t)
+
+
+def disc(samples1, samples2, kernel, is_parallel=True, *args, **kwargs):
+    """Discrepancy between 2 samples"""
+    d = 0
+
+    if not is_parallel:
+        for s1 in samples1:
+            for s2 in samples2:
+                d += kernel(s1, s2, *args, **kwargs)
+    else:
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            for dist in executor.map(
+                kernel_parallel_worker,
+                [(s1, samples2, partial(kernel, *args, **kwargs)) for s1 in samples1],
+            ):
+                d += dist
+    if len(samples1) * len(samples2) > 0:
+        d /= len(samples1) * len(samples2)
+    else:
+        d = 1e6
+    return d
+
+
+def compute_mmd(samples1, samples2, kernel, is_hist=True, *args, **kwargs):
+    """MMD between two samples"""
+    # normalize histograms into pmf
+    if is_hist:
+        samples1 = [s1 / (np.sum(s1) + 1e-6) for s1 in samples1]
+        samples2 = [s2 / (np.sum(s2) + 1e-6) for s2 in samples2]
+    mmd = (
+        disc(samples1, samples1, kernel, *args, **kwargs)
+        + disc(samples2, samples2, kernel, *args, **kwargs)
+        - 2 * disc(samples1, samples2, kernel, *args, **kwargs)
+    )
+
+    mmd = np.abs(mmd)
+
+    if mmd < 0:
+        import pdb
+
+        pdb.set_trace()
+
+    return mmd
+
+
+def compute_emd(samples1, samples2, kernel, is_hist=True, *args, **kwargs):
+    """EMD between average of two samples"""
+    # normalize histograms into pmf
+    if is_hist:
+        samples1 = [np.mean(samples1)]
+        samples2 = [np.mean(samples2)]
+    return disc(samples1, samples2, kernel, *args, **kwargs), [samples1[0], samples2[0]]
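To see how these helpers compose: the sampling metrics compare per-graph descriptor histograms (degree, clustering, orbit counts) by plugging a kernel such as `gaussian_tv` into `compute_mmd`. A self-contained sketch follows; the module path `dist_helper` and the histogram construction are assumptions for illustration, not necessarily the package's exact pipeline:

    import networkx as nx
    import numpy as np
    # Module name assumed; import from wherever this file lives in the package.
    from synthetic_graph_benchmarks.dist_helper import compute_mmd, gaussian_tv

    def degree_hist(g: nx.Graph) -> np.ndarray:
        # Histogram of node degrees; the kernels zero-pad to a common support.
        return np.bincount([d for _, d in g.degree()])

    ref = [degree_hist(nx.erdos_renyi_graph(30, 0.20)) for _ in range(10)]
    gen = [degree_hist(nx.erdos_renyi_graph(30, 0.25)) for _ in range(10)]
    print(compute_mmd(ref, gen, kernel=gaussian_tv, is_hist=True))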