topologicpy 0.4.8__py3-none-any.whl → 0.4.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- topologicpy/Aperture.py +46 -0
- topologicpy/Cell.py +1780 -0
- topologicpy/CellComplex.py +791 -0
- topologicpy/Cluster.py +591 -0
- topologicpy/Color.py +157 -0
- topologicpy/Context.py +56 -0
- topologicpy/DGL.py +2661 -0
- topologicpy/Dictionary.py +470 -0
- topologicpy/Edge.py +855 -0
- topologicpy/EnergyModel.py +1052 -0
- topologicpy/Face.py +1810 -0
- topologicpy/Graph.py +3526 -0
- topologicpy/Graph_Export.py +858 -0
- topologicpy/Grid.py +338 -0
- topologicpy/Helper.py +182 -0
- topologicpy/Honeybee.py +424 -0
- topologicpy/Matrix.py +255 -0
- topologicpy/Neo4jGraph.py +311 -0
- topologicpy/Plotly.py +1396 -0
- topologicpy/Polyskel.py +524 -0
- topologicpy/Process.py +1368 -0
- topologicpy/SQL.py +48 -0
- topologicpy/Shell.py +1418 -0
- topologicpy/Speckle.py +433 -0
- topologicpy/Topology.py +5854 -0
- topologicpy/UnitTest.py +29 -0
- topologicpy/Vector.py +555 -0
- topologicpy/Vertex.py +714 -0
- topologicpy/Wire.py +2346 -0
- topologicpy/__init__.py +20 -0
- topologicpy/bin/linux/topologic/__init__.py +2 -0
- topologicpy/bin/linux/topologic/topologic.cpython-310-x86_64-linux-gnu.so +0 -0
- topologicpy/bin/linux/topologic/topologic.cpython-311-x86_64-linux-gnu.so +0 -0
- topologicpy/bin/linux/topologic/topologic.cpython-38-x86_64-linux-gnu.so +0 -0
- topologicpy/bin/linux/topologic/topologic.cpython-39-x86_64-linux-gnu.so +0 -0
- topologicpy/bin/linux/topologic.libs/libTKBO-6bdf205d.so.7.7.0 +0 -0
- topologicpy/bin/linux/topologic.libs/libTKBRep-2960a069.so.7.7.0 +0 -0
- topologicpy/bin/linux/topologic.libs/libTKBool-c44b74bd.so.7.7.0 +0 -0
- topologicpy/bin/linux/topologic.libs/libTKFillet-9a670ba0.so.7.7.0 +0 -0
- topologicpy/bin/linux/topologic.libs/libTKG2d-8f31849e.so.7.7.0 +0 -0
- topologicpy/bin/linux/topologic.libs/libTKG3d-4c6bce57.so.7.7.0 +0 -0
- topologicpy/bin/linux/topologic.libs/libTKGeomAlgo-26066fd9.so.7.7.0 +0 -0
- topologicpy/bin/linux/topologic.libs/libTKGeomBase-2116cabe.so.7.7.0 +0 -0
- topologicpy/bin/linux/topologic.libs/libTKMath-72572fa8.so.7.7.0 +0 -0
- topologicpy/bin/linux/topologic.libs/libTKMesh-2a060427.so.7.7.0 +0 -0
- topologicpy/bin/linux/topologic.libs/libTKOffset-6cab68ff.so.7.7.0 +0 -0
- topologicpy/bin/linux/topologic.libs/libTKPrim-eb1262b3.so.7.7.0 +0 -0
- topologicpy/bin/linux/topologic.libs/libTKShHealing-e67e5cc7.so.7.7.0 +0 -0
- topologicpy/bin/linux/topologic.libs/libTKTopAlgo-e4c96c33.so.7.7.0 +0 -0
- topologicpy/bin/linux/topologic.libs/libTKernel-fb7fe3b7.so.7.7.0 +0 -0
- topologicpy/bin/linux/topologic.libs/libgcc_s-32c1665e.so.1 +0 -0
- topologicpy/bin/linux/topologic.libs/libstdc++-672d7b41.so.6.0.30 +0 -0
- topologicpy/bin/windows/topologic/TKBO-f6b191de.dll +0 -0
- topologicpy/bin/windows/topologic/TKBRep-e56a600e.dll +0 -0
- topologicpy/bin/windows/topologic/TKBool-7b8d47ae.dll +0 -0
- topologicpy/bin/windows/topologic/TKFillet-0ddbf0a8.dll +0 -0
- topologicpy/bin/windows/topologic/TKG2d-2e2dee3d.dll +0 -0
- topologicpy/bin/windows/topologic/TKG3d-6674513d.dll +0 -0
- topologicpy/bin/windows/topologic/TKGeomAlgo-d240e370.dll +0 -0
- topologicpy/bin/windows/topologic/TKGeomBase-df87aba5.dll +0 -0
- topologicpy/bin/windows/topologic/TKMath-45bd625a.dll +0 -0
- topologicpy/bin/windows/topologic/TKMesh-d6e826b1.dll +0 -0
- topologicpy/bin/windows/topologic/TKOffset-79b9cc94.dll +0 -0
- topologicpy/bin/windows/topologic/TKPrim-aa430a86.dll +0 -0
- topologicpy/bin/windows/topologic/TKShHealing-bb48be89.dll +0 -0
- topologicpy/bin/windows/topologic/TKTopAlgo-7d0d1e22.dll +0 -0
- topologicpy/bin/windows/topologic/TKernel-08c8cfbb.dll +0 -0
- topologicpy/bin/windows/topologic/__init__.py +2 -0
- topologicpy/bin/windows/topologic/topologic.cp310-win_amd64.pyd +0 -0
- topologicpy/bin/windows/topologic/topologic.cp311-win_amd64.pyd +0 -0
- topologicpy/bin/windows/topologic/topologic.cp38-win_amd64.pyd +0 -0
- topologicpy/bin/windows/topologic/topologic.cp39-win_amd64.pyd +0 -0
- {topologicpy-0.4.8.dist-info → topologicpy-0.4.9.dist-info}/METADATA +1 -1
- topologicpy-0.4.9.dist-info/RECORD +77 -0
- topologicpy-0.4.9.dist-info/top_level.txt +1 -0
- topologicpy-0.4.8.dist-info/RECORD +0 -5
- topologicpy-0.4.8.dist-info/top_level.txt +0 -1
- {topologicpy-0.4.8.dist-info → topologicpy-0.4.9.dist-info}/LICENSE +0 -0
- {topologicpy-0.4.8.dist-info → topologicpy-0.4.9.dist-info}/WHEEL +0 -0
topologicpy/DGL.py
ADDED
@@ -0,0 +1,2661 @@

import topologicpy
import topologic
from topologicpy.Dictionary import Dictionary
import os
import random
import time
from datetime import datetime
import copy
import sys
import subprocess

try:
    import numpy as np
except:
    call = [sys.executable, '-m', 'pip', 'install', 'numpy', '-t', sys.path[0]]
    subprocess.run(call)
    try:
        import numpy as np
    except:
        print("DGL - Error: Could not import numpy.")
try:
    import pandas as pd
except:
    call = [sys.executable, '-m', 'pip', 'install', 'pandas', '-t', sys.path[0]]
    subprocess.run(call)
    try:
        import pandas as pd
    except:
        print("DGL - Error: Could not import pandas")
try:
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    from torch.utils.data.sampler import SubsetRandomSampler
    from torch.utils.data import DataLoader, ConcatDataset
except:
    call = [sys.executable, '-m', 'pip', 'install', 'torch', '-t', sys.path[0]]
    subprocess.run(call)
    try:
        import torch
        import torch.nn as nn
        import torch.nn.functional as F
        from torch.utils.data.sampler import SubsetRandomSampler
        from torch.utils.data import DataLoader, ConcatDataset
    except:
        print("DGL - Error: Could not import torch")
try:
    import dgl
    from dgl.data import DGLDataset
    from dgl.dataloading import GraphDataLoader
    from dgl.nn import GINConv, GraphConv, SAGEConv, TAGConv
    from dgl import save_graphs, load_graphs
except:
    call = [sys.executable, '-m', 'pip', 'install', 'dgl', 'dglgo', '-f', 'https://data.dgl.ai/wheels/repo.html', '--upgrade', '-t', sys.path[0]]
    subprocess.run(call)
    try:
        import dgl
        from dgl.data import DGLDataset
        from dgl.nn import GraphConv
        from dgl import save_graphs, load_graphs
    except:
        print("DGL - Error: Could not import dgl")
try:
    import sklearn
    from sklearn.model_selection import KFold
    from sklearn.metrics import accuracy_score
except:
    call = [sys.executable, '-m', 'pip', 'install', 'scikit-learn', '-t', sys.path[0]]
    subprocess.run(call)
    try:
        import sklearn
        from sklearn.model_selection import KFold
        from sklearn.metrics import accuracy_score
    except:
        print("DGL - Error: Could not import sklearn")
try:
    from tqdm.auto import tqdm
except:
    call = [sys.executable, '-m', 'pip', 'install', 'tqdm', '-t', sys.path[0]]
    subprocess.run(call)
    try:
        from tqdm.auto import tqdm
    except:
        print("DGL - Error: Could not import tqdm")
class _Dataset(DGLDataset):
    def __init__(self, graphs, labels, node_attr_key):
        super().__init__(name='GraphDGL')
        self.graphs = graphs
        self.labels = torch.LongTensor(labels)
        self.node_attr_key = node_attr_key
        # as all graphs have same length of node features then we get dim_nfeats from first graph in the list
        self.dim_nfeats = graphs[0].ndata[node_attr_key].shape[1]
        # to get the number of classes for graphs
        self.gclasses = len(set(labels))

    def __getitem__(self, i):
        return self.graphs[i], self.labels[i]

    def __len__(self):
        return len(self.graphs)
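For orientation only (this note and sketch are not part of the packaged file): _Dataset expects a list of DGL graphs that all store a float node-feature tensor under one shared ndata key, plus one integer label per graph. A minimal illustration of that input shape, with made-up graph data and key names:

    # Illustrative only, not part of DGL.py. Assumes dgl and torch are installed.
    import dgl
    import torch

    node_attr_key = "feat"                        # the key _Dataset reads from ndata
    g = dgl.graph(([0, 1], [1, 2]), num_nodes=3)  # toy 3-node graph
    g.ndata[node_attr_key] = torch.rand(3, 4)     # 3 nodes, 4 features per node

    # _Dataset derives its feature dimension the same way:
    dim_nfeats = g.ndata[node_attr_key].shape[1]  # -> 4
    print(dim_nfeats)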
class _Hparams:
    def __init__(self, model_type="ClassifierHoldout", optimizer_str="Adam", amsgrad=False, betas=(0.9, 0.999), eps=1e-6, lr=0.001, lr_decay= 0, maximize=False, rho=0.9, weight_decay=0, cv_type="Holdout", split=[0.8,0.1, 0.1], k_folds=5, hl_widths=[32], conv_layer_type='SAGEConv', pooling="AvgPooling", batch_size=32, epochs=1,
                 use_gpu=False, loss_function="Cross Entropy"):
        """
        Parameters
        ----------
        cv_type : str
            A string to define the method of cross-validation
            "Holdout": Holdout
            "K-Fold": K-Fold cross validation
        k_folds : int
            An int value in the range of 2 to X to define the number of k-folds for cross-validation. Default is 5.
        split : list
            A list of three items in the range of 0 to 1 to define the split of train,
            validate, and test data. The default value of [0.8,0.1,0.1] means 80% of the data will be
            used for training, 10% will be used for validation, and the remaining 10% will be used for testing.
        hl_widths : list
            List of hidden neurons for each layer, such that [32] means
            there is one hidden layer in the network with 32 neurons.
        optimizer : torch.optim object
            This will be the selected optimizer from the torch.optim package. By
            default, torch.optim.Adam is selected.
        learning_rate : float
            A step value used by the optimizer to apply the gradients.
        batch_size : int
            Defines the set of samples to be used for training and testing in
            each step of an epoch.
        epochs : int
            An epoch means training the neural network with all the training data for one cycle. In an epoch, we use all of the data exactly once. A forward pass and a backward pass together are counted as one pass.
        use_GPU : bool
            If set to True, use the GPU. Otherwise, use the CPU.

        Returns
        -------
        None

        """

        self.model_type = model_type
        self.optimizer_str = optimizer_str
        self.amsgrad = amsgrad
        self.betas = betas
        self.eps = eps
        self.lr = lr
        self.lr_decay = lr_decay
        self.maximize = maximize
        self.rho = rho
        self.weight_decay = weight_decay
        self.cv_type = cv_type
        self.split = split
        self.k_folds = k_folds
        self.hl_widths = hl_widths
        self.conv_layer_type = conv_layer_type
        self.pooling = pooling
        self.batch_size = batch_size
        self.epochs = epochs
        self.use_gpu = use_gpu
        self.loss_function = loss_function
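As a reading aid (not part of the packaged file): the class above is a plain value object, so configuring a run is just a matter of overriding the keyword arguments of interest. A minimal sketch using only parameters documented in the docstring; the chosen values are illustrative, not recommendations:

    # Illustrative only, not part of DGL.py.
    # A holdout classification run: two hidden layers, Adam with a smaller
    # learning rate, and a 70/15/15 train/validate/test split.
    hparams = _Hparams(model_type="ClassifierHoldout",
                       optimizer_str="Adam",
                       lr=0.0005,
                       cv_type="Holdout",
                       split=[0.7, 0.15, 0.15],
                       hl_widths=[64, 32],
                       conv_layer_type="GraphConv",
                       pooling="AvgPooling",
                       batch_size=16,
                       epochs=20,
                       loss_function="Cross Entropy")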
class _Classic(nn.Module):
    def __init__(self, in_feats, h_feats, num_classes):
        """

        Parameters
        ----------
        in_feats : int
            Input dimension in the form of integer
        h_feats : list
            List of hidden neurons for each hidden layer
        num_classes : int
            Number of output classes

        Returns
        -------
        None.

        """
        super(_Classic, self).__init__()
        assert isinstance(h_feats, list), "h_feats must be a list"
        h_feats = [x for x in h_feats if x is not None]
        assert len(h_feats) !=0, "h_feats is empty. unable to add hidden layers"
        self.list_of_layers = nn.ModuleList()
        dim = [in_feats] + h_feats
        for i in range(1, len(dim)):
            self.list_of_layers.append(GraphConv(dim[i-1], dim[i]))
        self.final = GraphConv(dim[-1], num_classes)

    def forward(self, g, in_feat):
        h = in_feat
        for i in range(len(self.list_of_layers)):
            h = self.list_of_layers[i](g, h)
            h = F.relu(h)
        h = self.final(g, h)
        g.ndata['h'] = h
        return dgl.mean_nodes(g, 'h')

class _ClassicReg(nn.Module):
    def __init__(self, in_feats, h_feats):
        super(_ClassicReg, self).__init__()
        assert isinstance(h_feats, list), "h_feats must be a list"
        h_feats = [x for x in h_feats if x is not None]
        assert len(h_feats) !=0, "h_feats is empty. unable to add hidden layers"
        self.list_of_layers = nn.ModuleList()
        dim = [in_feats] + h_feats
        for i in range(1, len(dim)):
            self.list_of_layers.append(GraphConv(dim[i-1], dim[i]))
        self.final = nn.Linear(dim[-1], 1)

    def forward(self, g, in_feat):
        h = in_feat
        for i in range(len(self.list_of_layers)):
            h = self.list_of_layers[i](g, h)
            h = F.relu(h)
        h = self.final(h)
        g.ndata['h'] = h
        return dgl.mean_nodes(g, 'h')
class _GINConv(nn.Module):
    def __init__(self, in_feats, h_feats, num_classes, pooling):
        super(_GINConv, self).__init__()
        assert isinstance(h_feats, list), "h_feats must be a list"
        h_feats = [x for x in h_feats if x is not None]
        assert len(h_feats) !=0, "h_feats is empty. unable to add hidden layers"
        self.list_of_layers = nn.ModuleList()
        dim = [in_feats] + h_feats

        # Convolution (Hidden) Layers
        for i in range(1, len(dim)):
            lin = nn.Linear(dim[i-1], dim[i])
            self.list_of_layers.append(GINConv(lin, 'sum'))

        # Final Layer
        self.final = nn.Linear(dim[-1], num_classes)

        # Pooling layer
        if pooling.lower() == "avgpooling":
            self.pooling_layer = dgl.nn.AvgPooling()
        elif pooling.lower() == "maxpooling":
            self.pooling_layer = dgl.nn.MaxPooling()
        elif pooling.lower() == "sumpooling":
            self.pooling_layer = dgl.nn.SumPooling()
        else:
            raise NotImplementedError

    def forward(self, g, in_feat):
        h = in_feat
        # Generate node features
        for i in range(len(self.list_of_layers)): # Aim for 2 about 3 layers
            h = self.list_of_layers[i](g, h)
            h = F.relu(h)
        # h will now be matrix of dimension num_nodes by h_feats[-1]
        h = self.final(h)
        g.ndata['h'] = h
        # Go from node level features to graph level features by pooling
        h = self.pooling_layer(g, h)
        # h will now be vector of dimension num_classes
        return h

class _GraphConv(nn.Module):
    def __init__(self, in_feats, h_feats, num_classes, pooling):
        super(_GraphConv, self).__init__()
        assert isinstance(h_feats, list), "h_feats must be a list"
        h_feats = [x for x in h_feats if x is not None]
        assert len(h_feats) !=0, "h_feats is empty. unable to add hidden layers"
        self.list_of_layers = nn.ModuleList()
        dim = [in_feats] + h_feats

        # Convolution (Hidden) Layers
        for i in range(1, len(dim)):
            self.list_of_layers.append(GraphConv(dim[i-1], dim[i]))

        # Final Layer
        # Followed example at: https://docs.dgl.ai/tutorials/blitz/5_graph_classification.html#sphx-glr-tutorials-blitz-5-graph-classification-py
        self.final = GraphConv(dim[-1], num_classes)

        # Pooling layer
        if pooling.lower() == "avgpooling":
            self.pooling_layer = dgl.nn.AvgPooling()
        elif pooling.lower() == "maxpooling":
            self.pooling_layer = dgl.nn.MaxPooling()
        elif pooling.lower() == "sumpooling":
            self.pooling_layer = dgl.nn.SumPooling()
        else:
            raise NotImplementedError

    def forward(self, g, in_feat):
        h = in_feat
        # Generate node features
        for i in range(len(self.list_of_layers)): # Aim for 2 about 3 layers
            h = self.list_of_layers[i](g, h)
            h = F.relu(h)
        # h will now be matrix of dimension num_nodes by h_feats[-1]
        h = self.final(g,h)
        g.ndata['h'] = h
        # Go from node level features to graph level features by pooling
        h = self.pooling_layer(g, h)
        # h will now be vector of dimension num_classes
        return h

class _SAGEConv(nn.Module):
    def __init__(self, in_feats, h_feats, num_classes, pooling):
        super(_SAGEConv, self).__init__()
        assert isinstance(h_feats, list), "h_feats must be a list"
        h_feats = [x for x in h_feats if x is not None]
        assert len(h_feats) !=0, "h_feats is empty. unable to add hidden layers"
        self.list_of_layers = nn.ModuleList()
        dim = [in_feats] + h_feats

        # Convolution (Hidden) Layers
        for i in range(1, len(dim)):
            self.list_of_layers.append(SAGEConv(dim[i-1], dim[i], aggregator_type='pool'))

        # Final Layer
        self.final = nn.Linear(dim[-1], num_classes)

        # Pooling layer
        if pooling.lower() == "avgpooling":
            self.pooling_layer = dgl.nn.AvgPooling()
        elif pooling.lower() == "maxpooling":
            self.pooling_layer = dgl.nn.MaxPooling()
        elif pooling.lower() == "sumpooling":
            self.pooling_layer = dgl.nn.SumPooling()
        else:
            raise NotImplementedError

    def forward(self, g, in_feat):
        h = in_feat
        # Generate node features
        for i in range(len(self.list_of_layers)): # Aim for 2 about 3 layers
            h = self.list_of_layers[i](g, h)
            h = F.relu(h)
        # h will now be matrix of dimension num_nodes by h_feats[-1]
        h = self.final(h)
        g.ndata['h'] = h
        # Go from node level features to graph level features by pooling
        h = self.pooling_layer(g, h)
        # h will now be vector of dimension num_classes
        return h

class _TAGConv(nn.Module):
    def __init__(self, in_feats, h_feats, num_classes, pooling):
        super(_TAGConv, self).__init__()
        assert isinstance(h_feats, list), "h_feats must be a list"
        h_feats = [x for x in h_feats if x is not None]
        assert len(h_feats) !=0, "h_feats is empty. unable to add hidden layers"
        self.list_of_layers = nn.ModuleList()
        dim = [in_feats] + h_feats

        # Convolution (Hidden) Layers
        for i in range(1, len(dim)):
            self.list_of_layers.append(TAGConv(dim[i-1], dim[i], k=2))

        # Final Layer
        self.final = nn.Linear(dim[-1], num_classes)

        # Pooling layer
        if pooling.lower() == "avgpooling":
            self.pooling_layer = dgl.nn.AvgPooling()
        elif pooling.lower() == "maxpooling":
            self.pooling_layer = dgl.nn.MaxPooling()
        elif pooling.lower() == "sumpooling":
            self.pooling_layer = dgl.nn.SumPooling()
        else:
            raise NotImplementedError

    def forward(self, g, in_feat):
        h = in_feat
        # Generate node features
        for i in range(len(self.list_of_layers)): # Aim for 2 about 3 layers
            h = self.list_of_layers[i](g, h)
            h = F.relu(h)
        # h will now be matrix of dimension num_nodes by h_feats[-1]
        h = self.final(h)
        g.ndata['h'] = h
        # Go from node level features to graph level features by pooling
        h = self.pooling_layer(g, h)
        # h will now be vector of dimension num_classes
        return h


class _GraphConvReg(nn.Module):
    def __init__(self, in_feats, h_feats, pooling):
        super(_GraphConvReg, self).__init__()
        assert isinstance(h_feats, list), "h_feats must be a list"
        h_feats = [x for x in h_feats if x is not None]
        assert len(h_feats) !=0, "h_feats is empty. unable to add hidden layers"
        self.list_of_layers = nn.ModuleList()
        dim = [in_feats] + h_feats

        # Convolution (Hidden) Layers
        for i in range(1, len(dim)):
            self.list_of_layers.append(GraphConv(dim[i-1], dim[i]))

        # Final Layer
        self.final = nn.Linear(dim[-1], 1)

        # Pooling layer
        if pooling.lower() == "avgpooling":
            self.pooling_layer = dgl.nn.AvgPooling()
        elif pooling.lower() == "maxpooling":
            self.pooling_layer = dgl.nn.MaxPooling()
        elif pooling.lower() == "sumpooling":
            self.pooling_layer = dgl.nn.SumPooling()
        else:
            raise NotImplementedError

    def forward(self, g, in_feat):
        h = in_feat
        # Generate node features
        for i in range(len(self.list_of_layers)): # Aim for 2 about 3 layers
            h = self.list_of_layers[i](g, h)
            h = F.relu(h)
        # h will now be matrix of dimension num_nodes by h_feats[-1]
        h = self.final(h)
        g.ndata['h'] = h
        # Go from node level features to graph level features by pooling
        h = self.pooling_layer(g, h)
        # h will now be vector of dimension num_classes
        return h
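For orientation (not part of the packaged file): each convolution wrapper above maps per-node features to one graph-level vector, so a forward pass takes a (possibly batched) DGL graph plus its node-feature matrix and returns one row per graph. A minimal sketch using _GraphConv; the toy graph and sizes are made up:

    # Illustrative only, not part of DGL.py. Assumes dgl and torch are installed.
    import dgl
    import torch

    g = dgl.graph(([0, 1, 2], [1, 2, 0]), num_nodes=3)  # 3-node cycle, no zero-in-degree nodes
    feats = torch.rand(3, 8)                            # 8 input features per node

    model = _GraphConv(in_feats=8, h_feats=[16, 16], num_classes=4, pooling="AvgPooling")
    logits = model(g, feats)                            # shape (1, 4): one logit row per graph
    print(logits.shape)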
class _RegressorHoldout:
    def __init__(self, hparams, trainingDataset, validationDataset=None, testingDataset=None):
        #device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        device = torch.device("cpu")
        self.trainingDataset = trainingDataset
        self.validationDataset = validationDataset
        self.testingDataset = testingDataset
        self.hparams = hparams
        if hparams.conv_layer_type.lower() == 'classic':
            self.model = _ClassicReg(trainingDataset.dim_nfeats, hparams.hl_widths).to(device)
        elif hparams.conv_layer_type.lower() == 'ginconv':
            self.model = _GINConv(trainingDataset.dim_nfeats, hparams.hl_widths,
                            1, hparams.pooling).to(device)
        elif hparams.conv_layer_type.lower() == 'graphconv':
            self.model = _GraphConvReg(trainingDataset.dim_nfeats, hparams.hl_widths, hparams.pooling).to(device)
        elif hparams.conv_layer_type.lower() == 'sageconv':
            self.model = _SAGEConv(trainingDataset.dim_nfeats, hparams.hl_widths,
                            1, hparams.pooling).to(device)
        elif hparams.conv_layer_type.lower() == 'tagconv':
            self.model = _TAGConv(trainingDataset.dim_nfeats, hparams.hl_widths,
                            1, hparams.pooling).to(device)
        elif hparams.conv_layer_type.lower() == 'gcn':
            self.model = _ClassicReg(trainingDataset.dim_nfeats, hparams.hl_widths).to(device)
        else:
            raise NotImplementedError

        if hparams.optimizer_str.lower() == "adadelta":
            self.optimizer = torch.optim.Adadelta(self.model.parameters(), eps=hparams.eps,
                            lr=hparams.lr, rho=hparams.rho, weight_decay=hparams.weight_decay)
        elif hparams.optimizer_str.lower() == "adagrad":
            self.optimizer = torch.optim.Adagrad(self.model.parameters(), eps=hparams.eps,
                            lr=hparams.lr, lr_decay=hparams.lr_decay, weight_decay=hparams.weight_decay)
        elif hparams.optimizer_str.lower() == "adam":
            self.optimizer = torch.optim.Adam(self.model.parameters(), amsgrad=hparams.amsgrad, betas=hparams.betas, eps=hparams.eps,
                            lr=hparams.lr, maximize=hparams.maximize, weight_decay=hparams.weight_decay)

        self.use_gpu = hparams.use_gpu
        self.training_loss_list = []
        self.validation_loss_list = []
        self.node_attr_key = trainingDataset.node_attr_key

        # train, validate, test split
        num_train = int(len(trainingDataset) * (hparams.split[0]))
        num_validate = int(len(trainingDataset) * (hparams.split[1]))
        num_test = len(trainingDataset) - num_train - num_validate
        idx = torch.randperm(len(trainingDataset))
        train_sampler = SubsetRandomSampler(idx[:num_train])
        validate_sampler = SubsetRandomSampler(idx[num_train:num_train+num_validate])
        test_sampler = SubsetRandomSampler(idx[num_train+num_validate:num_train+num_validate+num_test])

        if validationDataset:
            self.train_dataloader = GraphDataLoader(trainingDataset,
                                                    batch_size=hparams.batch_size,
                                                    drop_last=False)
            self.validate_dataloader = GraphDataLoader(validationDataset,
                                                    batch_size=hparams.batch_size,
                                                    drop_last=False)
        else:
            self.train_dataloader = GraphDataLoader(trainingDataset, sampler=train_sampler,
                                                    batch_size=hparams.batch_size,
                                                    drop_last=False)
            self.validate_dataloader = GraphDataLoader(trainingDataset, sampler=validate_sampler,
                                                    batch_size=hparams.batch_size,
                                                    drop_last=False)

        if testingDataset:
            self.test_dataloader = GraphDataLoader(testingDataset,
                                                   batch_size=len(testingDataset),
                                                   drop_last=False)
        else:
            self.test_dataloader = GraphDataLoader(trainingDataset, sampler=test_sampler,
                                                   batch_size=hparams.batch_size,
                                                   drop_last=False)

    def train(self):
        #device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        device = torch.device("cpu")
        # Init the loss and accuracy reporting lists
        self.training_loss_list = []
        self.validation_loss_list = []

        # Run the training loop for defined number of epochs
        for _ in tqdm(range(self.hparams.epochs), desc='Epochs', total=self.hparams.epochs, leave=False):
            # Iterate over the DataLoader for training data
            for batched_graph, labels in tqdm(self.train_dataloader, desc='Training', leave=False):
                # Make sure the model is in training mode
                self.model.train()
                # Zero the gradients
                self.optimizer.zero_grad()

                # Perform forward pass
                pred = self.model(batched_graph, batched_graph.ndata[self.node_attr_key].float()).to(device)
                # Compute loss
                loss = F.mse_loss(torch.flatten(pred), labels.float())

                # Perform backward pass
                loss.backward()

                # Perform optimization
                self.optimizer.step()

            self.training_loss_list.append(torch.sqrt(loss).item())
            self.validate()
            self.validation_loss_list.append(torch.sqrt(self.validation_loss).item())

    def validate(self):
        device = torch.device("cpu")
        self.model.eval()
        for batched_graph, labels in tqdm(self.validate_dataloader, desc='Validating', leave=False):
            pred = self.model(batched_graph, batched_graph.ndata[self.node_attr_key].float()).to(device)
            loss = F.mse_loss(torch.flatten(pred), labels.float())
        self.validation_loss = loss

    def test(self):
        #device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        device = torch.device("cpu")
        self.model.eval()
        for batched_graph, labels in tqdm(self.test_dataloader, desc='Testing', leave=False):
            pred = self.model(batched_graph, batched_graph.ndata[self.node_attr_key].float()).to(device)
            loss = F.mse_loss(torch.flatten(pred), labels.float())
        self.testing_loss = torch.sqrt(loss).item()

    def save(self, path):
        if path:
            # Make sure the file extension is .pt
            ext = path[len(path)-3:len(path)]
            if ext.lower() != ".pt":
                path = path+".pt"
            torch.save(self.model, path)
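A usage sketch of the holdout regressor above (not part of the packaged file): it expects a dataset object exposing dim_nfeats and node_attr_key and yielding (graph, label) pairs, trains with MSE loss, and records per-epoch RMSE. Everything below, including the stand-in dataset class, is illustrative and assumes dgl and torch are installed:

    # Illustrative only, not part of DGL.py.
    import dgl
    import torch

    def _toy_graph():
        g = dgl.graph(([0, 1, 2], [1, 2, 0]), num_nodes=3)  # cycle: no zero-in-degree nodes
        g.ndata["feat"] = torch.rand(3, 4)
        return g

    class _ToyDataset:
        """Minimal stand-in exposing what _RegressorHoldout reads from a dataset."""
        def __init__(self, graphs, labels, node_attr_key="feat"):
            self.graphs = graphs
            self.labels = torch.tensor(labels)
            self.node_attr_key = node_attr_key
            self.dim_nfeats = graphs[0].ndata[node_attr_key].shape[1]
            self.gclasses = len(set(labels))
        def __getitem__(self, i):
            return self.graphs[i], self.labels[i]
        def __len__(self):
            return len(self.graphs)

    dataset = _ToyDataset([_toy_graph() for _ in range(20)], [float(i % 5) for i in range(20)])
    hparams = _Hparams(model_type="RegressorHoldout", conv_layer_type="GraphConv", epochs=2)
    regressor = _RegressorHoldout(hparams, dataset)  # 80/10/10 split by default
    regressor.train()                                # per-epoch RMSE in training/validation_loss_list
    regressor.test()                                 # RMSE on the held-out split
    print(regressor.testing_loss)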
class _RegressorKFold:
    def __init__(self, hparams, trainingDataset, testingDataset=None):
        self.trainingDataset = trainingDataset
        self.testingDataset = testingDataset
        self.hparams = hparams
        self.losses = []
        self.min_loss = 0
        # at beginning of the script
        #device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        device = torch.device("cpu")
        if hparams.conv_layer_type.lower() == 'classic':
            self.model = _ClassicReg(trainingDataset.dim_nfeats, hparams.hl_widths).to(device)
        elif hparams.conv_layer_type.lower() == 'ginconv':
            self.model = _GINConv(trainingDataset.dim_nfeats, hparams.hl_widths,
                            1, hparams.pooling).to(device)
        elif hparams.conv_layer_type.lower() == 'graphconv':
            self.model = _GraphConvReg(trainingDataset.dim_nfeats, hparams.hl_widths, hparams.pooling).to(device)
        elif hparams.conv_layer_type.lower() == 'sageconv':
            self.model = _SAGEConv(trainingDataset.dim_nfeats, hparams.hl_widths,
                            1, hparams.pooling).to(device)
        elif hparams.conv_layer_type.lower() == 'tagconv':
            self.model = _TAGConv(trainingDataset.dim_nfeats, hparams.hl_widths,
                            1, hparams.pooling).to(device)
        elif hparams.conv_layer_type.lower() == 'gcn':
            self.model = _ClassicReg(trainingDataset.dim_nfeats, hparams.hl_widths).to(device)
        else:
            raise NotImplementedError

        if hparams.optimizer_str.lower() == "adadelta":
            self.optimizer = torch.optim.Adadelta(self.model.parameters(), eps=hparams.eps,
                            lr=hparams.lr, rho=hparams.rho, weight_decay=hparams.weight_decay)
        elif hparams.optimizer_str.lower() == "adagrad":
            self.optimizer = torch.optim.Adagrad(self.model.parameters(), eps=hparams.eps,
                            lr=hparams.lr, lr_decay=hparams.lr_decay, weight_decay=hparams.weight_decay)
        elif hparams.optimizer_str.lower() == "adam":
            self.optimizer = torch.optim.Adam(self.model.parameters(), amsgrad=hparams.amsgrad, betas=hparams.betas, eps=hparams.eps,
                            lr=hparams.lr, maximize=hparams.maximize, weight_decay=hparams.weight_decay)

        self.use_gpu = hparams.use_gpu
        self.training_loss_list = []
        self.validation_loss_list = []
        self.node_attr_key = trainingDataset.node_attr_key

        # train, validate, test split
        num_train = int(len(trainingDataset) * (hparams.split[0]))
        num_validate = int(len(trainingDataset) * (hparams.split[1]))
        num_test = len(trainingDataset) - num_train - num_validate
        idx = torch.randperm(len(trainingDataset))
        test_sampler = SubsetRandomSampler(idx[num_train+num_validate:num_train+num_validate+num_test])

        if testingDataset:
            self.test_dataloader = GraphDataLoader(testingDataset,
                                                   batch_size=len(testingDataset),
                                                   drop_last=False)
        else:
            self.test_dataloader = GraphDataLoader(trainingDataset, sampler=test_sampler,
                                                   batch_size=hparams.batch_size,
                                                   drop_last=False)

    def reset_weights(self):
        '''
        Try resetting model weights to avoid
        weight leakage.
        '''
        device = torch.device("cpu")
        if self.hparams.conv_layer_type.lower() == 'classic':
            self.model = _ClassicReg(self.trainingDataset.dim_nfeats, self.hparams.hl_widths).to(device)
        elif self.hparams.conv_layer_type.lower() == 'ginconv':
            self.model = _GINConv(self.trainingDataset.dim_nfeats, self.hparams.hl_widths,
                            1, self.hparams.pooling).to(device)
        elif self.hparams.conv_layer_type.lower() == 'graphconv':
            self.model = _GraphConvReg(self.trainingDataset.dim_nfeats, self.hparams.hl_widths, self.hparams.pooling).to(device)
        elif self.hparams.conv_layer_type.lower() == 'sageconv':
            self.model = _SAGEConv(self.trainingDataset.dim_nfeats, self.hparams.hl_widths,
                            1, self.hparams.pooling).to(device)
        elif self.hparams.conv_layer_type.lower() == 'tagconv':
            self.model = _TAGConv(self.trainingDataset.dim_nfeats, self.hparams.hl_widths,
                            1, self.hparams.pooling).to(device)
        elif self.hparams.conv_layer_type.lower() == 'gcn':
            self.model = _ClassicReg(self.trainingDataset.dim_nfeats, self.hparams.hl_widths).to(device)
        else:
            raise NotImplementedError

        if self.hparams.optimizer_str.lower() == "adadelta":
            self.optimizer = torch.optim.Adadelta(self.model.parameters(), eps=self.hparams.eps,
                            lr=self.hparams.lr, rho=self.hparams.rho, weight_decay=self.hparams.weight_decay)
        elif self.hparams.optimizer_str.lower() == "adagrad":
            self.optimizer = torch.optim.Adagrad(self.model.parameters(), eps=self.hparams.eps,
                            lr=self.hparams.lr, lr_decay=self.hparams.lr_decay, weight_decay=self.hparams.weight_decay)
        elif self.hparams.optimizer_str.lower() == "adam":
            self.optimizer = torch.optim.Adam(self.model.parameters(), amsgrad=self.hparams.amsgrad, betas=self.hparams.betas, eps=self.hparams.eps,
                            lr=self.hparams.lr, maximize=self.hparams.maximize, weight_decay=self.hparams.weight_decay)




    def train(self):
        device = torch.device("cpu")

        # The number of folds (This should come from the hparams)
        k_folds = self.hparams.k_folds

        # Init the loss and accuracy reporting lists
        self.training_loss_list = []
        self.validation_loss_list = []

        # Set fixed random number seed
        torch.manual_seed(42)

        # Define the K-fold Cross Validator
        kfold = KFold(n_splits=k_folds, shuffle=True)

        models = []
        weights = []
        losses = []
        train_dataloaders = []
        validate_dataloaders = []

        # K-fold Cross-validation model evaluation
        for fold, (train_ids, validate_ids) in tqdm(enumerate(kfold.split(self.trainingDataset)), desc="Fold", initial=1, total=k_folds, leave=False):
            epoch_training_loss_list = []
            epoch_validation_loss_list = []
            # Sample elements randomly from a given list of ids, no replacement.
            train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
            validate_subsampler = torch.utils.data.SubsetRandomSampler(validate_ids)

            # Define data loaders for training and testing data in this fold
            self.train_dataloader = GraphDataLoader(self.trainingDataset, sampler=train_subsampler,
                                                    batch_size=self.hparams.batch_size,
                                                    drop_last=False)
            self.validate_dataloader = GraphDataLoader(self.trainingDataset, sampler=validate_subsampler,
                                                    batch_size=self.hparams.batch_size,
                                                    drop_last=False)
            # Init the neural network
            self.reset_weights()

            # Run the training loop for defined number of epochs
            best_rmse = np.inf
            # Run the training loop for defined number of epochs
            for _ in tqdm(range(self.hparams.epochs), desc='Epochs', total=self.hparams.epochs, initial=1, leave=False):
                # Iterate over the DataLoader for training data
                for batched_graph, labels in tqdm(self.train_dataloader, desc='Training', leave=False):
                    # Make sure the model is in training mode
                    self.model.train()
                    # Zero the gradients
                    self.optimizer.zero_grad()

                    # Perform forward pass
                    pred = self.model(batched_graph, batched_graph.ndata[self.node_attr_key].float()).to(device)
                    # Compute loss
                    loss = F.mse_loss(torch.flatten(pred), labels.float())

                    # Perform backward pass
                    loss.backward()

                    # Perform optimization
                    self.optimizer.step()

                epoch_training_loss_list.append(torch.sqrt(loss).item())
                self.validate()
                epoch_validation_loss_list.append(torch.sqrt(self.validation_loss).item())

            models.append(self.model)
            weights.append(copy.deepcopy(self.model.state_dict()))
            losses.append(torch.sqrt(self.validation_loss).item())
            train_dataloaders.append(self.train_dataloader)
            validate_dataloaders.append(self.validate_dataloader)
            self.training_loss_list.append(epoch_training_loss_list)
            self.validation_loss_list.append(epoch_validation_loss_list)
        self.losses = losses
        min_loss = min(losses)
        self.min_loss = min_loss
        ind = losses.index(min_loss)
        self.model = models[ind]
        self.model.load_state_dict(weights[ind])
        self.model.eval()
        self.training_loss_list = self.training_loss_list[ind]
        self.validation_loss_list = self.validation_loss_list[ind]

    def validate(self):
        device = torch.device("cpu")
        self.model.eval()
        for batched_graph, labels in tqdm(self.validate_dataloader, desc='Validating', leave=False):
            pred = self.model(batched_graph, batched_graph.ndata[self.node_attr_key].float()).to(device)
            loss = F.mse_loss(torch.flatten(pred), labels.float())
        self.validation_loss = loss

    def test(self):
        #device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        device = torch.device("cpu")
        #self.model.eval()
        for batched_graph, labels in tqdm(self.test_dataloader, desc='Testing', leave=False):
            pred = self.model(batched_graph, batched_graph.ndata[self.node_attr_key].float()).to(device)
            loss = F.mse_loss(torch.flatten(pred), labels.float())
        self.testing_loss = torch.sqrt(loss).item()

    def save(self, path):
        if path:
            # Make sure the file extension is .pt
            ext = path[len(path)-3:len(path)]
            if ext.lower() != ".pt":
                path = path+".pt"
            torch.save(self.model, path)
class _ClassifierHoldout:
    def __init__(self, hparams, trainingDataset, validationDataset=None, testingDataset=None):
        #device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        device = torch.device("cpu")
        self.trainingDataset = trainingDataset
        self.validationDataset = validationDataset
        self.testingDataset = testingDataset
        self.hparams = hparams
        if hparams.conv_layer_type.lower() == 'classic':
            self.model = _Classic(trainingDataset.dim_nfeats, hparams.hl_widths,
                            trainingDataset.gclasses).to(device)
        elif hparams.conv_layer_type.lower() == 'ginconv':
            self.model = _GINConv(trainingDataset.dim_nfeats, hparams.hl_widths,
                            trainingDataset.gclasses, hparams.pooling).to(device)
        elif hparams.conv_layer_type.lower() == 'graphconv':
            self.model = _GraphConv(trainingDataset.dim_nfeats, hparams.hl_widths,
                            trainingDataset.gclasses, hparams.pooling).to(device)
        elif hparams.conv_layer_type.lower() == 'sageconv':
            self.model = _SAGEConv(trainingDataset.dim_nfeats, hparams.hl_widths,
                            trainingDataset.gclasses, hparams.pooling).to(device)
        elif hparams.conv_layer_type.lower() == 'tagconv':
            self.model = _TAGConv(trainingDataset.dim_nfeats, hparams.hl_widths,
                            trainingDataset.gclasses, hparams.pooling).to(device)
        elif hparams.conv_layer_type.lower() == 'gcn':
            self.model = _Classic(trainingDataset.dim_nfeats, hparams.hl_widths,
                            trainingDataset.gclasses).to(device)
        else:
            raise NotImplementedError

        if hparams.optimizer_str.lower() == "adadelta":
            self.optimizer = torch.optim.Adadelta(self.model.parameters(), eps=hparams.eps,
                            lr=hparams.lr, rho=hparams.rho, weight_decay=hparams.weight_decay)
        elif hparams.optimizer_str.lower() == "adagrad":
            self.optimizer = torch.optim.Adagrad(self.model.parameters(), eps=hparams.eps,
                            lr=hparams.lr, lr_decay=hparams.lr_decay, weight_decay=hparams.weight_decay)
        elif hparams.optimizer_str.lower() == "adam":
            self.optimizer = torch.optim.Adam(self.model.parameters(), amsgrad=hparams.amsgrad, betas=hparams.betas, eps=hparams.eps,
                            lr=hparams.lr, maximize=hparams.maximize, weight_decay=hparams.weight_decay)
        self.use_gpu = hparams.use_gpu
        self.training_loss_list = []
        self.validation_loss_list = []
        self.training_accuracy_list = []
        self.validation_accuracy_list = []
        self.node_attr_key = trainingDataset.node_attr_key

        # train, validate, test split
        num_train = int(len(trainingDataset) * (hparams.split[0]))
        num_validate = int(len(trainingDataset) * (hparams.split[1]))
        num_test = len(trainingDataset) - num_train - num_validate
        idx = torch.randperm(len(trainingDataset))
        train_sampler = SubsetRandomSampler(idx[:num_train])
        validate_sampler = SubsetRandomSampler(idx[num_train:num_train+num_validate])
        test_sampler = SubsetRandomSampler(idx[num_train+num_validate:num_train+num_validate+num_test])

        if validationDataset:
            self.train_dataloader = GraphDataLoader(trainingDataset,
                                                    batch_size=hparams.batch_size,
                                                    drop_last=False)
            self.validate_dataloader = GraphDataLoader(validationDataset,
                                                    batch_size=hparams.batch_size,
                                                    drop_last=False)
        else:
            self.train_dataloader = GraphDataLoader(trainingDataset, sampler=train_sampler,
                                                    batch_size=hparams.batch_size,
                                                    drop_last=False)
            self.validate_dataloader = GraphDataLoader(trainingDataset, sampler=validate_sampler,
                                                    batch_size=hparams.batch_size,
                                                    drop_last=False)

        if testingDataset:
            self.test_dataloader = GraphDataLoader(testingDataset,
                                                   batch_size=len(testingDataset),
                                                   drop_last=False)
        else:
            self.test_dataloader = GraphDataLoader(trainingDataset, sampler=test_sampler,
                                                   batch_size=hparams.batch_size,
                                                   drop_last=False)
    def train(self):
        #device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        device = torch.device("cpu")
        # Init the loss and accuracy reporting lists
        self.training_accuracy_list = []
        self.training_loss_list = []
        self.validation_accuracy_list = []
        self.validation_loss_list = []

        # Run the training loop for defined number of epochs
        for _ in tqdm(range(self.hparams.epochs), desc='Epochs', initial=1, leave=False):
            temp_loss_list = []
            temp_acc_list = []
            # Iterate over the DataLoader for training data
            for batched_graph, labels in tqdm(self.train_dataloader, desc='Training', leave=False):
                # Make sure the model is in training mode
                self.model.train()

                # Zero the gradients
                self.optimizer.zero_grad()

                # Perform forward pass
                pred = self.model(batched_graph, batched_graph.ndata[self.node_attr_key].float()).to(device)
                # Compute loss
                if self.hparams.loss_function.lower() == "negative log likelihood":
                    logp = F.log_softmax(pred, 1)
                    loss = F.nll_loss(logp, labels)
                elif self.hparams.loss_function.lower() == "cross entropy":
                    loss = F.cross_entropy(pred, labels)

                # Save loss information for reporting
                temp_loss_list.append(loss.item())
                temp_acc_list.append(accuracy_score(labels, pred.argmax(1)))

                # Perform backward pass
                loss.backward()

                # Perform optimization
                self.optimizer.step()

            self.training_accuracy_list.append(np.mean(temp_acc_list).item())
            self.training_loss_list.append(np.mean(temp_loss_list).item())
            self.validate()
            self.validation_accuracy_list.append(self.validation_accuracy)
            self.validation_loss_list.append(self.validation_loss)

    def validate(self):
        #device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        device = torch.device("cpu")
        temp_loss_list = []
        temp_acc_list = []
        self.model.eval()
        for batched_graph, labels in tqdm(self.validate_dataloader, desc='Validating', leave=False):
            pred = self.model(batched_graph, batched_graph.ndata[self.node_attr_key].float()).to(device)
            if self.hparams.loss_function.lower() == "negative log likelihood":
                logp = F.log_softmax(pred, 1)
                loss = F.nll_loss(logp, labels)
            elif self.hparams.loss_function.lower() == "cross entropy":
                loss = F.cross_entropy(pred, labels)
            temp_loss_list.append(loss.item())
            temp_acc_list.append(accuracy_score(labels, pred.argmax(1)))
        self.validation_accuracy = np.mean(temp_acc_list).item()
        self.validation_loss = np.mean(temp_loss_list).item()

    def test(self):
        if self.test_dataloader:
            temp_loss_list = []
            temp_acc_list = []
            self.model.eval()
            for batched_graph, labels in tqdm(self.test_dataloader, desc='Testing', leave=False):
                pred = self.model(batched_graph, batched_graph.ndata[self.node_attr_key].float())
                if self.hparams.loss_function.lower() == "negative log likelihood":
                    logp = F.log_softmax(pred, 1)
                    loss = F.nll_loss(logp, labels)
                elif self.hparams.loss_function.lower() == "cross entropy":
                    loss = F.cross_entropy(pred, labels)
                temp_loss_list.append(loss.item())
                temp_acc_list.append(accuracy_score(labels, pred.argmax(1)))
            self.testing_accuracy = np.mean(temp_acc_list).item()
            self.testing_loss = np.mean(temp_loss_list).item()

    def save(self, path):
        if path:
            # Make sure the file extension is .pt
            ext = path[len(path)-3:len(path)]
            if ext.lower() != ".pt":
                path = path+".pt"
            torch.save(self.model, path)
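Side note (not part of the packaged file): the two loss options used by the classifier classes above are numerically equivalent, since cross entropy is log-softmax followed by negative log likelihood. A small check of that identity:

    # Illustrative only, not part of DGL.py.
    import torch
    import torch.nn.functional as F

    pred = torch.randn(4, 3)               # logits for 4 graphs, 3 classes
    labels = torch.tensor([0, 2, 1, 2])
    ce = F.cross_entropy(pred, labels)
    nll = F.nll_loss(F.log_softmax(pred, 1), labels)
    print(torch.allclose(ce, nll))         # True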
class _ClassifierKFold:
    def __init__(self, hparams, trainingDataset, testingDataset=None):
        self.trainingDataset = trainingDataset
        self.testingDataset = testingDataset
        self.hparams = hparams
        self.testing_accuracy = 0
        self.accuracies = []
        self.max_accuracy = 0
        # at beginning of the script
        #device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        device = torch.device("cpu")
        if hparams.conv_layer_type.lower() == 'classic':
            self.model = _Classic(trainingDataset.dim_nfeats, hparams.hl_widths,
                            trainingDataset.gclasses).to(device)
        elif hparams.conv_layer_type.lower() == 'ginconv':
            self.model = _GINConv(trainingDataset.dim_nfeats, hparams.hl_widths,
                            trainingDataset.gclasses, hparams.pooling).to(device)
        elif hparams.conv_layer_type.lower() == 'graphconv':
            self.model = _GraphConv(trainingDataset.dim_nfeats, hparams.hl_widths,
                            trainingDataset.gclasses, hparams.pooling).to(device)
        elif hparams.conv_layer_type.lower() == 'sageconv':
            self.model = _SAGEConv(trainingDataset.dim_nfeats, hparams.hl_widths,
                            trainingDataset.gclasses, hparams.pooling).to(device)
        elif hparams.conv_layer_type.lower() == 'tagconv':
            self.model = _TAGConv(trainingDataset.dim_nfeats, hparams.hl_widths,
                            trainingDataset.gclasses, hparams.pooling).to(device)
        else:
            raise NotImplementedError

        if hparams.optimizer_str.lower() == "adadelta":
            self.optimizer = torch.optim.Adadelta(self.model.parameters(), eps=hparams.eps,
                            lr=hparams.lr, rho=hparams.rho, weight_decay=hparams.weight_decay)
        elif hparams.optimizer_str.lower() == "adagrad":
            self.optimizer = torch.optim.Adagrad(self.model.parameters(), eps=hparams.eps,
                            lr=hparams.lr, lr_decay=hparams.lr_decay, weight_decay=hparams.weight_decay)
        elif hparams.optimizer_str.lower() == "adam":
            self.optimizer = torch.optim.Adam(self.model.parameters(), amsgrad=hparams.amsgrad, betas=hparams.betas, eps=hparams.eps,
                            lr=hparams.lr, maximize=hparams.maximize, weight_decay=hparams.weight_decay)
        self.use_gpu = hparams.use_gpu
        self.training_loss_list = []
        self.validation_loss_list = []
        self.training_accuracy_list = []
        self.validation_accuracy_list = []
        self.node_attr_key = trainingDataset.node_attr_key


    def reset_weights(self):
        '''
        Try resetting model weights to avoid
        weight leakage.
        '''
        device = torch.device("cpu")
        if self.hparams.conv_layer_type.lower() == 'classic':
            self.model = _Classic(self.trainingDataset.dim_nfeats, self.hparams.hl_widths,
                            self.trainingDataset.gclasses).to(device)
        elif self.hparams.conv_layer_type.lower() == 'ginconv':
            self.model = _GINConv(self.trainingDataset.dim_nfeats, self.hparams.hl_widths,
                            self.trainingDataset.gclasses, self.hparams.pooling).to(device)
        elif self.hparams.conv_layer_type.lower() == 'graphconv':
            self.model = _GraphConv(self.trainingDataset.dim_nfeats, self.hparams.hl_widths,
                            self.trainingDataset.gclasses, self.hparams.pooling).to(device)
        elif self.hparams.conv_layer_type.lower() == 'sageconv':
            self.model = _SAGEConv(self.trainingDataset.dim_nfeats, self.hparams.hl_widths,
                            self.trainingDataset.gclasses, self.hparams.pooling).to(device)
        elif self.hparams.conv_layer_type.lower() == 'tagconv':
            self.model = _TAGConv(self.trainingDataset.dim_nfeats, self.hparams.hl_widths,
                            self.trainingDataset.gclasses, self.hparams.pooling).to(device)
        else:
            raise NotImplementedError
        if self.hparams.optimizer_str.lower() == "adadelta":
            self.optimizer = torch.optim.Adadelta(self.model.parameters(), eps=self.hparams.eps,
                            lr=self.hparams.lr, rho=self.hparams.rho, weight_decay=self.hparams.weight_decay)
        elif self.hparams.optimizer_str.lower() == "adagrad":
            self.optimizer = torch.optim.Adagrad(self.model.parameters(), eps=self.hparams.eps,
                            lr=self.hparams.lr, lr_decay=self.hparams.lr_decay, weight_decay=self.hparams.weight_decay)
        elif self.hparams.optimizer_str.lower() == "adam":
            self.optimizer = torch.optim.Adam(self.model.parameters(), amsgrad=self.hparams.amsgrad, betas=self.hparams.betas, eps=self.hparams.eps,
                            lr=self.hparams.lr, maximize=self.hparams.maximize, weight_decay=self.hparams.weight_decay)

    def train(self):
        # The number of folds (This should come from the hparams)
        k_folds = self.hparams.k_folds

        # Init the loss and accuracy reporting lists
        self.training_accuracy_list = []
        self.training_loss_list = []
        self.validation_accuracy_list = []
        self.validation_loss_list = []

        # Set fixed random number seed
        torch.manual_seed(42)

        # Define the K-fold Cross Validator
        kfold = KFold(n_splits=k_folds, shuffle=True)

        models = []
        weights = []
        accuracies = []
        train_dataloaders = []
        validate_dataloaders = []

        # K-fold Cross-validation model evaluation
        for fold, (train_ids, validate_ids) in tqdm(enumerate(kfold.split(self.trainingDataset)), desc="Fold", initial=1, total=k_folds, leave=False):
            epoch_training_loss_list = []
            epoch_training_accuracy_list = []
            epoch_validation_loss_list = []
            epoch_validation_accuracy_list = []
            # Sample elements randomly from a given list of ids, no replacement.
            train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
            validate_subsampler = torch.utils.data.SubsetRandomSampler(validate_ids)

            # Define data loaders for training and testing data in this fold
            self.train_dataloader = GraphDataLoader(self.trainingDataset, sampler=train_subsampler,
                                                    batch_size=self.hparams.batch_size,
                                                    drop_last=False)
            self.validate_dataloader = GraphDataLoader(self.trainingDataset, sampler=validate_subsampler,
                                                    batch_size=self.hparams.batch_size,
                                                    drop_last=False)
            # Init the neural network
            self.reset_weights()

            # Run the training loop for defined number of epochs
            for _ in tqdm(range(0,self.hparams.epochs), desc='Epochs', initial=1, total=self.hparams.epochs, leave=False):
                temp_loss_list = []
                temp_acc_list = []

                # Iterate over the DataLoader for training data
                for batched_graph, labels in tqdm(self.train_dataloader, desc='Training', leave=False):

                    # Make sure the model is in training mode
                    self.model.train()

                    # Zero the gradients
                    self.optimizer.zero_grad()

                    # Perform forward pass
                    pred = self.model(batched_graph, batched_graph.ndata[self.node_attr_key].float())

                    # Compute loss
                    if self.hparams.loss_function.lower() == "negative log likelihood":
                        logp = F.log_softmax(pred, 1)
                        loss = F.nll_loss(logp, labels)
                    elif self.hparams.loss_function.lower() == "cross entropy":
                        loss = F.cross_entropy(pred, labels)

                    # Save loss information for reporting
                    temp_loss_list.append(loss.item())
                    temp_acc_list.append(accuracy_score(labels, pred.argmax(1)))

                    # Perform backward pass
                    loss.backward()

                    # Perform optimization
                    self.optimizer.step()

                epoch_training_accuracy_list.append(np.mean(temp_acc_list).item())
                epoch_training_loss_list.append(np.mean(temp_loss_list).item())
                self.validate()
                epoch_validation_accuracy_list.append(self.validation_accuracy)
                epoch_validation_loss_list.append(self.validation_loss)
            models.append(self.model)
            weights.append(copy.deepcopy(self.model.state_dict()))
            accuracies.append(self.validation_accuracy)
            train_dataloaders.append(self.train_dataloader)
            validate_dataloaders.append(self.validate_dataloader)
            self.training_accuracy_list.append(epoch_training_accuracy_list)
            self.training_loss_list.append(epoch_training_loss_list)
            self.validation_accuracy_list.append(epoch_validation_accuracy_list)
            self.validation_loss_list.append(epoch_validation_loss_list)
        self.accuracies = accuracies
        max_accuracy = max(accuracies)
        self.max_accuracy = max_accuracy
        ind = accuracies.index(max_accuracy)
        self.model = models[ind]
        self.model.load_state_dict(weights[ind])
        self.model.eval()
        self.training_accuracy_list = self.training_accuracy_list[ind]
        self.training_loss_list = self.training_loss_list[ind]
        self.validation_accuracy_list = self.validation_accuracy_list[ind]
        self.validation_loss_list = self.validation_loss_list[ind]

    def validate(self):
        temp_loss_list = []
        temp_acc_list = []
        self.model.eval()
        for batched_graph, labels in tqdm(self.validate_dataloader, desc='Validating', leave=False):
            pred = self.model(batched_graph, batched_graph.ndata[self.node_attr_key].float())
            if self.hparams.loss_function.lower() == "negative log likelihood":
                logp = F.log_softmax(pred, 1)
                loss = F.nll_loss(logp, labels)
            elif self.hparams.loss_function.lower() == "cross entropy":
                loss = F.cross_entropy(pred, labels)
            temp_loss_list.append(loss.item())
            temp_acc_list.append(accuracy_score(labels, pred.argmax(1)))
        self.validation_accuracy = np.mean(temp_acc_list).item()
        self.validation_loss = np.mean(temp_loss_list).item()

    def test(self):
        if self.testingDataset:
            self.test_dataloader = GraphDataLoader(self.testingDataset,
                                                   batch_size=len(self.testingDataset),
                                                   drop_last=False)
            temp_loss_list = []
            temp_acc_list = []
            self.model.eval()
            for batched_graph, labels in tqdm(self.test_dataloader, desc='Testing', leave=False):
                pred = self.model(batched_graph, batched_graph.ndata[self.node_attr_key].float())
                if self.hparams.loss_function.lower() == "negative log likelihood":
                    logp = F.log_softmax(pred, 1)
|
1137
|
+
loss = F.nll_loss(logp, labels)
|
|
1138
|
+
elif self.hparams.loss_function.lower() == "cross entropy":
|
|
1139
|
+
loss = F.cross_entropy(pred, labels)
|
|
1140
|
+
temp_loss_list.append(loss.item())
|
|
1141
|
+
temp_acc_list.append(accuracy_score(labels, pred.argmax(1)))
|
|
1142
|
+
self.testing_accuracy = np.mean(temp_acc_list).item()
|
|
1143
|
+
self.testing_loss = np.mean(temp_loss_list).item()
|
|
1144
|
+
|
|
1145
|
+
def save(self, path):
|
|
1146
|
+
if path:
|
|
1147
|
+
# Make sure the file extension is .pt
|
|
1148
|
+
ext = path[len(path)-3:len(path)]
|
|
1149
|
+
if ext.lower() != ".pt":
|
|
1150
|
+
path = path+".pt"
|
|
1151
|
+
torch.save(self.model, path)
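# The whole model object is serialized above, so it can be reloaded later with torch.load
# (as DGL.ModelByFilePath does further down). A minimal sketch, with a hypothetical file name:
# model = torch.load("classifier.pt")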
|
|
1152
|
+
|
|
1153
|
+
class DGL:
|
|
1154
|
+
@staticmethod
|
|
1155
|
+
def Accuracy(actual, predicted, mantissa=4):
|
|
1156
|
+
"""
|
|
1157
|
+
Computes the accuracy of the input predictions based on the input actual values. This is to be used only with classification, not with regression.
|
|
1158
|
+
|
|
1159
|
+
Parameters
|
|
1160
|
+
----------
|
|
1161
|
+
actual : list
|
|
1162
|
+
The input list of actual values.
|
|
1163
|
+
predicted : list
|
|
1164
|
+
The input list of predicted values.
|
|
1165
|
+
mantissa : int , optional
|
|
1166
|
+
The desired length of the mantissa. The default is 4.
|
|
1167
|
+
|
|
1168
|
+
Returns
|
|
1169
|
+
-------
|
|
1170
|
+
dict
|
|
1171
|
+
A dictionary returning the accuracy information. This contains the following keys and values:
|
|
1172
|
+
- "accuracy" (float): The number of correct predictions divided by the length of the list.
|
|
1173
|
+
- "correct" (int): The number of correct predictions
|
|
1174
|
+
- "mask" (list): A boolean mask for correct vs. wrong predictions which can be used to filter the list of predictions
|
|
1175
|
+
- "size" (int): The size of the predictions list
|
|
1176
|
+
- "wrong" (int): The number of wrong predictions
|
|
1177
|
+
|
|
1178
|
+
"""
|
|
1179
|
+
if len(predicted) < 1 or len(actual) < 1 or len(predicted) != len(actual):
|
|
1180
|
+
return None
|
|
1181
|
+
correct = 0
|
|
1182
|
+
mask = []
|
|
1183
|
+
for i in range(len(predicted)):
|
|
1184
|
+
if predicted[i] == actual[i]:
|
|
1185
|
+
correct = correct + 1
|
|
1186
|
+
mask.append(True)
|
|
1187
|
+
else:
|
|
1188
|
+
mask.append(False)
|
|
1189
|
+
size = len(predicted)
|
|
1190
|
+
wrong = len(predicted)- correct
|
|
1191
|
+
accuracy = round(float(correct) / float(len(predicted)), mantissa)
|
|
1192
|
+
return {"accuracy":accuracy, "correct":correct, "mask":mask, "size":size, "wrong":wrong}
|
|
1193
|
+
|
|
1194
|
+
@staticmethod
|
|
1195
|
+
def RMSE(actual, predicted, mantissa=4):
|
|
1196
|
+
"""
|
|
1197
|
+
Computes the root mean squared error (RMSE) of the input predictions with respect to the input actual values. This is to be used only with regression, not with classification.
|
|
1198
|
+
|
|
1199
|
+
Parameters
|
|
1200
|
+
----------
|
|
1201
|
+
actual : list
|
|
1202
|
+
The input list of actual values.
|
|
1203
|
+
predicted : list
|
|
1204
|
+
The input list of predicted values.
|
|
1205
|
+
mantissa : int , optional
|
|
1206
|
+
The desired length of the mantissa. The default is 4.
|
|
1207
|
+
|
|
1208
|
+
Returns
|
|
1209
|
+
-------
|
|
1210
|
+
dict
|
|
1211
|
+
A dictionary returning the accuracy information. This contains the following keys and values:
|
|
1212
|
+
- "rmse" (float): Root Mean Square Error.
|
|
1213
|
+
- "size" (int): The size of the predictions list
|
|
1214
|
+
"""
|
|
1215
|
+
if len(predicted) < 1 or len(actual) < 1 or len(predicted) != len(actual):
|
|
1216
|
+
return None
|
|
1217
|
+
size = len(predicted)
|
|
1218
|
+
mse = F.mse_loss(torch.tensor(predicted), torch.tensor(actual))
|
|
1219
|
+
rmse = round(torch.sqrt(mse).item(), mantissa)
|
|
1220
|
+
return {"rmse":rmse, "size":size}
|
|
1221
|
+
|
|
1222
|
+
@staticmethod
|
|
1223
|
+
def BalanceDataset(dataset, labels, method="undersampling", key="node_attr"):
|
|
1224
|
+
"""
|
|
1225
|
+
Balances the input dataset using the specified method.
|
|
1226
|
+
|
|
1227
|
+
Parameters
|
|
1228
|
+
----------
|
|
1229
|
+
dataset : DGLDataset
|
|
1230
|
+
The input dataset.
|
|
1231
|
+
labels : list
|
|
1232
|
+
The input list of labels.
|
|
1233
|
+
method : str, optional
|
|
1234
|
+
The method of sampling. This can be "undersampling" or "oversampling". It is case insensitive. The default is "undersampling".
|
|
1235
|
+
key : str
|
|
1236
|
+
The key used for the node attributes.
|
|
1237
|
+
|
|
1238
|
+
Returns
|
|
1239
|
+
-------
|
|
1240
|
+
DGLDataset
|
|
1241
|
+
The balanced dataset.
|
|
1242
|
+
|
|
1243
|
+
"""
|
|
1244
|
+
df = pd.DataFrame({'graph_index': range(len(labels)), 'label': labels})
|
|
1245
|
+
|
|
1246
|
+
if method.lower() == 'undersampling':
|
|
1247
|
+
min_distribution = df['label'].value_counts().min()
|
|
1248
|
+
df = df.groupby('label').sample(n=min_distribution)
|
|
1249
|
+
elif method.lower() == 'oversampling':
|
|
1250
|
+
max_distribution = df['label'].value_counts().max()
|
|
1251
|
+
df = df.groupby('label').sample(n=max_distribution, replace=True)
|
|
1252
|
+
else:
|
|
1253
|
+
raise NotImplementedError
|
|
1254
|
+
|
|
1255
|
+
list_idx = df['graph_index'].tolist()
|
|
1256
|
+
graphs = []
|
|
1257
|
+
labels = []
|
|
1258
|
+
for index in list_idx:
|
|
1259
|
+
graph, label = dataset[index]
|
|
1260
|
+
graphs.append(graph)
|
|
1261
|
+
labels.append(label)
|
|
1262
|
+
return DGL.DatasetByGraphs({'graphs': graphs, 'labels': labels}, key=key)
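# A usage sketch, assuming a labelled graph dataset: undersampling keeps as many graphs per class
# as the rarest class, while oversampling resamples (with replacement) up to the most common class.
# balanced_ds = DGL.BalanceDataset(dataset, labels, method="undersampling", key="node_attr")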
|
|
1263
|
+
|
|
1264
|
+
@staticmethod
|
|
1265
|
+
def GraphByTopologicGraph(topologicGraph, bidirectional=True, key=None, categories=[], node_attr_key="node_attr", tolerance=0.0001):
|
|
1266
|
+
"""
|
|
1267
|
+
Returns a DGL graph by the input topologic graph.
|
|
1268
|
+
|
|
1269
|
+
Parameters
|
|
1270
|
+
----------
|
|
1271
|
+
topologicGraph : topologic.Graph
|
|
1272
|
+
The input topologic graph.
|
|
1273
|
+
bidirectional : bool , optional
|
|
1274
|
+
If set to True, the output DGL graph is forced to be bidirectional. The default is True.
|
|
1275
|
+
key : str
|
|
1276
|
+
The dictionary key where the node label is stored.
|
|
1277
|
+
categories : list
|
|
1278
|
+
The list of categories of node features.
|
|
1279
|
+
node_attr_key : str
|
|
1280
|
+
The dictionary key of the node attributes.
|
|
1281
|
+
tolerance : float , optional
|
|
1282
|
+
The desired tolerance. The default is 0.0001.
|
|
1283
|
+
|
|
1284
|
+
Returns
|
|
1285
|
+
-------
|
|
1286
|
+
DGL Graph
|
|
1287
|
+
The created DGL graph.
|
|
1288
|
+
|
|
1289
|
+
"""
|
|
1290
|
+
from topologicpy.Vertex import Vertex
|
|
1291
|
+
from topologicpy.Graph import Graph
|
|
1292
|
+
from topologicpy.Dictionary import Dictionary
|
|
1293
|
+
from topologicpy.Topology import Topology
|
|
1294
|
+
|
|
1295
|
+
graph_dict = {}
|
|
1296
|
+
vertices = Graph.Vertices(topologicGraph)
|
|
1297
|
+
edges = Graph.Edges(topologicGraph)
|
|
1298
|
+
graph_dict["num_nodes"] = len(vertices)
|
|
1299
|
+
graph_dict["src"] = []
|
|
1300
|
+
graph_dict["dst"] = []
|
|
1301
|
+
graph_dict["node_labels"] = {}
|
|
1302
|
+
graph_dict["node_features"] = []
|
|
1303
|
+
nodes = []
|
|
1304
|
+
graph_edges = []
|
|
1305
|
+
|
|
1306
|
+
for i in range(len(vertices)):
|
|
1307
|
+
vDict = Topology.Dictionary(vertices[i])
|
|
1308
|
+
if key:
|
|
1309
|
+
vLabel = Dictionary.ValueAtKey(vDict, key)
|
|
1310
|
+
else:
|
|
1311
|
+
vLabel = ""
|
|
1312
|
+
graph_dict["node_labels"][i] = vLabel
|
|
1313
|
+
# appending tensor of onehotencoded feature for each node following index i
|
|
1314
|
+
graph_dict["node_features"].append(torch.tensor(DGL.OneHotEncode(vLabel, categories)))
|
|
1315
|
+
nodes.append(i)
|
|
1316
|
+
|
|
1317
|
+
for i in range(len(edges)):
|
|
1318
|
+
e = edges[i]
|
|
1319
|
+
sv = e.StartVertex()
|
|
1320
|
+
ev = e.EndVertex()
|
|
1321
|
+
sn = nodes[Vertex.Index(vertex=sv, vertices=vertices, strict=False, tolerance=tolerance)]
|
|
1322
|
+
en = nodes[Vertex.Index(vertex=ev, vertices=vertices, strict=False, tolerance=tolerance)]
|
|
1323
|
+
if ([sn, en] not in graph_edges) and ([en, sn] not in graph_edges):
|
|
1324
|
+
graph_edges.append([sn,en])
|
|
1325
|
+
|
|
1326
|
+
for anEdge in graph_edges:
|
|
1327
|
+
graph_dict["src"].append(anEdge[0])
|
|
1328
|
+
graph_dict["dst"].append(anEdge[1])
|
|
1329
|
+
|
|
1330
|
+
# Create DGL graph
|
|
1331
|
+
src = np.array(graph_dict["src"])
|
|
1332
|
+
dst = np.array(graph_dict["dst"])
|
|
1333
|
+
num_nodes = graph_dict["num_nodes"]
|
|
1334
|
+
# Create a graph
|
|
1335
|
+
dgl_graph = dgl.graph((src, dst), num_nodes=num_nodes)
|
|
1336
|
+
|
|
1337
|
+
# Setting the node features as node_attr_key using onehotencoding of vlabel
|
|
1338
|
+
dgl_graph.ndata[node_attr_key] = torch.stack(graph_dict["node_features"])
|
|
1339
|
+
|
|
1340
|
+
if bidirectional:
|
|
1341
|
+
dgl_graph = dgl.add_reverse_edges(dgl_graph)
|
|
1342
|
+
return dgl_graph
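# A usage sketch; the dictionary key and categories below are hypothetical. The vertices of the
# topologic graph are assumed to carry a "type" entry whose one-hot encoding becomes the node features.
# dgl_graph = DGL.GraphByTopologicGraph(topologicGraph, key="type",
#                                       categories=["room", "door", "window"])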
|
|
1343
|
+
|
|
1344
|
+
@staticmethod
|
|
1345
|
+
def GraphsByImportedCSV(graphs_file_path, edges_file_path,
|
|
1346
|
+
nodes_file_path, graph_id_header="graph_id",
|
|
1347
|
+
graph_label_header="label", num_nodes_header="num_nodes", src_header="src",
|
|
1348
|
+
dst_header="dst", node_label_header="label", node_attr_key="node_attr",
|
|
1349
|
+
categories=[], bidirectional=True):
|
|
1350
|
+
"""
|
|
1351
|
+
Returns DGL graphs according to the input CSV file paths.
|
|
1352
|
+
|
|
1353
|
+
Parameters
|
|
1354
|
+
----------
|
|
1355
|
+
graphs_file_path : str
|
|
1356
|
+
The file path to the graphs CSV file.
|
|
1357
|
+
edges_file_path : str
|
|
1358
|
+
The file path to the edges CSV file.
|
|
1359
|
+
nodes_file_path : str
|
|
1360
|
+
The file path to the nodes CSV file.
|
|
1361
|
+
graph_id_header : str , optional
|
|
1362
|
+
The header string used to specify the graph id. The default is "graph_id".
|
|
1363
|
+
graph_label_header : str , optional
|
|
1364
|
+
The header string used to specify the graph label. The default is "label".
|
|
1365
|
+
num_nodes_header : str , optional
|
|
1366
|
+
The header string used to specify the number of nodes. The default is "num_nodes".
|
|
1367
|
+
src_header : str , optional
|
|
1368
|
+
The header string used to specify the source of edges. The default is "src".
|
|
1369
|
+
dst_header : str , optional
|
|
1370
|
+
The header string used to specify the destination of edges. The default is "dst".
|
|
1371
|
+
node_label_header : str , optional
|
|
1372
|
+
The header string used to specify the node label. The default is "label".
|
|
1373
|
+
node_attr_key : str , optional
|
|
1374
|
+
The key string used to specify the node attributes. The default is "node_attr".
|
|
1375
|
+
categories : list
|
|
1376
|
+
The list of categories.
|
|
1377
|
+
bidirectional : bool , optional
|
|
1378
|
+
If set to True, the output DGL graph is forced to be bi-directional. The default is True.
|
|
1379
|
+
|
|
1380
|
+
Returns
|
|
1381
|
+
-------
|
|
1382
|
+
list
|
|
1383
|
+
The list of DGL graphs found in the input CSV files.
|
|
1384
|
+
|
|
1385
|
+
"""
|
|
1386
|
+
|
|
1387
|
+
graphs = pd.read_csv(graphs_file_path)
|
|
1388
|
+
edges = pd.read_csv(edges_file_path)
|
|
1389
|
+
nodes = pd.read_csv(nodes_file_path)
|
|
1390
|
+
dgl_graphs = []
|
|
1391
|
+
labels = []
|
|
1392
|
+
|
|
1393
|
+
# Create a graph for each graph ID from the edges table.
|
|
1394
|
+
# First process the graphs table into two dictionaries with graph IDs as keys.
|
|
1395
|
+
# The label and number of nodes are values.
|
|
1396
|
+
label_dict = {}
|
|
1397
|
+
num_nodes_dict = {}
|
|
1398
|
+
for _, row in graphs.iterrows():
|
|
1399
|
+
label_dict[row[graph_id_header]] = row[graph_label_header]
|
|
1400
|
+
num_nodes_dict[row[graph_id_header]] = row[num_nodes_header]
|
|
1401
|
+
# For the edges, first group the table by graph IDs.
|
|
1402
|
+
edges_group = edges.groupby(graph_id_header)
|
|
1403
|
+
# For the nodes, first group the table by graph IDs.
|
|
1404
|
+
nodes_group = nodes.groupby(graph_id_header)
|
|
1405
|
+
# For each graph ID...
|
|
1406
|
+
for graph_id in edges_group.groups:
|
|
1407
|
+
graph_dict = {}
|
|
1408
|
+
graph_dict[src_header] = []
|
|
1409
|
+
graph_dict[dst_header] = []
|
|
1410
|
+
graph_dict[node_label_header] = {}
|
|
1411
|
+
graph_dict["node_features"] = []
|
|
1412
|
+
num_nodes = num_nodes_dict[graph_id]
|
|
1413
|
+
graph_label = label_dict[graph_id]
|
|
1414
|
+
labels.append(graph_label)
|
|
1415
|
+
|
|
1416
|
+
# Find the edges as well as the number of nodes and its label.
|
|
1417
|
+
edges_of_id = edges_group.get_group(graph_id)
|
|
1418
|
+
src = edges_of_id[src_header].to_numpy()
|
|
1419
|
+
dst = edges_of_id[dst_header].to_numpy()
|
|
1420
|
+
|
|
1421
|
+
# Find the nodes and their labels and features
|
|
1422
|
+
nodes_of_id = nodes_group.get_group(graph_id)
|
|
1423
|
+
node_labels = nodes_of_id[node_label_header]
|
|
1424
|
+
#graph_dict["node_labels"][graph_id] = node_labels
|
|
1425
|
+
|
|
1426
|
+
for node_label in node_labels:
|
|
1427
|
+
graph_dict["node_features"].append(torch.tensor(DGL.OneHotEncode(node_label, categories)))
|
|
1428
|
+
# Create a graph and add it to the list of graphs and labels.
|
|
1429
|
+
dgl_graph = dgl.graph((src, dst), num_nodes=num_nodes)
|
|
1430
|
+
# Setting the node features as node_attr_key using onehotencoding of node_label
|
|
1431
|
+
dgl_graph.ndata[node_attr_key] = torch.stack(graph_dict["node_features"])
|
|
1432
|
+
if bidirectional:
|
|
1433
|
+
dgl_graph = dgl.add_reverse_edges(dgl_graph)
|
|
1434
|
+
dgl_graphs.append(dgl_graph)
|
|
1435
|
+
return {"graphs":dgl_graphs, "labels":labels}
|
|
1436
|
+
|
|
1437
|
+
@staticmethod
|
|
1438
|
+
def GraphsByImportedDGCNN(file_path, categories=[], bidirectional=True):
|
|
1439
|
+
"""
|
|
1440
|
+
Returns the Graphs from the imported DGCNN file.
|
|
1441
|
+
|
|
1442
|
+
Parameters
|
|
1443
|
+
----------
|
|
1444
|
+
file_path : str
|
|
1445
|
+
The file path to the DGCNN text file.
|
|
1446
|
+
categories : list
|
|
1447
|
+
The list of node categories expected in the imported DGCNN file. This is used to one-hot-encode the node features.
|
|
1448
|
+
bidirectional : bool , optional
|
|
1449
|
+
If set to True, the output DGL graph is forced to be bi-directional. The default is True.
|
|
1450
|
+
|
|
1451
|
+
Returns
|
|
1452
|
+
-------
|
|
1453
|
+
dict
|
|
1454
|
+
A dictionary object that contains the imported graphs and their corresponding labels. The dictionary has the following keys and values:
|
|
1455
|
+
- "graphs" (list): The list of DGL graphs
|
|
1456
|
+
- "labels" (list): The list of graph labels
|
|
1457
|
+
|
|
1458
|
+
"""
|
|
1459
|
+
graphs = []
|
|
1460
|
+
labels = []
|
|
1461
|
+
file = open(file_path)
|
|
1462
|
+
if file:
|
|
1463
|
+
lines = file.readlines()
|
|
1464
|
+
n_graphs = int(lines[0])
|
|
1465
|
+
index = 1
|
|
1466
|
+
for i in range(n_graphs):
|
|
1467
|
+
graph_dict = {}
|
|
1468
|
+
graph_dict["src"] = []
|
|
1469
|
+
graph_dict["dst"] = []
|
|
1470
|
+
graph_dict["node_labels"] = {}
|
|
1471
|
+
graph_dict["node_features"] = []
|
|
1472
|
+
line = lines[index].split()
|
|
1473
|
+
n_nodes = int(line[0])
|
|
1474
|
+
graph_dict["num_nodes"] = n_nodes
|
|
1475
|
+
graph_label = int(line[1])
|
|
1476
|
+
labels.append(graph_label)
|
|
1477
|
+
index+=1
|
|
1478
|
+
for j in range(n_nodes):
|
|
1479
|
+
line = lines[index+j].split()
|
|
1480
|
+
node_label = int(line[0])
|
|
1481
|
+
graph_dict["node_labels"][j] = node_label
|
|
1482
|
+
graph_dict["node_features"].append(torch.tensor(DGL.OneHotEncode(node_label, categories)))
|
|
1483
|
+
adj_vertices = line[2:]
|
|
1484
|
+
for adj_vertex in adj_vertices:
|
|
1485
|
+
graph_dict["src"].append(j)
|
|
1486
|
+
graph_dict["dst"].append(int(adj_vertex))
|
|
1487
|
+
|
|
1488
|
+
# Create DGL graph
|
|
1489
|
+
src = np.array(graph_dict["src"])
|
|
1490
|
+
dst = np.array(graph_dict["dst"])
|
|
1491
|
+
# Create a graph
|
|
1492
|
+
dgl_graph = dgl.graph((src, dst), num_nodes=graph_dict["num_nodes"])
|
|
1493
|
+
# Setting the node features as 'node_attr' using onehotencoding of vlabel
|
|
1494
|
+
dgl_graph.ndata['node_attr'] = torch.stack(graph_dict["node_features"])
|
|
1495
|
+
if bidirectional:
|
|
1496
|
+
dgl_graph = dgl.add_reverse_edges(dgl_graph)
|
|
1497
|
+
graphs.append(dgl_graph)
|
|
1498
|
+
index+=n_nodes
|
|
1499
|
+
file.close()
|
|
1500
|
+
return {"graphs":graphs, "labels":labels}
|
|
1501
|
+
|
|
1502
|
+
@staticmethod
|
|
1503
|
+
def CategoryDistribution(labels, categories=None, mantissa=4):
|
|
1504
|
+
"""
|
|
1505
|
+
Returns the category distribution in the input list of labels. This is useful to determine if the dataset is balanced or not.
|
|
1506
|
+
|
|
1507
|
+
Parameters
|
|
1508
|
+
----------
|
|
1509
|
+
labels : list
|
|
1510
|
+
The input list of labels.
|
|
1511
|
+
categories : list , optional
|
|
1512
|
+
The list of node categories expected in the imported DGCNN file. If not specified, the categories are computed directly from the labels. The default is None.
|
|
1513
|
+
mantissa : int , optional
|
|
1514
|
+
The desired length of the mantissa. The default is 4.
|
|
1515
|
+
|
|
1516
|
+
Returns
|
|
1517
|
+
-------
|
|
1518
|
+
dict
|
|
1519
|
+
A dictionary object that contains the categories and their corresponding ratios. The dictionary has the following keys and values:
|
|
1520
|
+
- "categories" (list): The list of categories.
|
|
1521
|
+
- "ratios" (list): The list of ratios of each category as found in the input list of labels.
|
|
1522
|
+
|
|
1523
|
+
"""
|
|
1524
|
+
if not categories:
|
|
1525
|
+
categories = list(set(labels))
|
|
1526
|
+
ratios = []
|
|
1527
|
+
for category in categories:
|
|
1528
|
+
ratios.append(round(float(labels.count(category))/float(len(labels)), mantissa))
|
|
1529
|
+
return {"categories":[categories], "ratios":[ratios]}
|
|
1530
|
+
|
|
1531
|
+
@staticmethod
|
|
1532
|
+
def ModelByFilePath(path):
|
|
1533
|
+
"""
|
|
1534
|
+
Returns the model found at the input file path.
|
|
1535
|
+
Parameters
|
|
1536
|
+
----------
|
|
1537
|
+
path : str
|
|
1538
|
+
File path for the saved classifier.
|
|
1539
|
+
|
|
1540
|
+
Returns
|
|
1541
|
+
-------
|
|
1542
|
+
DGL Classifier
|
|
1543
|
+
The classifier.
|
|
1544
|
+
|
|
1545
|
+
"""
|
|
1546
|
+
if not path:
|
|
1547
|
+
return None
|
|
1548
|
+
return torch.load(path)
|
|
1549
|
+
|
|
1550
|
+
@staticmethod
def ConfusionMatrix(actual, predicted, normalize=False):
|
|
1551
|
+
"""
|
|
1552
|
+
Returns the confusion matrix for the input actual and predicted labels. This is to be used with classification tasks only, not regression.
|
|
1553
|
+
|
|
1554
|
+
Parameters
|
|
1555
|
+
----------
|
|
1556
|
+
actual : list
|
|
1557
|
+
The input list of actual labels.
|
|
1558
|
+
predicted : list
|
|
1559
|
+
The input list of predicted labels.
|
|
1560
|
+
normalize : bool , optional
|
|
1561
|
+
If set to True, the returned data will be normalized (proportion of 1). Otherwise, actual numbers are returned. The default is False.
|
|
1562
|
+
|
|
1563
|
+
Returns
|
|
1564
|
+
-------
|
|
1565
|
+
list
|
|
1566
|
+
The created confusion matrix.
|
|
1567
|
+
|
|
1568
|
+
"""
|
|
1569
|
+
from sklearn import metrics
|
|
1570
|
+
import numpy
|
|
1571
|
+
if normalize:
|
|
1572
|
+
cm = numpy.transpose(metrics.confusion_matrix(y_true=actual, y_pred=predicted, normalize="true"))
|
|
1573
|
+
else:
|
|
1574
|
+
cm = numpy.transpose(metrics.confusion_matrix(y_true=actual, y_pred=predicted))
|
|
1575
|
+
return cm
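# Because of the transpose above, entry [i][j] counts samples predicted as class i whose actual
# class is j. A usage sketch with hypothetical labels:
# DGL.ConfusionMatrix(actual=[0, 0, 1, 1], predicted=[0, 1, 1, 1]) -> array([[1, 0], [1, 2]])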
|
|
1576
|
+
|
|
1577
|
+
@staticmethod
|
|
1578
|
+
def DatasetByGraphs(dictionary, key="node_attr"):
|
|
1579
|
+
"""
|
|
1580
|
+
Returns a DGL Dataset from the input DGL graphs.
|
|
1581
|
+
|
|
1582
|
+
Parameters
|
|
1583
|
+
----------
|
|
1584
|
+
dictionary : dict
|
|
1585
|
+
The input dictionary of graphs and labels. This dictionary must have the keys "graphs" and "labels"
|
|
1586
|
+
key : str
|
|
1587
|
+
The key used for the node attributes.
|
|
1588
|
+
|
|
1589
|
+
Returns
|
|
1590
|
+
-------
|
|
1591
|
+
DGL.Dataset
|
|
1592
|
+
The created DGL dataset.
|
|
1593
|
+
|
|
1594
|
+
"""
|
|
1595
|
+
graphs = dictionary['graphs']
|
|
1596
|
+
labels = dictionary['labels']
|
|
1597
|
+
return _Dataset(graphs, labels, key)
|
|
1598
|
+
|
|
1599
|
+
@staticmethod
|
|
1600
|
+
def DatasetByImportedCSV_NC(folderPath):
|
|
1601
|
+
"""
|
|
1602
|
+
UNDER CONSTRUCTION. DO NOT USE.
|
|
1603
|
+
|
|
1604
|
+
Parameters
|
|
1605
|
+
----------
|
|
1606
|
+
folderPath : str
|
|
1607
|
+
The path to the folder containing the input CSV files. That folder should contain graphs.csv, edges.csv, and vertices.csv.
|
|
1608
|
+
|
|
1609
|
+
Returns
|
|
1610
|
+
-------
|
|
1611
|
+
DGLDataset
|
|
1612
|
+
The returned DGL dataset.
|
|
1613
|
+
|
|
1614
|
+
"""
|
|
1615
|
+
return dgl.data.CSVDataset(folderPath, force_reload=True)
|
|
1616
|
+
|
|
1617
|
+
@staticmethod
|
|
1618
|
+
def DatasetBySample(name="ENZYMES"):
|
|
1619
|
+
"""
|
|
1620
|
+
Returns a dataset from the samples database.
|
|
1621
|
+
|
|
1622
|
+
Parameters
|
|
1623
|
+
----------
|
|
1624
|
+
name : str
|
|
1625
|
+
The name of the sample dataset. This can be "ENZYMES", "DD", "COLLAB", or "MUTAG". It is case insensitive. The default is "ENZYMES".
|
|
1626
|
+
|
|
1627
|
+
Returns
|
|
1628
|
+
-------
|
|
1629
|
+
GraphDGL
|
|
1630
|
+
The created DGL dataset.
|
|
1631
|
+
|
|
1632
|
+
"""
|
|
1633
|
+
name = name.upper()
|
|
1634
|
+
dataset = dgl.data.TUDataset(name)
|
|
1635
|
+
dgl_graphs, dgl_labels = zip(*[dataset[i] for i in range(len(dataset.graph_lists))])
|
|
1636
|
+
if name == 'ENZYMES':
|
|
1637
|
+
node_attr_key = 'node_attr'
|
|
1638
|
+
elif name == 'DD':
|
|
1639
|
+
node_attr_key = 'node_labels'
|
|
1640
|
+
elif name == 'COLLAB':
|
|
1641
|
+
node_attr_key = '_ID'
|
|
1642
|
+
elif name == 'MUTAG':
|
|
1643
|
+
node_attr_key = 'node_labels'
|
|
1644
|
+
else:
|
|
1645
|
+
raise NotImplementedError
|
|
1646
|
+
return _Dataset(dgl_graphs, dgl_labels, node_attr_key)
|
|
1647
|
+
|
|
1648
|
+
@staticmethod
|
|
1649
|
+
def DatasetBySample_NC(name="Cora"):
|
|
1650
|
+
"""
|
|
1651
|
+
Returns the sample dataset as specified by the input sample name
|
|
1652
|
+
|
|
1653
|
+
Parameters
|
|
1654
|
+
----------
|
|
1655
|
+
name : str
|
|
1656
|
+
The name of the sample dataset to load. This can be "Cora", "Citeseer", or "Pubmed". It is case insensitive. The default is "Cora".
|
|
1657
|
+
|
|
1658
|
+
Raises
|
|
1659
|
+
------
|
|
1660
|
+
NotImplementedError
|
|
1661
|
+
Raised if the input sample dataset name is not one of the supported options.
|
|
1662
|
+
|
|
1663
|
+
Returns
|
|
1664
|
+
-------
|
|
1665
|
+
list
|
|
1666
|
+
A list of two elements: the loaded DGL dataset and the number of node classes it contains.
|
|
1667
|
+
|
|
1668
|
+
"""
|
|
1669
|
+
if name.lower() == 'cora':
|
|
1670
|
+
return [dgl.data.CoraGraphDataset(), 7]
|
|
1671
|
+
elif name.lower() == 'citeseer':
|
|
1672
|
+
return [dgl.data.CiteseerGraphDataset(), 6]
|
|
1673
|
+
elif name.lower() == 'pubmed':
|
|
1674
|
+
return [dgl.data.PubmedGraphDataset(), 3]
|
|
1675
|
+
else:
|
|
1676
|
+
raise NotImplementedError
|
|
1677
|
+
|
|
1678
|
+
@staticmethod
|
|
1679
|
+
def DatasetGraphs(dataset):
|
|
1680
|
+
"""
|
|
1681
|
+
Returns the DGL graphs found in the input dataset.
|
|
1682
|
+
|
|
1683
|
+
Parameters
|
|
1684
|
+
----------
|
|
1685
|
+
dataset : DGLDataset
|
|
1686
|
+
The input dataset.
|
|
1687
|
+
|
|
1688
|
+
Returns
|
|
1689
|
+
-------
|
|
1690
|
+
list
|
|
1691
|
+
The list of DGL graphs found in the input dataset.
|
|
1692
|
+
|
|
1693
|
+
"""
|
|
1694
|
+
try:
|
|
1695
|
+
_ = dataset[1]
|
|
1696
|
+
except:
|
|
1697
|
+
dataset = [dataset[0]]
|
|
1698
|
+
graphs = []
|
|
1699
|
+
for aGraph in dataset:
|
|
1700
|
+
if isinstance(aGraph, tuple):
|
|
1701
|
+
aGraph = aGraph[0]
|
|
1702
|
+
graphs.append(aGraph)
|
|
1703
|
+
return graphs
|
|
1704
|
+
|
|
1705
|
+
@staticmethod
|
|
1706
|
+
def GraphEdgeData(graph):
|
|
1707
|
+
"""
|
|
1708
|
+
Returns the edge data found in the input DGL graph.
|
|
1709
|
+
Parameters
|
|
1710
|
+
----------
|
|
1711
|
+
graph : DGL Graph
|
|
1712
|
+
The input DGL graph.
|
|
1713
|
+
|
|
1714
|
+
Returns
|
|
1715
|
+
-------
|
|
1716
|
+
edge data
|
|
1717
|
+
The edge data.
|
|
1718
|
+
|
|
1719
|
+
"""
|
|
1720
|
+
return graph.edata
|
|
1721
|
+
|
|
1722
|
+
@staticmethod
|
|
1723
|
+
def Hyperparameters(optimizer, model_type="classifier", cv_type="Holdout", split=[0.8,0.1,0.1], k_folds=5,
|
|
1724
|
+
hl_widths=[32], conv_layer_type="SAGEConv", pooling="AvgPooling",
|
|
1725
|
+
batch_size=1, epochs=1, use_gpu=False, loss_function="Cross Entropy"):
|
|
1726
|
+
"""
|
|
1727
|
+
Creates a hyperparameters object based on the input settings.
|
|
1728
|
+
|
|
1729
|
+
Parameters
|
|
1730
|
+
----------
|
|
1731
|
+
model_type : str , optional
|
|
1732
|
+
The desired type of model. The options are:
|
|
1733
|
+
- "Classifier"
|
|
1734
|
+
- "Regressor"
|
|
1735
|
+
The option is case insensitive. The default is "classifierholdout"
|
|
1736
|
+
optimizer : Optimizer
|
|
1737
|
+
The desired optimizer.
|
|
1738
|
+
cv_type : str , optional
|
|
1739
|
+
The desired cross-validation method. This can be "Holdout" or "K-Fold". It is case-insensitive. The default is "Holdout".
|
|
1740
|
+
split : list , optional
|
|
1741
|
+
The desired split between training, validation, and testing. [0.8, 0.1, 0.1] means that 80% of the data is used for training, 10% is used for validation, and 10% is used for testing. The default is [0.8, 0.1, 0.1].
|
|
1742
|
+
k_folds : int , optional
|
|
1743
|
+
The desired number of k-folds. The default is 5.
|
|
1744
|
+
hl_widths : list , optional
|
|
1745
|
+
The list of hidden layer widths. A list of [16, 32, 16] means that the model will have 3 hidden layers with number of neurons in each being 16, 32, 16 respectively from input to output. The default is [32].
|
|
1746
|
+
conv_layer_type : str , optional
|
|
1747
|
+
The desired type of the convolution layer. The options are "Classic", "GraphConv", "GINConv", "SAGEConv", "TAGConv", "DGN". It is case insensitive. The default is "SAGEConv".
|
|
1748
|
+
pooling : str , optional
|
|
1749
|
+
The desired type of pooling. The options are "AvgPooling", "MaxPooling", or "SumPooling". It is case insensitive. The default is "AvgPooling".
|
|
1750
|
+
batch_size : int , optional
|
|
1751
|
+
The desired batch size. The default is 1.
|
|
1752
|
+
epochs : int , optional
|
|
1753
|
+
The desired number of epochs. The default is 1.
|
|
1754
|
+
use_gpu : bool , optional
|
|
1755
|
+
If set to True, the model will attempt to use the GPU. The default is False.
|
|
1756
|
+
loss_function : str , optional
|
|
1757
|
+
The desired loss function. The options are "Cross Entropy" or "Negative Log Likelihood". It is case insensitive. The default is "Cross Entropy".
|
|
1758
|
+
|
|
1759
|
+
Returns
|
|
1760
|
+
-------
|
|
1761
|
+
Hyperparameters
|
|
1762
|
+
The created hyperparameters object.
|
|
1763
|
+
|
|
1764
|
+
"""
|
|
1765
|
+
|
|
1766
|
+
if optimizer['name'].lower() == "adadelta":
|
|
1767
|
+
optimizer_str = "Adadelta"
|
|
1768
|
+
elif optimizer['name'].lower() == "adagrad":
|
|
1769
|
+
optimizer_str = "Adagrad"
|
|
1770
|
+
elif optimizer['name'].lower() == "adam":
|
|
1771
|
+
optimizer_str = "Adam"
|
|
1772
|
+
return _Hparams(model_type,
|
|
1773
|
+
optimizer_str,
|
|
1774
|
+
optimizer['amsgrad'],
|
|
1775
|
+
optimizer['betas'],
|
|
1776
|
+
optimizer['eps'],
|
|
1777
|
+
optimizer['lr'],
|
|
1778
|
+
optimizer['lr_decay'],
|
|
1779
|
+
optimizer['maximize'],
|
|
1780
|
+
optimizer['rho'],
|
|
1781
|
+
optimizer['weight_decay'],
|
|
1782
|
+
cv_type,
|
|
1783
|
+
split,
|
|
1784
|
+
k_folds,
|
|
1785
|
+
hl_widths,
|
|
1786
|
+
conv_layer_type,
|
|
1787
|
+
pooling,
|
|
1788
|
+
batch_size,
|
|
1789
|
+
epochs,
|
|
1790
|
+
use_gpu,
|
|
1791
|
+
loss_function)
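# A usage sketch (all values are illustrative): build an optimizer description first, then wrap it
# in a hyperparameters object, here for a k-fold cross-validated classifier.
# optimizer = DGL.Optimizer(name="Adam", lr=0.001)
# hparams = DGL.Hyperparameters(optimizer, model_type="classifier", cv_type="K-Fold", k_folds=5,
#                               hl_widths=[32, 32], conv_layer_type="SAGEConv", pooling="AvgPooling",
#                               batch_size=4, epochs=20)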
|
|
1792
|
+
|
|
1793
|
+
@staticmethod
|
|
1794
|
+
def OneHotEncode(item, categories):
|
|
1795
|
+
"""
|
|
1796
|
+
One-hot encodes the input item according to the input categories. One-hot encoding is a method for converting categorical variables into numerical data that machine learning algorithms can work with. It is most often used during feature engineering: each category becomes a new column that is assigned a binary value of 1 or 0.
|
|
1797
|
+
|
|
1798
|
+
Parameters
|
|
1799
|
+
----------
|
|
1800
|
+
item : any
|
|
1801
|
+
The input item.
|
|
1802
|
+
categories : list
|
|
1803
|
+
The input list of categories.
|
|
1804
|
+
|
|
1805
|
+
Returns
|
|
1806
|
+
-------
|
|
1807
|
+
list
|
|
1808
|
+
A one-hot encoded list of the input item according to the input categories.
|
|
1809
|
+
|
|
1810
|
+
"""
|
|
1811
|
+
returnList = []
|
|
1812
|
+
for i in range(len(categories)):
|
|
1813
|
+
if item == categories[i]:
|
|
1814
|
+
returnList.append(1)
|
|
1815
|
+
else:
|
|
1816
|
+
returnList.append(0)
|
|
1817
|
+
return returnList
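# A worked example with hypothetical categories; only the position that matches the item is set to 1,
# and an item outside the categories encodes to all zeros:
# DGL.OneHotEncode("door", ["wall", "door", "window"]) -> [0, 1, 0]
# DGL.OneHotEncode("roof", ["wall", "door", "window"]) -> [0, 0, 0]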
|
|
1818
|
+
|
|
1819
|
+
@staticmethod
|
|
1820
|
+
def DatasetLabels(dataset):
|
|
1821
|
+
"""
|
|
1822
|
+
Returns the labels of the graphs in the input dataset
|
|
1823
|
+
|
|
1824
|
+
Parameters
|
|
1825
|
+
----------
|
|
1826
|
+
dataset : DGLDataset
|
|
1827
|
+
The input dataset
|
|
1828
|
+
|
|
1829
|
+
Returns
|
|
1830
|
+
-------
|
|
1831
|
+
list
|
|
1832
|
+
The list of labels.
|
|
1833
|
+
"""
|
|
1834
|
+
return [int(g[1]) for g in dataset]
|
|
1835
|
+
|
|
1836
|
+
@staticmethod
|
|
1837
|
+
def DatasetMerge(datasets, key="node_attr"):
|
|
1838
|
+
"""
|
|
1839
|
+
Merges the input list of datasets into one dataset
|
|
1840
|
+
|
|
1841
|
+
Parameters
|
|
1842
|
+
----------
|
|
1843
|
+
datasets : list
|
|
1844
|
+
The input list of DGL datasets.
|
|
1845
|
+
|
|
1846
|
+
Returns
|
|
1847
|
+
-------
|
|
1848
|
+
DGLDataset
|
|
1849
|
+
The merged dataset
|
|
1850
|
+
"""
|
|
1851
|
+
|
|
1852
|
+
graphs = []
|
|
1853
|
+
labels = []
|
|
1854
|
+
for ds in datasets:
|
|
1855
|
+
graphs += DGL.DatasetGraphs(ds)
|
|
1856
|
+
labels += DGL.DatasetLabels(ds)
|
|
1857
|
+
return DGL.DatasetByGraphs({'graphs': graphs, 'labels': labels}, key=key)
|
|
1858
|
+
|
|
1859
|
+
@staticmethod
|
|
1860
|
+
def GraphNodeData(graph):
|
|
1861
|
+
"""
|
|
1862
|
+
Returns the node data found in the input DGL graph.
|
|
1863
|
+
|
|
1864
|
+
Parameters
|
|
1865
|
+
----------
|
|
1866
|
+
graph : DGL graph
|
|
1867
|
+
The input DGL graph.
|
|
1868
|
+
|
|
1869
|
+
Returns
|
|
1870
|
+
-------
|
|
1871
|
+
node data
|
|
1872
|
+
The node data.
|
|
1873
|
+
|
|
1874
|
+
"""
|
|
1875
|
+
return graph.ndata
|
|
1876
|
+
|
|
1877
|
+
@staticmethod
|
|
1878
|
+
def DatasetRemoveCategory(dataset, label, key="node_attr"):
|
|
1879
|
+
"""
|
|
1880
|
+
Removes graphs from the input dataset that have the input label
|
|
1881
|
+
|
|
1882
|
+
Parameters
|
|
1883
|
+
----------
|
|
1884
|
+
dataset : DGLDataset
|
|
1885
|
+
The input dataset
|
|
1886
|
+
label : int
|
|
1887
|
+
The input label
|
|
1888
|
+
key : str , optional
|
|
1889
|
+
The input node attribute key
|
|
1890
|
+
|
|
1891
|
+
Returns
|
|
1892
|
+
-------
|
|
1893
|
+
DGLDataset
|
|
1894
|
+
The resulting dataset
|
|
1895
|
+
|
|
1896
|
+
"""
|
|
1897
|
+
|
|
1898
|
+
graphs = DGL.DatasetGraphs(dataset)
|
|
1899
|
+
labels = DGL.DatasetLabels(dataset)
|
|
1900
|
+
new_graphs = []
|
|
1901
|
+
new_labels = []
|
|
1902
|
+
for i in range(len(labels)):
|
|
1903
|
+
if not labels[i] == label:
|
|
1904
|
+
new_graphs.append(graphs[i])
|
|
1905
|
+
new_labels.append(labels[i])
|
|
1906
|
+
return DGL.DatasetByGraphs({'graphs': new_graphs, 'labels': new_labels}, key=key)
|
|
1907
|
+
|
|
1908
|
+
@staticmethod
|
|
1909
|
+
def DatasetSplit(dataset, fracList=[0.8, 0.1, 0.1], shuffle=False, randomState=None, key="node_attr"):
|
|
1910
|
+
"""
|
|
1911
|
+
Splits the dataset into training, validation, and testing datasets.
|
|
1912
|
+
|
|
1913
|
+
Parameters
|
|
1914
|
+
----------
|
|
1915
|
+
dataset : DGLDataset
|
|
1916
|
+
The input dataset
|
|
1917
|
+
fracList : list , optional
|
|
1918
|
+
A list of length 3 containing the fraction to use for training, validation and test. If None, we will use [0.8, 0.1, 0.1]. The default is [0.8, 0.1, 0.1]
|
|
1919
|
+
randomState : int or array_like , optional
|
|
1920
|
+
Random seed used to initialize the pseudo-random number generator. Can be any integer between 0 and 2**32 - 1 inclusive, an array (or other sequence) of such integers, or None (the default). If seed is None, then RandomState will try to read data from /dev/urandom (or the Windows analogue) if available or seed from the clock otherwise.
|
|
1921
|
+
Returns
|
|
1922
|
+
-------
|
|
1923
|
+
dict
|
|
1924
|
+
The dictionary of the split datasets. The dictionary contains the following keys and values:
|
|
1925
|
+
- "train_ds" (DGLDataset)
|
|
1926
|
+
- "validate_ds" (DGLDataset)
|
|
1927
|
+
- "test_ds" (DGLDataset)
|
|
1928
|
+
|
|
1929
|
+
"""
|
|
1930
|
+
|
|
1931
|
+
if not 0 <= fracList[0] <= 1:
|
|
1932
|
+
return None
|
|
1933
|
+
if not 0 <= fracList[1] <= 1:
|
|
1934
|
+
return None
|
|
1935
|
+
if not 0 <= fracList[2] <= 1:
|
|
1936
|
+
return None
|
|
1937
|
+
if sum(fracList) > 1:
|
|
1938
|
+
return None
|
|
1939
|
+
datasets = dgl.data.utils.split_dataset(dataset, frac_list=fracList, shuffle=shuffle, random_state=randomState)
|
|
1940
|
+
if fracList[0] > 0:
|
|
1941
|
+
train_ds = DGL.DatasetByGraphs({'graphs': DGL.DatasetGraphs(datasets[0]), 'labels' :DGL.DatasetLabels(datasets[0])}, key=key)
|
|
1942
|
+
else:
|
|
1943
|
+
train_ds = None
|
|
1944
|
+
if fracList[1] > 0:
|
|
1945
|
+
validate_ds = DGL.DatasetByGraphs({'graphs': DGL.DatasetGraphs(datasets[1]), 'labels' :DGL.DatasetLabels(datasets[1])}, key=key)
|
|
1946
|
+
else:
|
|
1947
|
+
validate_ds = None
|
|
1948
|
+
if fracList[2] > 0:
|
|
1949
|
+
test_ds = DGL.DatasetByGraphs({'graphs': DGL.DatasetGraphs(datasets[2]), 'labels' :DGL.DatasetLabels(datasets[2])}, key=key)
|
|
1950
|
+
else:
|
|
1951
|
+
test_ds = None
|
|
1952
|
+
|
|
1953
|
+
return {
|
|
1954
|
+
"train_ds" : train_ds,
|
|
1955
|
+
"validate_ds" : validate_ds,
|
|
1956
|
+
"test_ds" : test_ds
|
|
1957
|
+
}
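# A usage sketch: split a dataset 80/10/10 and recover the three parts.
# result = DGL.DatasetSplit(dataset, fracList=[0.8, 0.1, 0.1], shuffle=True, key="node_attr")
# train_ds, validate_ds, test_ds = result["train_ds"], result["validate_ds"], result["test_ds"]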
|
|
1958
|
+
@staticmethod
|
|
1959
|
+
def Optimizer(name="Adam", amsgrad=True, betas=(0.9,0.999), eps=0.000001, lr=0.001, maximize=False, weightDecay=0.0, rho=0.9, lr_decay=0.0):
|
|
1960
|
+
"""
|
|
1961
|
+
Returns the parameters of the optimizer
|
|
1962
|
+
|
|
1963
|
+
Parameters
|
|
1964
|
+
----------
|
|
1965
|
+
amsgrad : bool , optional.
|
|
1966
|
+
amsgrad is an extension to the Adam version of gradient descent that attempts to improve the convergence properties of the algorithm, avoiding large abrupt changes in the learning rate for each input variable. The default is True.
|
|
1967
|
+
betas : tuple , optional
|
|
1968
|
+
Betas are used for smoothing the path to convergence and for providing some momentum to cross a local minimum or saddle point. The default is (0.9, 0.999).
|
|
1969
|
+
eps : float , optional
|
|
1970
|
+
eps is a term added to the denominator to improve numerical stability. The default is 0.000001.
|
|
1971
|
+
lr : float
|
|
1972
|
+
The learning rate (lr) defines the adjustment in the weights of our network with respect to the loss gradient descent. The default is 0.001.
|
|
1973
|
+
maximize : bool , optional
|
|
1974
|
+
If set to True, the parameters are maximized based on the objective instead of minimized. The default is False.
|
|
1975
|
+
weightDecay : float , optional
|
|
1976
|
+
weightDecay (L2 penalty) is a regularization technique applied to the weights of a neural network. The default is 0.0.
|
|
1977
|
+
|
|
1978
|
+
Returns
|
|
1979
|
+
-------
|
|
1980
|
+
dict
|
|
1981
|
+
The dictionary of the optimizer parameters. The dictionary contains the following keys and values:
|
|
1982
|
+
- "name" (str): The name of the optimizer
|
|
1983
|
+
- "amsgrad" (bool):
|
|
1984
|
+
- "betas" (tuple):
|
|
1985
|
+
- "eps" (float):
|
|
1986
|
+
- "lr" (float):
|
|
1987
|
+
- "maximize" (bool):
|
|
1988
|
+
- "weight_decay" (float):
- "rho" (float):
- "lr_decay" (float):
|
|
1989
|
+
|
|
1990
|
+
"""
|
|
1991
|
+
return {"name":name, "amsgrad":amsgrad, "betas":betas, "eps":eps, "lr": lr, "maximize":maximize, "weight_decay":weightDecay, "rho":rho, "lr_decay":lr_decay}
|
|
1992
|
+
|
|
1993
|
+
@staticmethod
|
|
1994
|
+
def ModelClassify(model, dataset, node_attr_key="node_attr"):
|
|
1995
|
+
"""
|
|
1996
|
+
Predicts the classification labels of the input dataset.
|
|
1997
|
+
|
|
1998
|
+
Parameters
|
|
1999
|
+
----------
|
|
2000
|
+
dataset : DGLDataset
|
|
2001
|
+
The input DGL dataset.
|
|
2002
|
+
model : Model
|
|
2003
|
+
The input trained model.
|
|
2004
|
+
node_attr_key : str , optional
|
|
2005
|
+
The key used for node attributes. The default is "node_attr".
|
|
2006
|
+
|
|
2007
|
+
Returns
|
|
2008
|
+
-------
|
|
2009
|
+
dict
|
|
2010
|
+
Dictionary containing labels and probabilities. The included keys and values are:
|
|
2011
|
+
- "predictions" (list): the list of predicted labels
|
|
2012
|
+
- "probabilities" (list): the list of probabilities that the label is one of the categories.
|
|
2013
|
+
|
|
2014
|
+
"""
|
|
2015
|
+
labels = []
|
|
2016
|
+
probabilities = []
|
|
2017
|
+
for item in tqdm(dataset, desc='Classifying', leave=False):
|
|
2018
|
+
graph = item[0]
|
|
2019
|
+
pred = model(graph, graph.ndata[node_attr_key].float())
|
|
2020
|
+
labels.append(pred.argmax(1).item())
|
|
2021
|
+
probability = (torch.nn.functional.softmax(pred, dim=1).tolist())
|
|
2022
|
+
probability = probability[0]
|
|
2023
|
+
temp_probability = []
|
|
2024
|
+
for p in probability:
|
|
2025
|
+
temp_probability.append(round(p, 3))
|
|
2026
|
+
probabilities.append(temp_probability)
|
|
2027
|
+
return {"predictions":labels, "probabilities":probabilities}
|
|
2028
|
+
|
|
2029
|
+
@staticmethod
|
|
2030
|
+
def ModelPredict(model, dataset, node_attr_key="node_attr"):
|
|
2031
|
+
"""
|
|
2032
|
+
Predicts the value of the input dataset.
|
|
2033
|
+
|
|
2034
|
+
Parameters
|
|
2035
|
+
----------
|
|
2036
|
+
dataset : DGLDataset
|
|
2037
|
+
The input DGL dataset.
|
|
2038
|
+
model : Model
|
|
2039
|
+
The input trained model.
|
|
2040
|
+
node_attr_key : str , optional
|
|
2041
|
+
The key used for node attributes. The default is "node_attr".
|
|
2042
|
+
|
|
2043
|
+
Returns
|
|
2044
|
+
-------
|
|
2045
|
+
list
|
|
2046
|
+
The list of predictions
|
|
2047
|
+
"""
|
|
2048
|
+
values = []
|
|
2049
|
+
for item in tqdm(dataset, desc='Predicting', leave=False):
|
|
2050
|
+
graph = item[0]
|
|
2051
|
+
pred = model(graph, graph.ndata[node_attr_key].float())
|
|
2052
|
+
|
|
2053
|
+
values.append(round(pred.item(), 3))
|
|
2054
|
+
return values
|
|
2055
|
+
|
|
2056
|
+
@staticmethod
|
|
2057
|
+
def ModelClassifyNodes(model, dataset):
|
|
2058
|
+
"""
|
|
2059
|
+
Predicts the classification of the node labels found in the input dataset using the input classifier.
|
|
2060
|
+
|
|
2061
|
+
Parameters
|
|
2062
|
+
----------
|
|
2063
|
+
model : Model
|
|
2064
|
+
The input model.
|
|
2065
|
+
dataset : DGLDataset
|
|
2066
|
+
The input DGL Dataset.
|
|
2067
|
+
|
|
2068
|
+
Returns
|
|
2069
|
+
-------
|
|
2070
|
+
dict
|
|
2071
|
+
A dictionary containing all the results. The keys in this dictionary are:
|
|
2072
|
+
- "alllabels"
|
|
2073
|
+
- "allpredictions"
|
|
2074
|
+
- "trainlabels"
|
|
2075
|
+
- "trainpredictions"
|
|
2076
|
+
- "validationlabels"
|
|
2077
|
+
- "validationpredictions"
|
|
2078
|
+
- "testlabels"
|
|
2079
|
+
- "testpredictions"
|
|
2080
|
+
|
|
2081
|
+
"""
|
|
2082
|
+
from topologicpy.Helper import Helper
|
|
2083
|
+
|
|
2084
|
+
# classifier, dataset = item
|
|
2085
|
+
allLabels = []
|
|
2086
|
+
allPredictions = []
|
|
2087
|
+
trainLabels = []
|
|
2088
|
+
trainPredictions = []
|
|
2089
|
+
valLabels = []
|
|
2090
|
+
valPredictions = []
|
|
2091
|
+
testLabels = []
|
|
2092
|
+
testPredictions = []
|
|
2093
|
+
|
|
2094
|
+
graphs = DGL.DatasetGraphs(dataset)
|
|
2095
|
+
for g in graphs:
|
|
2096
|
+
if not g.ndata:
|
|
2097
|
+
continue
|
|
2098
|
+
train_mask = g.ndata['train_mask']
|
|
2099
|
+
val_mask = g.ndata['val_mask']
|
|
2100
|
+
test_mask = g.ndata['test_mask']
|
|
2101
|
+
features = g.ndata['feat']
|
|
2102
|
+
labels = g.ndata['label']
|
|
2103
|
+
train_labels = labels[train_mask]
|
|
2104
|
+
val_labels = labels[val_mask]
|
|
2105
|
+
test_labels = labels[test_mask]
|
|
2106
|
+
allLabels.append(labels.tolist())
|
|
2107
|
+
trainLabels.append(train_labels.tolist())
|
|
2108
|
+
valLabels.append(val_labels.tolist())
|
|
2109
|
+
testLabels.append(test_labels.tolist())
|
|
2110
|
+
|
|
2111
|
+
# Forward
|
|
2112
|
+
logits = model(g, features)
|
|
2113
|
+
train_logits = logits[train_mask]
|
|
2114
|
+
val_logits = logits[val_mask]
|
|
2115
|
+
test_logits = logits[test_mask]
|
|
2116
|
+
|
|
2117
|
+
# Compute prediction
|
|
2118
|
+
predictions = logits.argmax(1)
|
|
2119
|
+
train_predictions = train_logits.argmax(1)
|
|
2120
|
+
val_predictions = val_logits.argmax(1)
|
|
2121
|
+
test_predictions = test_logits.argmax(1)
|
|
2122
|
+
allPredictions.append(predictions.tolist())
|
|
2123
|
+
trainPredictions.append(train_predictions.tolist())
|
|
2124
|
+
valPredictions.append(val_predictions.tolist())
|
|
2125
|
+
testPredictions.append(test_predictions.tolist())
|
|
2126
|
+
|
|
2127
|
+
return {
|
|
2128
|
+
"alllabels": Helper.Flatten(allLabels),
|
|
2129
|
+
"allpredictions" : Helper.Flatten(allPredictions),
|
|
2130
|
+
"trainlabels" : Helper.Flatten(trainLabels),
|
|
2131
|
+
"trainpredictions" : Helper.Flatten(trainPredictions),
|
|
2132
|
+
"validationlabels" : Helper.Flatten(valLabels),
|
|
2133
|
+
"validationpredictions" : Helper.Flatten(valPredictions),
|
|
2134
|
+
"testlabels" : Helper.Flatten(testLabels),
|
|
2135
|
+
"testpredictions" : Helper.Flatten(testPredictions)
|
|
2136
|
+
|
|
2137
|
+
}
|
|
2138
|
+
|
|
2139
|
+
@staticmethod
|
|
2140
|
+
def Show(data,
|
|
2141
|
+
labels,
|
|
2142
|
+
title="Training/Validation",
|
|
2143
|
+
xTitle="Epochs",
|
|
2144
|
+
xSpacing=1,
|
|
2145
|
+
yTitle="Accuracy and Loss",
|
|
2146
|
+
ySpacing=0.1,
|
|
2147
|
+
useMarkers=False,
|
|
2148
|
+
chartType="Line",
|
|
2149
|
+
width=950,
|
|
2150
|
+
height=500,
|
|
2151
|
+
backgroundColor='rgba(0,0,0,0)',
|
|
2152
|
+
gridColor='lightgray',
|
|
2153
|
+
marginLeft=0,
|
|
2154
|
+
marginRight=0,
|
|
2155
|
+
marginTop=40,
|
|
2156
|
+
marginBottom=0,
|
|
2157
|
+
renderer = "notebook"):
|
|
2158
|
+
"""
|
|
2159
|
+
Shows the data in a plotly graph.
|
|
2160
|
+
|
|
2161
|
+
Parameters
|
|
2162
|
+
----------
|
|
2163
|
+
data : list
|
|
2164
|
+
The data to display.
|
|
2165
|
+
labels : list
|
|
2166
|
+
The labels to use for the data.
|
|
2167
|
+
width : int , optional
|
|
2168
|
+
The desired width of the figure. The default is 950.
|
|
2169
|
+
height : int , optional
|
|
2170
|
+
The desired height of the figure. The default is 500.
|
|
2171
|
+
title : str , optional
|
|
2172
|
+
The chart title. The default is "Training and Testing Results".
|
|
2173
|
+
xTitle : str , optional
|
|
2174
|
+
The X-axis title. The default is "Epochs".
|
|
2175
|
+
xSpacing : float , optional
|
|
2176
|
+
The X-axis spacing. The default is 1.0.
|
|
2177
|
+
yTitle : str , optional
|
|
2178
|
+
The Y-axis title. The default is "Accuracy and Loss".
|
|
2179
|
+
ySpacing : float , optional
|
|
2180
|
+
The Y-axis spacing. The default is 0.1.
|
|
2181
|
+
useMarkers : bool , optional
|
|
2182
|
+
If set to True, markers will be displayed. The default is False.
|
|
2183
|
+
chartType : str , optional
|
|
2184
|
+
The desired type of chart. The options are "Line", "Bar", or "Scatter". It is case insensitive. The default is "Line".
|
|
2185
|
+
backgroundColor : str , optional
|
|
2186
|
+
The desired background color. This can be any plotly color string and may be specified as:
|
|
2187
|
+
- A hex string (e.g. '#ff0000')
|
|
2188
|
+
- An rgb/rgba string (e.g. 'rgb(255,0,0)')
|
|
2189
|
+
- An hsl/hsla string (e.g. 'hsl(0,100%,50%)')
|
|
2190
|
+
- An hsv/hsva string (e.g. 'hsv(0,100%,100%)')
|
|
2191
|
+
- A named CSS color.
|
|
2192
|
+
The default is 'rgba(0,0,0,0)' (transparent).
|
|
2193
|
+
gridColor : str , optional
|
|
2194
|
+
The desired grid color. This can be any plotly color string and may be specified as:
|
|
2195
|
+
- A hex string (e.g. '#ff0000')
|
|
2196
|
+
- An rgb/rgba string (e.g. 'rgb(255,0,0)')
|
|
2197
|
+
- An hsl/hsla string (e.g. 'hsl(0,100%,50%)')
|
|
2198
|
+
- An hsv/hsva string (e.g. 'hsv(0,100%,100%)')
|
|
2199
|
+
- A named CSS color.
|
|
2200
|
+
The default is 'lightgray'.
|
|
2201
|
+
marginLeft : int , optional
|
|
2202
|
+
The desired left margin in pixels. The default is 0.
|
|
2203
|
+
marginRight : int , optional
|
|
2204
|
+
The desired right margin in pixels. The default is 0.
|
|
2205
|
+
marginTop : int , optional
|
|
2206
|
+
The desired top margin in pixels. The default is 40.
|
|
2207
|
+
marginBottom : int , optional
|
|
2208
|
+
The desired bottom margin in pixels. The default is 0.
|
|
2209
|
+
renderer : str , optional
|
|
2210
|
+
The desired plotly renderer. The default is "notebook".
|
|
2211
|
+
|
|
2212
|
+
Returns
|
|
2213
|
+
-------
|
|
2214
|
+
None.
|
|
2215
|
+
|
|
2216
|
+
"""
|
|
2217
|
+
from topologicpy.Plotly import Plotly
|
|
2218
|
+
|
|
2219
|
+
dataFrame = Plotly.DataByDGL(data, labels)
|
|
2220
|
+
fig = Plotly.FigureByDataFrame(dataFrame,
|
|
2221
|
+
labels=labels,
|
|
2222
|
+
title=title,
|
|
2223
|
+
xTitle=xTitle,
|
|
2224
|
+
xSpacing=xSpacing,
|
|
2225
|
+
yTitle=yTitle,
|
|
2226
|
+
ySpacing=ySpacing,
|
|
2227
|
+
useMarkers=useMarkers,
|
|
2228
|
+
chartType=chartType,
|
|
2229
|
+
width=width,
|
|
2230
|
+
height=height,
|
|
2231
|
+
backgroundColor=backgroundColor,
|
|
2232
|
+
gridColor=gridColor,
|
|
2233
|
+
marginRight=marginRight,
|
|
2234
|
+
marginLeft=marginLeft,
|
|
2235
|
+
marginTop=marginTop,
|
|
2236
|
+
marginBottom=marginBottom
|
|
2237
|
+
)
|
|
2238
|
+
Plotly.Show(fig, renderer=renderer)
|
|
2239
|
+
|
|
2240
|
+
@staticmethod
|
|
2241
|
+
def Model(hparams, trainingDataset, validationDataset=None, testingDataset=None):
|
|
2242
|
+
"""
|
|
2243
|
+
Creates a neural network model (classifier or regressor) based on the input hyperparameters.
|
|
2244
|
+
|
|
2245
|
+
Parameters
|
|
2246
|
+
----------
|
|
2247
|
+
hparams : HParams
|
|
2248
|
+
The input hyperparameters
|
|
2249
|
+
trainingDataset : DGLDataset
|
|
2250
|
+
The input training dataset.
|
|
2251
|
+
validationDataset : DGLDataset
|
|
2252
|
+
The input validation dataset. If not specified, a portion of the trainingDataset will be used for validation according to the split list as specified in the hyper-parameters.
|
|
2253
|
+
testingDataset : DGLDataset
|
|
2254
|
+
The input testing dataset. If not specified, a portion of the trainingDataset will be used for testing according to the split list as specified in the hyper-parameters.
|
|
2255
|
+
|
|
2256
|
+
Returns
|
|
2257
|
+
-------
|
|
2258
|
+
Model
|
|
2259
|
+
The created model
|
|
2260
|
+
|
|
2261
|
+
"""
|
|
2262
|
+
|
|
2263
|
+
model = None
|
|
2264
|
+
if hparams.model_type.lower() == "classifier":
|
|
2265
|
+
if hparams.cv_type.lower() == "holdout":
|
|
2266
|
+
model = _ClassifierHoldout(hparams=hparams, trainingDataset=trainingDataset, validationDataset=validationDataset, testingDataset=testingDataset)
|
|
2267
|
+
elif hparams.cv_type.lower() == "k-fold" or hparams.cv_type.lower() == "kfold":
|
|
2268
|
+
model = _ClassifierKFold(hparams=hparams, trainingDataset=trainingDataset, testingDataset=testingDataset)
|
|
2269
|
+
elif hparams.model_type.lower() == "regressor":
|
|
2270
|
+
if hparams.cv_type.lower() == "holdout":
|
|
2271
|
+
model = _RegressorHoldout(hparams=hparams, trainingDataset=trainingDataset, validationDataset=validationDataset, testingDataset=testingDataset)
|
|
2272
|
+
elif hparams.cv_type.lower() == "k-fold" or hparams.cv_type.lower() == "kfold":
|
|
2273
|
+
model = _RegressorKFold(hparams=hparams, trainingDataset=trainingDataset, testingDataset=testingDataset)
|
|
2274
|
+
else:
|
|
2275
|
+
raise NotImplementedError
|
|
2276
|
+
return model
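# An end-to-end usage sketch (the dataset names are illustrative): create a model from the
# hyperparameters, train it, test it, then collect its metrics for reporting or export.
# model = DGL.Model(hparams, trainingDataset=train_ds, validationDataset=validate_ds, testingDataset=test_ds)
# model = DGL.ModelTrain(model)
# model = DGL.ModelTest(model)
# data = DGL.ModelData(model)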
|
|
2277
|
+
|
|
2278
|
+
@staticmethod
|
|
2279
|
+
def ModelTrain(model):
|
|
2280
|
+
"""
|
|
2281
|
+
Trains the neural network model.
|
|
2282
|
+
|
|
2283
|
+
Parameters
|
|
2284
|
+
----------
|
|
2285
|
+
model : Model
|
|
2286
|
+
The input model.
|
|
2287
|
+
|
|
2288
|
+
Returns
|
|
2289
|
+
-------
|
|
2290
|
+
Model
|
|
2291
|
+
The trained model
|
|
2292
|
+
|
|
2293
|
+
"""
|
|
2294
|
+
if not model:
|
|
2295
|
+
return None
|
|
2296
|
+
model.train()
|
|
2297
|
+
return model
|
|
2298
|
+
|
|
2299
|
+
@staticmethod
|
|
2300
|
+
def ModelTest(model):
|
|
2301
|
+
"""
|
|
2302
|
+
Tests the neural network model.
|
|
2303
|
+
|
|
2304
|
+
Parameters
|
|
2305
|
+
----------
|
|
2306
|
+
model : Model
|
|
2307
|
+
The input model.
|
|
2308
|
+
|
|
2309
|
+
Returns
|
|
2310
|
+
-------
|
|
2311
|
+
Model
|
|
2312
|
+
The tested model
|
|
2313
|
+
|
|
2314
|
+
"""
|
|
2315
|
+
if not model:
|
|
2316
|
+
return None
|
|
2317
|
+
model.test()
|
|
2318
|
+
return model
|
|
2319
|
+
|
|
2320
|
+
@staticmethod
|
|
2321
|
+
def ModelSave(model, path=None):
|
|
2322
|
+
"""
|
|
2323
|
+
Saves the model.
|
|
2324
|
+
|
|
2325
|
+
Parameters
|
|
2326
|
+
----------
|
|
2327
|
+
model : Model
|
|
2328
|
+
The input model.
|
|
2329
|
+
|
|
2330
|
+
Returns
|
|
2331
|
+
-------
|
|
2332
|
+
bool
|
|
2333
|
+
True if the model is saved correctly. False otherwise.
|
|
2334
|
+
|
|
2335
|
+
"""
|
|
2336
|
+
if not model:
|
|
2337
|
+
return None
|
|
2338
|
+
if path:
|
|
2339
|
+
# Make sure the file extension is .pt
|
|
2340
|
+
ext = path[len(path)-3:len(path)]
|
|
2341
|
+
if ext.lower() != ".pt":
|
|
2342
|
+
path = path+".pt"
|
|
2343
|
+
return model.save(path)
|
|
2344
|
+
|
|
2345
|
+
@staticmethod
|
|
2346
|
+
def ModelData(model):
|
|
2347
|
+
"""
|
|
2348
|
+
Returns the data of the model
|
|
2349
|
+
|
|
2350
|
+
Parameters
|
|
2351
|
+
----------
|
|
2352
|
+
model : Model
|
|
2353
|
+
The input model.
|
|
2354
|
+
|
|
2355
|
+
Returns
|
|
2356
|
+
-------
|
|
2357
|
+
dict
|
|
2358
|
+
A dictionary containing the model data.
|
|
2359
|
+
|
|
2360
|
+
"""
|
|
2361
|
+
from topologicpy.Helper import Helper
|
|
2362
|
+
|
|
2363
|
+
data = {'Model Type': [model.hparams.model_type],
|
|
2364
|
+
'Optimizer': [model.hparams.optimizer_str],
|
|
2365
|
+
'CV Type': [model.hparams.cv_type],
|
|
2366
|
+
'Split': model.hparams.split,
|
|
2367
|
+
'K-Folds': [model.hparams.k_folds],
|
|
2368
|
+
'HL Widths': model.hparams.hl_widths,
|
|
2369
|
+
'Conv Layer Type': [model.hparams.conv_layer_type],
|
|
2370
|
+
'Pooling': [model.hparams.pooling],
|
|
2371
|
+
'Learning Rate': [model.hparams.lr],
|
|
2372
|
+
'Batch Size': [model.hparams.batch_size],
|
|
2373
|
+
'Epochs': [model.hparams.epochs]
|
|
2374
|
+
}
|
|
2375
|
+
|
|
2376
|
+
if model.hparams.model_type.lower() == "classifier":
|
|
2377
|
+
testing_accuracy_list = [model.testing_accuracy] * model.hparams.epochs
|
|
2378
|
+
testing_loss_list = [model.testing_loss] * model.hparams.epochs
|
|
2379
|
+
metrics_data = {
|
|
2380
|
+
'Training Accuracy': [model.training_accuracy_list],
|
|
2381
|
+
'Validation Accuracy': [model.validation_accuracy_list],
|
|
2382
|
+
'Testing Accuracy' : [testing_accuracy_list],
|
|
2383
|
+
'Training Loss': [model.training_loss_list],
|
|
2384
|
+
'Validation Loss': [model.validation_loss_list],
|
|
2385
|
+
'Testing Loss' : [testing_loss_list]
|
|
2386
|
+
}
|
|
2387
|
+
if model.hparams.cv_type.lower() == "k-fold":
|
|
2388
|
+
accuracy_data = {
|
|
2389
|
+
'Accuracies' : [model.accuracies],
|
|
2390
|
+
'Max Accuracy' : [model.max_accuracy]
|
|
2391
|
+
}
|
|
2392
|
+
metrics_data.update(accuracy_data)
|
|
2393
|
+
data.update(metrics_data)
|
|
2394
|
+
|
|
2395
|
+
elif model.hparams.model_type.lower() == "regressor":
|
|
2396
|
+
testing_loss_list = [model.testing_loss] * model.hparams.epochs
|
|
2397
|
+
metrics_data = {
|
|
2398
|
+
'Training Loss': [model.training_loss_list],
|
|
2399
|
+
'Validation Loss': [model.validation_loss_list],
|
|
2400
|
+
'Testing Loss' : [testing_loss_list]
|
|
2401
|
+
}
|
|
2402
|
+
if model.hparams.cv_type.lower() == "k-fold":
|
|
2403
|
+
loss_data = {
|
|
2404
|
+
'Losses' : [model.losses],
|
|
2405
|
+
'Min Loss' : [model.min_loss]
|
|
2406
|
+
}
|
|
2407
|
+
metrics_data.update(loss_data)
|
|
2408
|
+
data.update(metrics_data)
|
|
2409
|
+
|
|
2410
|
+
return data
|
|
2411
|
+
|
|
2412
|
+
@staticmethod
|
|
2413
|
+
def GraphsByFilePath(path, labelKey="value", key='node_attr'):
|
|
2414
|
+
graphs, label_dict = load_graphs(path)
|
|
2415
|
+
labels = label_dict[labelKey].tolist()
|
|
2416
|
+
return {"graphs" : graphs, "labels": labels}
|
|
2417
|
+
|
|
2418
|
+
@staticmethod
|
|
2419
|
+
def DataExportToCSV(data, path, overwrite=True):
|
|
2420
|
+
"""
|
|
2421
|
+
Exports the input data to a CSV file
|
|
2422
|
+
|
|
2423
|
+
Parameters
|
|
2424
|
+
----------
|
|
2425
|
+
data : dict
|
|
2426
|
+
The input data. See ModelData(model).
|
|
2427
|
+
overwrite : bool , optional
|
|
2428
|
+
If set to True, previously saved results files are overwritten. Otherwise, the new results are appended to the previously saved files. The default is True.
|
|
2429
|
+
|
|
2430
|
+
Returns
|
|
2431
|
+
-------
|
|
2432
|
+
bool
|
|
2433
|
+
True if the data is saved correctly to a CSV file. False otherwise.
|
|
2434
|
+
|
|
2435
|
+
"""
|
|
2436
|
+
from topologicpy.Helper import Helper
|
|
2437
|
+
|
|
2438
|
+
# Make sure the file extension is .csv
|
|
2439
|
+
ext = path[len(path)-4:len(path)]
|
|
2440
|
+
if ext.lower() != ".csv":
|
|
2441
|
+
path = path+".csv"
|
|
2442
|
+
|
|
2443
|
+
epoch_list = list(range(1, data['Epochs'][0]+1))
|
|
2444
|
+
|
|
2445
|
+
d = [data['Model Type'], data['Optimizer'], data['CV Type'], [data['Split']], data['K-Folds'], data['HL Widths'], data['Conv Layer Type'], data['Pooling'], data['Learning Rate'], data['Batch Size'], epoch_list]
|
|
2446
|
+
columns = ['Model Type', 'Optimizer', 'CV Type', 'Split', 'K-Folds', 'HL Widths', 'Conv Layer Type', 'Pooling', 'Learning Rate', 'Batch Size', 'Epochs']
|
|
2447
|
+
|
|
2448
|
+
if data['Model Type'][0].lower() == "classifier":
|
|
2449
|
+
d.extend([data['Training Accuracy'][0], data['Validation Accuracy'][0], data['Testing Accuracy'][0], data['Training Loss'][0], data['Validation Loss'][0], data['Testing Loss'][0]])
|
|
2450
|
+
columns.extend(['Training Accuracy', 'Validation Accuracy', 'Testing Accuracy', 'Training Loss', 'Validation Loss', 'Testing Loss'])
|
|
2451
|
+
if data['CV Type'][0].lower() == "k-fold":
|
|
2452
|
+
d.extend([data['Accuracies'], data['Max Accuracy']])
|
|
2453
|
+
columns.extend(['Accuracies', 'Max Accuracy'])
|
|
2454
|
+
|
|
2455
|
+
elif data['Model Type'][0].lower() == "regressor":
|
|
2456
|
+
d.extend([data['Training Loss'][0], data['Validation Loss'][0], data['Testing Loss'][0]])
|
|
2457
|
+
columns.extend(['Training Loss', 'Validation Loss', 'Testing Loss'])
|
|
2458
|
+
if data['CV Type'][0].lower() == "k-fold":
|
|
2459
|
+
d.extend([data['Losses'], data['Min Loss']])
|
|
2460
|
+
columns.extend(['Losses', 'Min Loss'])
|
|
2461
|
+
|
|
2462
|
+
d = Helper.Iterate(d)
|
|
2463
|
+
d = Helper.Transpose(d)
|
|
2464
|
+
df = pd.DataFrame(d, columns=columns)
|
|
2465
|
+
|
|
2466
|
+
status = False
|
|
2467
|
+
if path:
|
|
2468
|
+
if overwrite:
|
|
2469
|
+
mode = 'w+'
|
|
2470
|
+
else:
|
|
2471
|
+
mode = 'a'
|
|
2472
|
+
try:
|
|
2473
|
+
df.to_csv(path, mode=mode, index = False, header=True)
|
|
2474
|
+
status = True
|
|
2475
|
+
except:
|
|
2476
|
+
status = False
|
|
2477
|
+
return status
|
|
2478
|
+
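Putting the two methods together, a hedged end-to-end sketch: gather the model data and write it to CSV (the 'model' object and output path are assumptions, not library fixtures):

# Illustrative sketch only: 'model' is assumed trained; the output path is arbitrary.
from topologicpy.DGL import DGL

data = DGL.ModelData(model)
ok = DGL.DataExportToCSV(data, "training_results", overwrite=True)  # ".csv" is appended automatically
print("Export succeeded:", ok)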
    '''
    @staticmethod
    def TrainRegressor(hparams, trainingDataset, validationDataset=None, testingDataset=None, overwrite=True):
        """
        Trains a neural network regressor.

        Parameters
        ----------
        hparams : HParams
            The input hyperparameters.
        trainingDataset : DGLDataset
            The input training dataset.
        validationDataset : DGLDataset
            The input validation dataset. If not specified, a portion of the trainingDataset will be used for validation according to the split list as specified in the hyper-parameters.
        testingDataset : DGLDataset
            The input testing dataset. If not specified, a portion of the trainingDataset will be used for testing according to the split list as specified in the hyper-parameters.
        overwrite : bool , optional
            If set to True, previous saved results files are overwritten. Otherwise, the new results are appended to the previously saved files. The default is True.

        Returns
        -------
        dict
            A dictionary containing all the results.

        """

        from topologicpy.Helper import Helper
        import time
        import datetime
        start = time.time()
        regressor = _RegressorHoldout(hparams, trainingDataset, validationDataset, testingDataset)
        regressor.train()
        accuracy = regressor.validate()

        end = time.time()
        duration = round(end - start, 3)
        utcnow = datetime.datetime.utcnow()
        timestamp_str = "UTC-"+str(utcnow.year)+"-"+str(utcnow.month)+"-"+str(utcnow.day)+"-"+str(utcnow.hour)+"-"+str(utcnow.minute)+"-"+str(utcnow.second)
        epoch_list = list(range(1, regressor.hparams.epochs+1))
        d2 = [[timestamp_str], [duration], [regressor.hparams.optimizer_str], [regressor.hparams.cv_type], [regressor.hparams.split], [regressor.hparams.k_folds], regressor.hparams.hl_widths, [regressor.hparams.conv_layer_type], [regressor.hparams.pooling], [regressor.hparams.lr], [regressor.hparams.batch_size], epoch_list, regressor.training_accuracy_list, regressor.validation_accuracy_list]
        d2 = Helper.Iterate(d2)
        d2 = Helper.Transpose(d2)

        data = {'TimeStamp': timestamp_str,
                'Duration': [duration],
                'Optimizer': [regressor.hparams.optimizer_str],
                'CV Type': [regressor.hparams.cv_type],
                'Split': [regressor.hparams.split],
                'K-Folds': [regressor.hparams.k_folds],
                'HL Widths': [regressor.hparams.hl_widths],
                'Conv Layer Type': [regressor.hparams.conv_layer_type],
                'Pooling': [regressor.hparams.pooling],
                'Learning Rate': [regressor.hparams.lr],
                'Batch Size': [regressor.hparams.batch_size],
                'Epochs': [regressor.hparams.epochs],
                'Training Accuracy': [regressor.training_accuracy_list],
                'Validation Accuracy': [regressor.validation_accuracy_list]
                }

        df = pd.DataFrame(d2, columns=['TimeStamp', 'Duration', 'Optimizer', 'CV Type', 'Split', 'K-Folds', 'HL Widths', 'Conv Layer Type', 'Pooling', 'Learning Rate', 'Batch Size', 'Epochs', 'Training Accuracy', 'Validation Accuracy'])
        if regressor.hparams.results_path:
            if overwrite:
                df.to_csv(regressor.hparams.results_path, mode='w+', index=False, header=True)
            else:
                df.to_csv(regressor.hparams.results_path, mode='a', index=False, header=False)
        return data
    '''
    @staticmethod
    def _TrainClassifier_NC(graphs, model, hparams):
        """
        Parameters
        ----------
        graphs : list
            The input list of graphs.
        model : GCN Model
            The input classifier model.
        hparams : HParams
            The input hyper-parameters.

        Returns
        -------
        list
            The list of trained model and predictions.

        """
        # Default optimizer
        optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
        if hparams.optimizer_str.lower() == "adadelta":
            optimizer = torch.optim.Adadelta(model.parameters(), eps=hparams.eps,
                                             lr=hparams.lr, rho=hparams.rho, weight_decay=hparams.weight_decay)
        elif hparams.optimizer_str.lower() == "adagrad":
            optimizer = torch.optim.Adagrad(model.parameters(), eps=hparams.eps,
                                            lr=hparams.lr, lr_decay=hparams.lr_decay, weight_decay=hparams.weight_decay)
        elif hparams.optimizer_str.lower() == "adam":
            optimizer = torch.optim.Adam(model.parameters(), amsgrad=hparams.amsgrad, betas=hparams.betas, eps=hparams.eps,
                                         lr=hparams.lr, maximize=hparams.maximize, weight_decay=hparams.weight_decay)

        for e in range(hparams.epochs):
            best_val_acc = 0
            best_test_acc = 0
            for i in range(len(graphs)):
                g = graphs[i]
                if not g.ndata:
                    continue
                features = g.ndata['feat']
                labels = g.ndata['label']
                train_mask = g.ndata['train_mask']
                val_mask = g.ndata['val_mask']
                test_mask = g.ndata['test_mask']

                # Forward
                logits = model(g, features)

                # Compute prediction
                pred = logits.argmax(1)

                # Compute loss. Note that the loss is computed only over the nodes in the training set.
                if hparams.loss_function.lower() == "negative log likelihood":
                    logp = F.log_softmax(logits[train_mask], 1)
                    loss = F.nll_loss(logp, labels[train_mask])
                elif hparams.loss_function.lower() == "cross entropy":
                    loss = F.cross_entropy(logits[train_mask], labels[train_mask])

                # Compute accuracy on the training/validation/test sets
                train_acc = (pred[train_mask] == labels[train_mask]).float().mean()
                val_acc = (pred[val_mask] == labels[val_mask]).float().mean()
                test_acc = (pred[test_mask] == labels[test_mask]).float().mean()

                # Save the best validation accuracy and the corresponding test accuracy.
                if val_acc > best_val_acc:
                    best_val_acc = val_acc
                if test_acc > best_test_acc:
                    best_test_acc = test_acc

                # Backward
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                if e % 1 == 0:
                    print('In epoch {}, loss: {:.3f}, val acc: {:.3f} (best {:.3f}), test acc: {:.3f} (best {:.3f})'.format(
                        e, loss, val_acc, best_val_acc, test_acc, best_test_acc))
        return [model, pred]
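The optimizer selection above keys off a plain string carried in the hyper-parameters. A self-contained sketch of the same dispatch pattern, using a toy PyTorch model and fixed default arguments rather than the library's HParams object:

# Stand-alone illustration of string-keyed optimizer dispatch; the toy model and fixed lr are assumptions.
import torch

toy_model = torch.nn.Linear(4, 2)
optimizer_str = "adagrad"
if optimizer_str.lower() == "adadelta":
    optimizer = torch.optim.Adadelta(toy_model.parameters(), lr=0.01)
elif optimizer_str.lower() == "adagrad":
    optimizer = torch.optim.Adagrad(toy_model.parameters(), lr=0.01)
else:
    # Fall back to Adam, mirroring the default above
    optimizer = torch.optim.Adam(toy_model.parameters(), lr=0.01)
print(type(optimizer).__name__)   # -> Adagrad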
    @staticmethod
    def TrainNodeClassifier(hparams, dataset, numLabels, sample):
        """
        Trains a node classifier on the graphs found in the input dataset.

        Parameters
        ----------
        hparams : HParams
            The input hyper-parameters.
        dataset : DGLDataset
            The input dataset. Its graphs are expected to carry 'feat', 'label', and train/val/test masks in their node data.
        numLabels : int
            The number of node labels (classes).
        sample : int
            The number of graphs to randomly sample from the dataset. If 0 or larger than the dataset, all graphs are used.

        Returns
        -------
        final_model
            The trained node-classification model, or None if the dataset contains no graphs.

        """

        # hparams, dataset, numLabels, sample = item
        graphs = DGL.DatasetGraphs(dataset)
        # Sample a random list from the graphs
        if sample < len(graphs) and sample > 0:
            graphs = random.sample(graphs, sample)
        # Pick one graph to size the input layer of the model
        if len(graphs) == 1:
            i = 0
        elif len(graphs) > 1:
            i = random.randrange(0, len(graphs)-1)
        else: # There are no graphs in the dataset, return None
            return None
        model = _Classic(graphs[i].ndata['feat'].shape[1], hparams.hl_widths, numLabels)
        final_model, predictions = DGL._TrainNodeClassifier(graphs, model, hparams)
        # Save the entire model
        if hparams.checkpoint_path is not None:
            torch.save(final_model, hparams.checkpoint_path)
        return final_model
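Finally, a hedged usage sketch for TrainNodeClassifier; the hyper-parameter and dataset objects referenced in the comments are assumed to be prepared elsewhere with this module's helpers and are not confirmed API names:

# Illustrative sketch only: 'hparams' and 'dataset' are assumed to be prepared beforehand;
# no specific constructor names are implied.
from topologicpy.DGL import DGL

# hparams = ...   # an HParams object with hl_widths, epochs, optimizer_str, checkpoint_path, etc.
# dataset = ...   # a DGLDataset whose graphs carry 'feat', 'label', 'train_mask', 'val_mask', 'test_mask'
trained = DGL.TrainNodeClassifier(hparams, dataset, numLabels=4, sample=0)   # sample=0 keeps all graphs
if trained is not None and hparams.checkpoint_path is not None:
    print("Trained model saved to", hparams.checkpoint_path)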