pyxla 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyxla/__about__.py +1 -0
- pyxla/__init__.py +1380 -0
- pyxla/sampling.py +508 -0
- pyxla/util.py +573 -0
- pyxla-0.0.1.dist-info/METADATA +63 -0
- pyxla-0.0.1.dist-info/RECORD +7 -0
- pyxla-0.0.1.dist-info/WHEEL +4 -0
pyxla/__init__.py
ADDED
|
@@ -0,0 +1,1380 @@
|
|
|
1
|
+
|
|
2
|
+
"""The core functions of the library are defined here."""
|
|
3
|
+
|
|
4
|
+
import matplotlib.axes
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
import seaborn as sns
|
|
8
|
+
import matplotlib.pyplot as plt
|
|
9
|
+
import statistics
|
|
10
|
+
import scipy
|
|
11
|
+
from scipy.stats import spearmanr
|
|
12
|
+
from scipy.spatial.distance import pdist
|
|
13
|
+
from typing import Tuple
|
|
14
|
+
import matplotlib
|
|
15
|
+
import math
|
|
16
|
+
from typing import Callable, Union, List, Iterable
|
|
17
|
+
from tqdm.auto import tqdm
|
|
18
|
+
import logging
|
|
19
|
+
import itertools
|
|
20
|
+
from sklearn.linear_model import Ridge, LassoLars
|
|
21
|
+
from sklearn.model_selection import train_test_split
|
|
22
|
+
from sklearn.inspection import permutation_importance
|
|
23
|
+
|
|
24
|
+
from . import util
|
|
25
|
+
|
|
26
|
+
logging.basicConfig(level=logging.INFO)
|
|
27
|
+
|
|
28
|
+
# allow importing `load_data` immediately from `pyxla`
|
|
29
|
+
from .util import load_data
|
|
30
|
+
|
|
31
|
+
# Definition of the sample template: the canonical set of keys every sample
# dict carries. Loaders fill these in; absent inputs stay None/0.
# (the data files should include the global optima if known)
sample = {'name': None,     # identifier of the sample
          'size': 0,        # number of solutions in the sample
          # variable space (solutions)
          'X': None,        # dataframe
          #'Xd': 0, # int
          'Xcsv': None,     # str: path of the X csv file
          # objective space (fitness values)
          'F': None,        # dataframe of objective values
          #'Fd': 0,
          'Fcsv': None,     # str: path of the F csv file
          'numF': 0,        # number of objectives
          'max': False,     # we minimize by default
          # violation space (constraint violations)
          'V': None,        # dataframe of violation values (+ 'feasible' column)
          #'Vd': 0,
          'Vcsv': None,     # str: path of the V csv file
          'numV': 0,        # number of constraints
          # neighborhood
          'N': None,        # dataframe of neighbour pairs
          'Ncsv': None,     # str: path of the N csv file
          # distance (precomputed pairwise distances)
          'D': None,        # dataframe indexed by solution-index pairs
          'Dcsv': None,     # str: path of the D csv file
          'representation': 'continuous', # by default
          'd_metric_func': None, # distance metric function of the form dist(X1, X2) -> d
          'neighbourhood_func': None, # of the form f(X1, X2) -> bool
          'p': 2 # Euclidean distance by default
          }
|
|
61
|
+
|
|
62
|
+
def descriptive_stats(data, name):
|
|
63
|
+
result = dict()
|
|
64
|
+
result[str(name + '_min')] = min(data)
|
|
65
|
+
result[str(name + '_max')] = max(data)
|
|
66
|
+
result[str(name + '_mean')] = statistics.mean(data)
|
|
67
|
+
result[str(name + '_med')] = statistics.median(data)
|
|
68
|
+
result[str(name + '_q1')] = statistics.quantiles(data, n = 4)[0]
|
|
69
|
+
result[str(name + '_q3')] = statistics.quantiles(data, n = 4)[2]
|
|
70
|
+
result[str(name + '_sd')] = statistics.stdev(data)
|
|
71
|
+
result[str(name + '_skew')] = scipy.stats.skew(data).item()
|
|
72
|
+
result[str(name + '_kurt')] = scipy.stats.kurtosis(data).item()
|
|
73
|
+
return result
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def distr_f(sample: dict, bins: Union[int, str] = 'auto') -> Tuple[dict, matplotlib.figure.Figure]:
    """
    **Distribution of objectives (fitness values)**

    The ``distr_f`` feature visualises the spread of objective values and computes
    some descriptive statistics of the objective values. Alongside a histogram of
    objective values and a histogram of their ranking, various descriptive statistics
    are computed, including: minimum, maximum, mean, median, quartiles (Q1, Q3),
    standard deviation, skewness, and kurtosis. Ranks are computed with the
    `Link pandas https://pandas.pydata.org` package using the 'min' method,
    which assigns to a group of equally valued solutions the least rank in the
    group. NOTE(review): an earlier description called this "dense" ranking
    (ranks increasing by 1 from group to group), which would be pandas'
    'dense' method instead — confirm which is intended.

    Parameters
    ----------
    sample : dict
        A sample containing the various input files i.e `F`, `V`.
    bins : int or str, optional
        Number of histogram bins for the objective-value histograms, or
        'auto' (the default) to let seaborn choose.

    Returns
    -------
    dict
        Descriptive statistics of objective values.
    matplotlib.figure.Figure
        Histograms of objective values and or ranks of objective values.

    Examples
    --------
    >>> from pyxla import util, distr_f
    >>> import matplotlib
    >>> sample = util.load_sample('cec2010_c01_2d_F1_V2', test=True)
    >>> feat, plot = distr_f(sample)
    >>> type(feat)
    <class 'dict'>
    >>> isinstance(plot, matplotlib.figure.Figure)
    True

    """
    F = sample['F']
    numF = sample['numF']
    R = pd.DataFrame()

    # rank each objective; invert the ordering for maximisation samples so
    # that rank 1 is always the best solution
    for col in F:
        R[f"{col}_rank"] = F[col].rank(ascending = not sample['max'], method = 'min').astype(int)

    # descriptive statistics per objective and per rank column
    # (R's columns are built from F's in the same order, so index i matches)
    feat = dict()
    for i in range(0, len(F.columns)):
        feat.update(descriptive_stats(F.iloc[:,i], F.columns[i]))
        feat.update(descriptive_stats(R.iloc[:,i], R.columns[i]))

    # plots: one column per objective plus, for multi-objective samples,
    # an extra column blending all objectives in one histogram
    ncols = numF + (numF > 1)
    nrows = 2
    fig, axs = plt.subplots(ncols=ncols, nrows=nrows, figsize=(3 * ncols, 3 * nrows))
    palette = sns.color_palette()[-numF:] # pick last n colors in palette

    # with a single objective, plt.subplots returns a 1-D array of axes
    ax = lambda i, j: axs[i, j] if numF > 1 else axs[i]
    for i, col in enumerate(F):
        sns.histplot(F[col], ax=ax(0, i), color=palette[i], bins=bins)
        sns.histplot(R[f"{col}_rank"], ax=ax(1, i), color=palette[i])

    # blend plots
    if numF > 1:
        sns.histplot(F, ax=ax(0, numF), palette=palette)
        sns.histplot(R, ax=ax(1, numF), palette=palette)

    fig.suptitle('Distribution of objective values/ranks')
    plt.tight_layout()

    return feat, fig
|
|
145
|
+
|
|
146
|
+
def distr_v(sample: dict) -> Tuple[dict, matplotlib.figure.Figure]:
    """
    **Distribution of violation values**

    The ``distr_v`` feature visualises the spread of violation values and
    computes some descriptive statistics of the violation values.
    Alongside a histogram of violation values and a histogram of their
    ranking, descriptive statistics are computed, including:
    minimum, maximum, mean, median, quartiles (Q1, Q3), standard deviation,
    skewness, and kurtosis. Additionally, the feasibility rate is computed
    per violation, and the overall feasibility, taking all constraints into
    account, is computed. Feasibility rate refers to the proportion of solutions
    that are feasible with respect to a constraint.

    Parameters
    ----------
    sample : dict
        A sample containing the various input files i.e `F`, `V`.

    Returns
    -------
    dict
        Descriptive statistics of violation values.
    matplotlib.figure.Figure
        Histograms of violation values and ranks of violation values.

    Raises
    ------
    ValueError
        If the sample has no V input.
    """
    V = sample['V']
    if V is None: raise ValueError('V is absent in the sample. Please provide input V first.')

    # violation columns only, without the boolean 'feasible' column
    V_ = V.drop('feasible', axis=1)
    numV = sample['numV']

    R = pd.DataFrame()

    # rank each constraint's violations; smaller violation = better rank
    for col in V_:
        R[f"{col}_rank"] = V[col].rank(ascending = True, method = 'min').astype(int)

    # features
    feat = dict()
    for col in V_:
        # FIX: index by column name rather than `V.iloc[:, i]` — positional
        # indexing was only correct while 'feasible' happened to be V's last
        # column.
        feat.update(descriptive_stats(V_[col], col))
        # feasibility rate per constraint (violation == 0 means feasible)
        feat[f'{col}_feas_rate'] = (V_[col] == 0).mean().item()
    feat['overall_feas_rate'] = V['feasible'].mean().item()

    # plots: one column per constraint plus, for multi-constraint samples,
    # an extra column blending all constraints in one histogram
    ncols = numV + (numV > 1)
    nrows = 2
    fig, axs = plt.subplots(ncols=ncols, nrows=nrows, figsize=(3 * ncols, 3 * nrows))
    palette = sns.color_palette()[-numV:] # pick last n colors in palette

    # with a single constraint, plt.subplots returns a 1-D array of axes
    ax = lambda i, j: axs[i, j] if numV > 1 else axs[i]

    for i, col in enumerate(V_):
        sns.histplot(V[col], ax=ax(0, i), color=palette[i])
        sns.histplot(R[f"{col}_rank"], ax=ax(1, i), color=palette[i])

    # blend plots
    if numV > 1:
        sns.histplot(V_, ax=ax(0, numV), palette=palette)
        sns.histplot(R, ax=ax(1, numV), palette=palette)

    fig.suptitle('Distribution of violation values/ranks')
    plt.tight_layout()

    return feat, fig
|
|
214
|
+
|
|
215
|
+
def distr_Par(sample: dict) -> Tuple[dict, matplotlib.figure.Figure]:
    """Distribution of Pareto ranks.

    Plots a histogram of each Pareto-rank column found in the sample's rank
    table `R` (columns whose name contains 'pareto') and computes the usual
    descriptive statistics for each.

    Parameters
    ----------
    sample : dict
        A sample whose rank table `R` contains one or more 'pareto' columns.

    Returns
    -------
    dict
        Descriptive statistics of the Pareto ranks.
    matplotlib.figure.Figure
        Histograms of the Pareto ranks, one axis per rank column.

    Raises
    ------
    Exception
        If the rank table has no Pareto-rank columns.
    """
    # select only the Pareto-rank columns of the rank table
    Par = sample['R'].filter(like='pareto')
    numPar = len(Par.columns)
    if not numPar: raise Exception('Pareto rank is not possible for this sample.')

    feat = dict()

    for i in range(0, numPar):
        feat.update(descriptive_stats(Par.iloc[:,i], Par.columns[i]))

    # plots: one histogram per Pareto-rank column
    ncols = numPar
    fig, axs = plt.subplots(ncols=ncols, figsize=(3 * ncols, 3))
    palette = sns.color_palette()[-numPar:] # pick last n colors in palette

    # with a single column, plt.subplots returns a bare Axes, not an array
    ax = lambda i : axs[i] if numPar > 1 else axs

    for i, col in enumerate(Par):
        sns.histplot(Par[col], ax=ax(i), color=palette[i])

    fig.suptitle('Distribution of Pareto ranks')
    plt.tight_layout()

    return feat, fig
|
|
251
|
+
|
|
252
|
+
def distr_Deb(sample: dict) -> Tuple[dict, sns.axisgrid.FacetGrid]:
    """Distribution of Deb's feasibility-rule ranks.

    Plots a histogram of the 'Deb' column of the sample's rank table and
    computes the usual descriptive statistics of those ranks.

    Parameters
    ----------
    sample : dict
        A sample whose rank table `R` contains a 'Deb' column.

    Returns
    -------
    dict
        Descriptive statistics of the Deb ranks.
    sns.axisgrid.FacetGrid
        Histogram of the Deb ranks.

    Raises
    ------
    Exception
        If the rank table has no 'Deb' column.
    """
    rank_table = sample['R']
    if 'Deb' not in rank_table:
        raise Exception("Deb's feasibility rule ranking is not possible for this sample.")
    ranks = rank_table['Deb']
    grid = sns.displot(ranks)
    grid.set(title=f"Deb\'s feasibility rule ranking distribution {sample['name']}")
    feat = dict()
    feat.update(descriptive_stats(ranks, 'Deb'))
    return feat, grid
|
|
273
|
+
|
|
274
|
+
def annotate_with_corr_coefs(x, y, label=None, color=None, **kwargs) -> None:
    """Annotate the current axes with Spearman's correlation of *x* vs *y*.

    Intended as a callback for seaborn's ``PairGrid.map_upper``. The
    coefficient is also recorded into the dict passed as ``kwargs['feat']``,
    keyed by the pair of series names (and by feasibility when `label` is
    given).

    Parameters
    ----------
    x, y : pandas.Series
        The two sets of values to correlate.
    label : optional
        Feasibility flag supplied by seaborn's hue mechanism; `None` when
        the sample has no V file.
    color : optional
        Annotation text color, forwarded by seaborn.
    **kwargs
        Must contain 'feat', a dict into which the coefficient is written.
    """
    ax = plt.gca()
    cor, _ = spearmanr(x, y)
    # Spearman's rho is NaN for constant input; report it as 'undefined'
    cor = 'undefined' if math.isnan(cor) else f'{cor:.2f}'
    # if sample has V file
    if label is not None:  # FIX: identity check instead of `!= None`
        feasibility = 'feasible' if label else 'infeasible'
        kwargs['feat'].update({f"{x.name}_{y.name} ({feasibility})": cor})
        # stack the two hue annotations vertically so they do not overlap
        pos = (0.5, 0.5) if label else (0.5, 0.25)
        ax.annotate(f'corr = {cor} ({feasibility})', xy = pos, xycoords='axes fraction', ha = 'center', color = color)
    # else if no V file
    else:
        kwargs['feat'].update({f"{x.name}_{y.name}": cor})
        pos = (0.5, 0.5)
        ax.annotate(f'corr = {cor}', xy = pos, xycoords='axes fraction', ha = 'center', color = color)
    ax.set_axis_off()
|
|
290
|
+
|
|
291
|
+
def corr(sample: dict) -> Tuple[dict, sns.PairGrid]:
    """Correlation of values.

    Builds a pair grid of all objectives (and violations, when a V input is
    present) against each other: histograms on the diagonal, scatter plots
    with regression lines below it, and Spearman's correlation coefficients
    above it. When V is present, solutions are coloured by feasibility.

    Parameters
    ----------
    sample : dict
        A sample containing the various input files i.e `F`, `V`.

    Returns
    -------
    sns.PairGrid
        Grid of scatter plots of all objectives and violations against
        each other, with feasibility indicated per solution.
    feat: dict
        Spearman's correlation coefficients for all pairs of sets of
        values split by feasibility
    """
    if util.present(sample, 'V'):
        # join F and V on the solution index so feasibility can drive the hue
        merged = pd.merge(sample['F'], sample['V'], left_index=True, right_index=True)
        grid = sns.PairGrid(merged, hue = 'feasible', hue_order=[True, False])
    else:
        if sample['numF'] == 1:
            logging.warning('The sample has a single objective with no constraint, therefore this feature is meaningless.')
        grid = sns.PairGrid(sample['F'])

    feat = {}
    # diagonal: marginal distributions
    grid.map_diag(sns.histplot)
    # lower triangle: scatter plus regression line
    grid.map_lower(sns.scatterplot, alpha = 0.5)
    grid.map_lower(sns.regplot, scatter = False)
    # upper triangle: numeric coefficients, collected into feat as a side effect
    grid.map_upper(annotate_with_corr_coefs, feat=feat)
    grid.add_legend()

    violation_txt = ' and violations' if util.present(sample, 'V') else ''
    grid.figure.suptitle(f"Correlation of objectives{violation_txt}")
    return feat, grid
|
|
330
|
+
|
|
331
|
+
def corr_ranks(sample: dict) -> Tuple[dict, sns.PairGrid]:
    """Correlation of ranks.

    Builds a pair grid of all rank columns (objective, violation, Pareto and
    Deb's feasibility-rule ranks) against each other: histograms on the
    diagonal, scatter plots with regression lines below it, and Spearman's
    correlation coefficients above it. When V is present, solutions are
    coloured by feasibility.

    Parameters
    ----------
    sample : dict
        A sample containing the various input files i.e `F`, `V`

    Returns
    -------
    sns.PairGrid
        Grid of scatter plots of all objective, violation, Pareto and Deb's
        ranks against each other, with feasibility indicated per solution
    feat: dict
        Spearman's correlation coefficients for all pairs of sets of ranks
        split by feasibility
    """
    if util.present(sample, 'V'):
        g = sns.PairGrid(sample['R'], hue='feasible', hue_order=[True, False])
    else:
        g = sns.PairGrid(sample['R'])
        if sample['numF'] == 1: logging.warning('The sample has a single objective with no constraint, therefore this feature is meaningless.')

    # diagonal
    # FIX: map_diag was called twice; the duplicate call drew every diagonal
    # histogram twice.
    g.map_diag(sns.histplot)
    # lower triangle
    g.map_lower(sns.scatterplot, alpha=0.5)
    g.map_lower(sns.regplot, scatter=False)
    # upper triangle
    feat = {}
    g.map_upper(annotate_with_corr_coefs, feat=feat)
    # legend
    g.add_legend()
    g.figure.suptitle('Correlation of ranks')
    g.figure.tight_layout()
    return feat, g
|
|
370
|
+
|
|
371
|
+
def pw_dist(sample: dict, id_a: int, id_b: int, metric=None) -> float:
    """Calculates the distance between 2 solutions given their indices.

    If a precomputed distance table `D` is present in the sample, the
    distance is looked up there (D is indexed by ordered index pairs, with
    the smaller index first). Otherwise the distance is computed from the
    solution matrix `X` with ``scipy.spatial.distance.pdist``.

    Parameters
    ----------
    sample : dict
        A sample containing the various input files i.e `F`, `V`.
    id_a : int
        Index of solution `a`
    id_b : int
        Index of solution `b`
    metric : str or Callable, optional
        Distance metric forwarded to ``pdist``; when omitted, defaults to
        'euclidean' for a continuous representation and 'hamming' otherwise.

    Returns
    -------
    float
        Distance between solution `a` and `b` as a float

    Raises
    ------
    ValueError
        If neither a D table nor an X table is available.
    """
    if id_a == id_b:
        return 0
    else:
        d = None
        D = sample['D']
        # check if D file is provided
        if isinstance(D, pd.DataFrame):
            # D stores each unordered pair once, keyed (low, high)
            if id_a < id_b: d = D.loc[(id_a, id_b), 'd']
            else: d = D.loc[(id_b, id_a), 'd']
        # if D not provided calculate d's
        else:
            # if no metric specified use default metrics
            if not metric:
                metric = 'euclidean' if sample['representation'] == 'continuous' else 'hamming'

            # FIX: the sample template always defines the key 'X' (possibly
            # None), so `'X' not in sample` could never trigger; test the
            # value instead. Error message also repaired.
            if sample.get('X') is None: raise ValueError('Please provide an X input file.')
            X = sample['X']
            a = X.iloc[id_a].to_numpy()
            b = X.iloc[id_b].to_numpy()
            # d = np.linalg.norm(a - b, ord = sample['p'])
            # FIX: pdist returns a 1-element array; unwrap it so the declared
            # float return type actually holds.
            d = pdist([a, b], metric=metric).item()
        return d
|
|
410
|
+
|
|
411
|
+
def fdc(sample: dict, compute_D_file: bool = True) -> Tuple[dict, matplotlib.axes.Axes]:
    """Computes objective-distance correlation.

    For each objective, every solution's distance to its nearest rank-1
    (best) solution is computed, and the Spearman correlation between the
    objective values and these distances is reported, together with a
    scatter plot per objective.

    Parameters
    ----------
    sample : dict
        A sample containing the various input files i.e `F`, `V`.
    compute_D_file : bool, optional
        By default `True`; when there is no D file in the sample, if
        `compute_D_file` is set to `True`, the whole D file is
        calculated. Calculating the whole D file will eliminate redundant
        distance calculations in the future, but it can be time consuming.
        To speed up calculation of `fdc`, set `compute_D_file` to
        `False` so that only the required distances are calculated.

    Returns
    -------
    corr : dict
        Dictionary containing Spearman's correlation coefficients
        objective-distance correlation per objective.
    fig : matplotlib.axes.Axes
        `matplotlib` axes containing scatter plots of objective
        values against distance to the nearest best solution in
        sample per objective, for all solutions or for feasible
        solutions only.

    Raises
    ------
    Exception
        Raises an exception if both D and X inputs are absent. One of D
        or X is needed to compute distances between solution.
    """
    # ensure distances are obtainable (may compute the full D table)
    util.handle_missing_D_file(sample, compute_D_file)

    sample['FDC'] = pd.DataFrame()
    FDC = sample['FDC']
    F = sample['F']
    R = sample['R']
    corr = {}

    fig, axs = plt.subplots(ncols=len(F.columns), figsize=(5 * len(F.columns), 5))
    for i, col in enumerate(F.columns):
        FDC[col] = F[col]
        FDC['distance'] = 0.0
        # pick F with rank 1,
        bestF = R.query(f'{col} == 1')
        # for rank 1 solution set distance to 0
        FDC.loc[bestF.index, 'distance'] = 0.0
        # compute distance only for non-rank-1 solutions
        for t_idx in FDC[~FDC.index.isin(bestF.index)].index:
            # start from the first best solution, then...
            d_nearest_best_f = pw_dist(sample, t_idx, bestF.index[0])
            # check for the nearest among all bestFs
            for b_idx in bestF[1:].index:
                d = pw_dist(sample, t_idx, b_idx)
                if d < d_nearest_best_f: d_nearest_best_f = d
            FDC.loc[t_idx, 'distance'] = d_nearest_best_f

        r, _ = scipy.stats.spearmanr(FDC[col], FDC['distance'])
        corr.update({f"fdc_{col}": r})

        # with a single objective, plt.subplots returns a bare Axes
        ax = axs[i] if len(F.columns) > 1 else axs
        sns.scatterplot(data = FDC, x = 'distance', y = col, ax = ax)
        sns.regplot(data = FDC, x = 'distance', y = col, ax = ax, scatter=False)
        ax.set_title(f'FDC (corr = {r:.2f})')

    fig.tight_layout()

    return corr, fig
|
|
481
|
+
|
|
482
|
+
def vdc(sample: dict, compute_D_file: bool = True) -> Tuple[dict, matplotlib.axes.Axes]:
    """Calculates violation-distance correlation.

    This function calculates the violation distance
    correlation: the correlation between violation values and distance
    to the nearest feasible solution.

    Parameters
    ----------
    sample : dict
        A sample containing at least the input files V and D.
    compute_D_file : bool, optional
        By default `True`; when there is no D file in the sample, if
        `compute_D_file` is set to `True`, the whole D file is
        calculated. Calculating the whole D file will eliminate redundant
        distance calculations in the future, but it can be time consuming.
        To speed up calculation of `vdc`, set `compute_D_file` to
        `False` so that only the required distances are calculated.

    Returns
    -------
    corr : dict
        Dictionary containing correlation between constraints and
        distance to the nearest feasible solution.
    fig : matplotlib.axes.Axes
        `matplotlib` axes containing a scatter plot of violation
        values against distance to the nearest feasible solution.

    Raises
    ------
    Exception
        If V is absent, or if the sample contains no feasible solution
        (VDC is undefined in that case).

    Examples
    --------
    >>> from pyxla import util, vdc
    >>> import matplotlib
    >>> sample = util.load_sample('cec2010_c01_2d_F1_V2', test=True)
    >>> corr, plot = vdc(sample)
    >>> type(corr)
    <class 'dict'>
    >>> isinstance(plot, matplotlib.figure.Figure)
    True
    """
    if not util.present(sample, 'V'): raise Exception('V is required. Please provide V.')

    # ensure distances are obtainable (may compute the full D table)
    util.handle_missing_D_file(sample, compute_D_file)

    V = sample['V']

    if V[V['feasible'] == True].empty: raise Exception('VDC is undefined as there is no feasible solution.')

    corr = {}
    fig, axs = plt.subplots(ncols=len(V.columns[:-1]), figsize=(5 * len(V.columns[:-1]), 5))

    for i, col in tqdm(enumerate(V.columns[:-1]), total=sample['numV']): # exclude col 'feasible'
        sample[f'VDC_{col}'] = pd.DataFrame()
        VDC = sample[f'VDC_{col}']
        # separate feasible and infeasible solutions.
        feas_v, infeas_v = V.query(f'{col} == 0'), V.query(f'{col} != 0')

        VDC[col] = infeas_v[col]
        # for each infeasible solution:
        for infeas_idx in infeas_v.index:
            # start from the first feasible solution, then...
            d_nearest_feas = pw_dist(sample, infeas_idx, feas_v.index[0])
            # compute dist to all feas. solutions
            for feas_idx in feas_v.iloc[1:].index:
                d = pw_dist(sample, infeas_idx, feas_idx)
                if d < d_nearest_feas:
                    # look for the nearest feas. solution
                    d_nearest_feas = d
            VDC.loc[infeas_idx, 'distance'] = d_nearest_feas
        # no existing infeasible solutions
        if infeas_v.empty: VDC['distance'] = 0

        r, _ = scipy.stats.spearmanr(VDC[col], VDC['distance'])
        corr.update({f"vdc_{col}": r})

        # with a single constraint, plt.subplots returns a bare Axes
        ax = axs[i] if len(V.columns[:-1]) > 1 else axs
        # use uniform color for infeasible solutions
        color = sns.color_palette()[1]
        sns.scatterplot(data = sample[f'VDC_{col}'], x = 'distance', y = col, ax = ax, color=color)
        ax.set_title(f'VDC (corr = {r:.2f})')

    return corr, fig
|
|
562
|
+
|
|
563
|
+
def rdc(sample: dict, compute_D_file: bool = True) -> Tuple[dict, matplotlib.axes.Axes]:
    """Rank distance correlation.

    Scatter plots of ranks against distance to the nearest best solution
    (with rank 1) in sample. Ranks based on Pareto ranks for objectives,
    violations or the combination, or Deb feasibility rank.

    Parameters
    ----------
    sample : dict
        A sample containing at least the input files V and D.
    compute_D_file : bool, optional
        By default `True`; when there is no D file in the sample, if
        `compute_D_file` is set to `True`, the whole D file is
        calculated. Calculating the whole D file will eliminate redundant
        distance calculations in the future, but it can be time consuming.
        To speed up calculation of `rdc`, set `compute_D_file` to
        `False` so that only the required distances are calculated.

    Returns
    -------
    corr : dict
        Dictionary containing correlation between ranks and
        distance to the nearest best solution.
    fig : matplotlib.axes.Axes
        `matplotlib` axes containing a scatter plot of ranks
        against distance to the nearest best solution.

    """
    R: pd.DataFrame = sample['R']
    corrs = {}
    numR = sample['numR']
    ncols = numR
    # nrows = math.ceil(numR / ncols)
    fig, axs = plt.subplots(ncols=ncols, nrows=1, figsize=(5 * ncols, 5 * 1))
    # FIX: plt.subplots returns a bare Axes when ncols == 1, on which
    # .ravel() raised AttributeError; normalise to a flat array first.
    axs = np.atleast_1d(axs).ravel()
    # for each rank...
    for i, col in tqdm(enumerate(R.columns[:-1]), total=numR): # exclude col 'feasible'
        sample[f'RDC_{col}'] = pd.DataFrame()
        RDC: pd.DataFrame = sample[f'RDC_{col}']
        RDC[col] = R[col]
        # check if col is a V
        if col in sample['V'].columns:
            # determine feasibility per constraint
            # FIX: original read `RDC['feasible'] = sample['V'][col] = 0`, a
            # chained assignment that zeroed the violation column instead of
            # testing it; it must be a comparison.
            RDC['feasible'] = sample['V'][col] == 0
        else:
            RDC['feasible'] = R['feasible']
        RDC['distance'] = None
        # get solutions with rank == 1
        best = R.query(f'{col} == 1')
        # for rank 1 solution set distance to 0
        RDC.loc[best.index, 'distance'] = 0
        # compute distance only for non-rank-1 solutions
        for t_idx in RDC.query('distance.isna()').index:
            d_nearest_best = pw_dist(sample, t_idx, best.index[0])
            # use .iloc for positional slicing, consistent with vdc
            for b_idx in best.iloc[1:].index:
                d = pw_dist(sample, t_idx, b_idx)
                if d < d_nearest_best: d_nearest_best = d
            RDC.loc[t_idx, 'distance'] = d_nearest_best

        sp_corr, _ = scipy.stats.spearmanr(RDC[col], RDC['distance'])
        corrs.update({f"rdc_{col}": sp_corr})

        ax = axs[i]
        sns.scatterplot(data = sample[f'RDC_{col}'], x = 'distance', y = col, hue='feasible', hue_order=[True, False], ax = ax)
        ax.set_title(f'RDC (corr = {sp_corr:.2f})')

    # if i + 1 < ncols * nrows:
    #     for unused_ax in axs[i + 1:]: unused_ax.set_axis_off()
    return corrs, fig
|
|
633
|
+
|
|
634
|
+
def pdc(sample: dict, metric: Union[Callable, str] = 'euclidean') -> Tuple[dict, matplotlib.figure.Figure]:
    """Computes pairwise distance correlation and produces scatter plots.

    This feature produces a visual output of scatter plots of pairwise
    distance on the solution space against distance on the objective
    space, violation space, and for each objective, constraint and rank
    individually. The numerical output is the pairwise distance
    Spearman's correlation coefficient.

    Parameters
    ----------
    sample : dict
        A sample containing at least the input files V and D.
    metric : Callable or str, optional
        A metric function or the name of a distance metric as listed
        ``scipy``'s ``pdist`` function
        <https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html>`_.
        If a metric function is defined it must take two solutions and computes distance
        between them of the form ``dist(Xa, Xb) -> d`` where ``Xa`` and ``Xb``
        are ``pandas`` Series representing solutions, by default ``None``.
        For example: ``lambda Xa, Xb: abs(Xa.sum() - Xb.sum())``.

    Returns
    -------
    corr : dict
        Dictionary containing correlation between pairwise distance on the
        solution space and pairwise distance on the objective space, on
        the violation space, and on each objective, violation and rank
        individually.
    fig : matplotlib.figure.Figure
        `matplotlib` axes containing a scatter plot of pairwise distance
        on the solution space against pairwise distance on the objective
        space, on the violation space, and on each objective, violation
        and rank individually.

    Examples
    --------
    >>> from pyxla import util, pdc
    >>> import matplotlib
    >>> sample = util.load_sample('cec2010_c01_2d_F1_V2', test=True)
    >>> corr, plot = pdc(sample)
    >>> type(corr)
    <class 'dict'>
    >>> isinstance(plot, matplotlib.figure.Figure)
    True
    """
    # a full D table is required here; compute it silently if missing
    util.handle_missing_D_file(sample, True, warn=False)

    F = sample['F']
    V = sample['V']
    D = sample['D']
    # using errors='ignore' as the `feasible` column is only present when we have V
    R = sample['R'].drop('feasible', axis=1, errors='ignore')
    PD = pd.DataFrame()

    # pairwise distances on the solution space, taken from the D table
    PD['Xd'] = D['d']

    corrs = {}
    rows = []

    # objective space
    row1 = [{'x': F, 'name':'F'}]

    # violation space
    if util.present(sample, 'V'):
        V = V.drop('feasible', axis=1)
        row1.append({'x': V, 'name':'V'})

    rows.extend(row1)

    # objectives (individually, only meaningful with more than one)
    if sample['numF'] > 1:
        rows.extend([{'x': F[col], 'name': col} for col in F.columns])

    # violations (individually, only meaningful with more than one)
    if sample['numV'] > 1:
        rows.extend([{'x': V[col], 'name': col} for col in V])

    # ranks
    rows.extend([{'x': R[col], 'name': f'{col}-rank'} for col in R])

    # prepare figure: at most 3 plots per figure row
    ncols = min(3, len(rows)) #max(2, sample['numF'], sample['numV'], sample['numR'])
    nrows = math.ceil(len(rows) / ncols)
    fig, axs = plt.subplots(ncols=ncols, nrows=nrows, figsize=(5 * ncols, 5 * nrows))
    color = sns.color_palette()[2]
    axs = axs.ravel()
    unused_axs = []

    for i, plot in enumerate(rows):
        # for col, plot in enumerate(plots):
        ax_lbl = f"{plot['name']}_d"
        PD[ax_lbl] = util.calc_pairwise_dist(plot['x'], metric=metric, representation=sample['representation'])
        corr, _ = scipy.stats.spearmanr(PD['Xd'], PD[ax_lbl])
        # NOTE(review): corr is a numpy scalar; consider .item() if these
        # values are to be serialised.
        corrs.update({f"pdc_{plot['name']}": corr})

        ax = sns.scatterplot(x=PD['Xd'], y=PD[ax_lbl], ax=axs[i], color=color)
        ax.set(title=f"X_d vs {plot['name']}_d (corr = {corr:.2f})")

        # unused_axs.extend([(i, c) for c in range(col + 1, ncols)])

    # turn off unused axes (i retains the last loop value here)
    if i + 1 < ncols * nrows:
        for unused_ax in axs[i + 1:]: unused_ax.set_axis_off()
    # for ax in unused_axs: axs[ax].set_axis_off()

    fig.suptitle(f"PDC for {sample['name']}", y=1.04)
    plt.tight_layout()

    return corrs, fig
|
|
744
|
+
|
|
745
|
+
def nfc(sample: dict) -> Tuple[dict, matplotlib.figure.Figure]:
    """Computes neighbouring solutions' objective values correlation.

    This feature produces a scatter plot of the objective values between
    neighbours for each objective (fitness cloud), for all solutions or
    for feasible solutions only. The plot is divided by a broken line
    through the origin such that points above the line are improving
    neighbours, those below are deteriorating neighbours, while those on
    the line are neutral neighbours. The numerical output produced is a
    list of Spearman's correlation coefficients for each scatter plot.

    Parameters
    ----------
    sample : dict
        A sample containing input files i.e `F`, `N`.

    Returns
    -------
    corr : dict
        Dictionary containing correlation between objective values of
        solutions and the objective values of their neighbours, for all
        solutions and for feasible solutions only.
    fig : matplotlib.figure.Figure
        `matplotlib` figure whose axes each contain a scatter plot of
        objective values of solutions against the objective values of
        their neighbours, for all solutions and for feasible solutions
        only.

    Raises
    ------
    Exception
        Raised if both N and X inputs are absent, as neighbourhood
        cannot be determined without having either.

    Examples
    --------
    >>> from pyxla import util, nfc
    >>> import matplotlib
    >>> sample = util.load_sample('nk_n14_k2_id5_F1_V1', test=True)
    >>> corrs, plot = nfc(sample)
    >>> type(corrs)
    <class 'dict'>
    >>> isinstance(plot, matplotlib.figure.Figure)
    True
    """

    # derive N from X if no N file was supplied (raises if neither exists)
    util.handle_missing_N_file(sample, True, warn=False)

    F = sample['F']
    numF = sample['numF']
    V = sample['V']
    N = sample['N']

    corrs = {}
    rows = []

    # first row of plots: all solutions
    rows.append({'name': 'all', 'nfc': N})

    # second row of plots: feasible solutions only
    if util.present(sample, 'V'):
        N_indexed = N.set_index('id1', drop=False)
        V = V.loc[N_indexed.index]
        feas_idxs = V[V['feasible'] == True].index
        # add plot data to rows only if there are feasible solutions
        if len(feas_idxs) > 0:
            feas_N = N_indexed.loc[feas_idxs]
            feas_N.reset_index(drop=True, inplace=True)
            rows.append({'name': 'feas.', 'nfc': feas_N})

    ncols = sample['numF']
    nrows = len(rows)
    fig, axs = plt.subplots(ncols=ncols, nrows=nrows, figsize=(5 * ncols, 5 * nrows))

    # change color palette
    sns.set_palette('deep')
    sns.set_palette(reversed(sns.color_palette()), 10)
    palette = sns.color_palette()

    for row, data in enumerate(rows):
        NFC = data['nfc']
        for col, f in enumerate(tqdm(F.columns)):
            x, y = f'{f}', f'neighbour {f}'
            # collate fitness value pairs for neighbour pairs
            NFC[x] = F[f].loc[NFC['id1']].tolist()
            NFC[y] = F[f].loc[NFC['id2']].tolist()

            corr, _ = scipy.stats.spearmanr(NFC[x], NFC[y])
            # spearmanr may return a 0-d numpy scalar; coerce to plain float
            corr = corr if isinstance(corr, float) else corr.item()
            corrs.update({f"nfc_{data['name']}_X_for_{f}": corr})

            # determine axis coords.: axs is 2-D, 1-D or a bare Axes
            # depending on the subplot grid shape
            if len(rows) > 1:
                ax = axs[row, col] if numF > 1 else axs[row]
            else:
                ax = axs[col] if numF > 1 else axs

            ax = sns.scatterplot(x=NFC[x], y=NFC[y], ax=ax, color=palette[0])
            # double-quoted f-string: nested single quotes inside an
            # f-string are a SyntaxError before Python 3.12 (PEP 701)
            ax.set(title=f"NFC for {f} {data['name']} X (corr = {corr:.2f})")
            sns.regplot(x=NFC[x], y=NFC[y], ax=ax, scatter=False, color=palette[1])

            limits = [
                np.min([ax.get_xlim(), ax.get_ylim()]),
                np.max([ax.get_xlim(), ax.get_ylim()]),
            ]

            # plot diagonal line
            ax.plot(limits, limits, '--k')

            util.equalize_axes_(ax)

    fig.suptitle(f"NFC for {sample['name']}", y=1.04)
    plt.tight_layout()
    return corrs, fig
|
|
857
|
+
|
|
858
|
+
def ncf(sample: dict) -> Tuple[dict, matplotlib.figure.Figure]:
    """Neighbouring change in feasibility.

    This feature produces as visual output a bar chart of the proportion
    of neighbouring pairs whose two solutions differ in feasibility
    (discontiguous feasibility) versus the proportion whose feasibility
    agrees (contiguous feasibility). The corresponding numerical outputs
    are the respective proportions.

    Parameters
    ----------
    sample : dict
        A sample containing input files i.e `F`, `N`.

    Returns
    -------
    proportions : dict
        Dictionary containing the numerical proportions as defined above.
    fig : matplotlib.axes.Axes
        `matplotlib` axes with the bar chart visually illustrating the
        proportions defined above.

    Raises
    ------
    Exception
        Raised if no V file is provided, as feasibility is undefined
        without the V file.
    Exception
        Raised if both N and X inputs are absent, as neighbourhood
        cannot be determined without having either.

    Examples
    --------
    >>> from pyxla import util, ncf
    >>> sample = util.load_sample('nk_n14_k2_id5_F1_V1', test=True)
    >>> proportions, fig = ncf(sample)
    >>> proportions # doctest: +SKIP
    """

    if not util.present(sample, 'V'): raise Exception('V is required. Please provide V.')

    # derive N from X if no N file was supplied (raises if neither exists)
    util.handle_missing_N_file(sample, True, warn=False)

    proportions = {}

    V = sample['V']

    N = sample['N']
    # annotate each neighbour pair with the feasibility of both endpoints
    N['id1_feasible'] = V.loc[N['id1']]['feasible'].to_numpy()
    N['id2_feasible'] = V.loc[N['id2']]['feasible'].to_numpy()

    # XOR counts neighbour pairs whose endpoints differ in feasibility
    inversions = (N['id1_feasible'] ^ N['id2_feasible']).sum()

    # NOTE(review): 'feasbility' is a typo, but it is a key of the
    # returned dict (and a plot label); kept as-is for backward
    # compatibility with existing callers.
    proportions['discontiguous feasbility'] = inversions / len(N)
    proportions['contiguous feasibility'] = 1 - proportions['discontiguous feasbility']

    fig = sns.barplot(x=proportions.keys(), y=proportions.values(), hue=proportions.keys())
    # double-quoted f-string: nested single quotes inside an f-string are
    # a SyntaxError before Python 3.12 (PEP 701)
    fig.set_title(f"NΔFeas for {sample['name']}")
    fig.set_ylim(0, 1)
    fig.set_ylabel('proportion')

    # print the numeric proportion on each bar
    for container in fig.containers:
        fig.bar_label(container, label_type='center', padding=5)

    plt.tight_layout()

    return proportions, fig
|
|
927
|
+
|
|
928
|
+
def n_flat(sample: dict, bounds: Union[List[float], float] = 0) -> Tuple[dict, matplotlib.figure.Figure]:
    """Neutral degree of neighbours.

    The visual output of this feature is a scatterplot of the neutral
    degree of each solution against the neutral degree of its neighbours
    with respect to each objective function. The numerical output is the
    corresponding Spearman correlation coefficients for each scatterplot.
    The neutral degree of a solution is the number of its neutral
    neighbors [1]_.

    Parameters
    ----------
    sample : dict
        A sample containing input files i.e ``F``, ``N``.
    bounds : Union[List[float], float]
        Bound(s) for inferring equality of objective values (a scalar is
        replicated across objectives), by default ``0``.

    Returns
    -------
    corr : dict
        Dictionary containing correlation between the neutral degree of
        each solution against the neutral degree of its neighbours for
        each objective.
    fig : matplotlib.figure.Figure
        ``matplotlib`` figure containing axes for each objective with each
        axis containing a scatterplot of the neutral degree of each
        solution against the neutral degree of its neighbours.

    References
    ----------
    .. [1] S. Verel, G. Ochoa, and M. Tomassini, 'Local Optima Networks of NK Landscapes With Neutrality', Evolutionary Computation, IEEE Transactions on, vol. 15, Jul. 2011.

    Examples
    --------
    >>> from pyxla import util, n_flat
    >>> import matplotlib
    >>> sample = util.load_sample('nk_n14_k2_id5_F1_V1', test=True)
    >>> corrs, plot = n_flat(sample, 0.01)
    >>> type(corrs)
    <class 'dict'>
    >>> isinstance(plot, matplotlib.figure.Figure)
    True
    """

    # derive N from X if no N file was supplied (raises if neither exists)
    util.handle_missing_N_file(sample, True, warn=False)

    N = sample['N']
    F = sample['F']
    NDN = pd.DataFrame()  # neutral degree of neighbours
    N_indexed = N.set_index('id1')
    numF = sample['numF']

    # replicate equality range for each objective
    if not isinstance(bounds, Iterable): bounds = [bounds] * numF

    corrs = {}
    fig, axs = plt.subplots(ncols=numF, figsize=(5 * numF, 5))

    for i, f in enumerate(F.columns):
        n_degrees = pd.DataFrame()
        # get all soln. indices in the N file
        # (np.concatenate: the np.concat alias only exists in NumPy >= 2.0)
        n_degrees['id'] = np.unique(np.concatenate([N['id1'].unique(), N['id2'].unique()]))
        n_degrees['degree'] = 0
        n_degrees.set_index('id', inplace=True)

        # compute neutral degree for each soln. with a neighbour(s)
        for idx in N['id1'].unique():
            obj_val = F.loc[idx][f]
            # atleast_1d: .loc returns a scalar when a solution has
            # exactly one neighbour
            neighbours = np.atleast_1d(N_indexed.loc[idx]['id2'])
            # compare only objective f; masking the full F frame (as
            # before) required *all* objectives to lie within the bound
            neighbours_obj_vals = F[f].loc[neighbours]
            # check obj. value equality within supplied bounds
            neutral_neighbours = neighbours_obj_vals[abs(neighbours_obj_vals - obj_val) <= bounds[i]]
            n_degrees.loc[idx, 'degree'] = len(neutral_neighbours)

        x = 'neutral degree'
        y = 'neutral degree of neighbour'
        NDN[x] = n_degrees.loc[N['id1']]['degree'].to_numpy()
        NDN[y] = n_degrees.loc[N['id2']]['degree'].to_numpy()

        corr, _ = scipy.stats.spearmanr(NDN[x], NDN[y])
        corrs.update({f"n_flat_{f}": corr})

        # axs is a bare Axes when there is a single objective
        ax = axs[i] if len(F.columns) > 1 else axs
        ax = sns.scatterplot(x=NDN[x], y=NDN[y], ax=ax, alpha=0.5)
        ax.set(title=f'N_flat for {f} X (corr = {corr:.2f})')
        sns.regplot(x=NDN[x], y=NDN[y], ax=ax, scatter=False)

    fig.suptitle(f"N_flat for {sample['name']}", y=1.04)

    plt.tight_layout()
    return corrs, fig
|
|
1020
|
+
|
|
1021
|
+
|
|
1022
|
+
def nvc(sample: dict) -> Tuple[dict, matplotlib.figure.Figure]:
    """Computes neighbouring solutions' violation values correlation (NVC).

    This feature produces as visual output a scatter plot of the violation
    values between neighbours for each constraint, for infeasible solutions
    only. A regression line is plotted to indicate correlation. The plot
    is divided by a broken line through the origin such that points above
    the line are improving neighbours, those below are deteriorating
    neighbours, while those on the line are neutral neighbours. The
    numerical output for this feature is a set of Spearman's correlation
    coefficients for each scatter plot.

    Parameters
    ----------
    sample : dict
        A sample containing input files i.e `V`, `N`.

    Returns
    -------
    corr : dict
        Dictionary containing correlation between violation values of
        solutions and the violation values of their neighbours, for
        infeasible solutions only.
    fig : matplotlib.figure.Figure
        `matplotlib` figure whose axes each contain a scatter plot of
        violation values of solutions against the violation values of
        their neighbours for infeasible solutions only.

    Examples
    --------
    >>> from pyxla import util, nvc
    >>> import matplotlib
    >>> sample = util.load_sample('nk_n14_k2_id5_F3_V2', test=True)
    >>> corrs, plot = nvc(sample)
    >>> type(corrs)
    <class 'dict'>
    >>> isinstance(plot, matplotlib.figure.Figure)
    True
    """

    if not util.present(sample, 'V'): raise Exception('V is required. Please provide V.')

    # derive N from X if no N file was supplied (raises if neither exists)
    util.handle_missing_N_file(sample, True, warn=False)

    V = sample['V'].drop('feasible', axis=1)
    numV = sample['numV']
    N = sample['N']
    N_indexed = N.set_index('id1')

    # get all solutions represented in N file
    # (np.concatenate: the np.concat alias only exists in NumPy >= 2.0)
    X_in_N_id1 = N['id1'].unique()
    X_in_N_id2 = N['id2'].unique()
    X_in_N = np.unique(np.concatenate([X_in_N_id1, X_in_N_id2]))
    # filter V to only have Xs in N
    V = V.loc[X_in_N]
    assert len(V) == len(X_in_N)

    corrs = {}
    fig, axs = plt.subplots(ncols=numV, figsize=(5 * numV, 5))

    # change color palette
    sns.set_palette('deep')
    sns.set_palette(reversed(sns.color_palette()), 10)
    palette = sns.color_palette()

    for i, v in enumerate(tqdm(V.columns)):
        NVC = pd.DataFrame()
        # infeasible w.r.t. this constraint: non-zero violation value
        infeas = V.loc[X_in_N_id1].query(f"{v} != 0").index.to_numpy()
        # reduce N to have `id1's` that are infeasible
        N_filtered = N_indexed.loc[infeas]
        N_filtered = N_filtered.reset_index()
        x, y = f"{v}", f"neighbour {v}"
        NVC[x] = V.loc[N_filtered['id1'].to_numpy()][v].to_numpy()
        NVC[y] = V.loc[N_filtered['id2'].to_numpy()][v].to_numpy()
        assert len(NVC[x]) == len(NVC[y]) == len(N_filtered)

        corr, _ = scipy.stats.spearmanr(NVC[x], NVC[y])
        # spearmanr may return a 0-d numpy scalar; coerce to plain float
        corr = corr if isinstance(corr, float) else corr.item()
        # NOTE(review): stored as a formatted string, unlike nfc/n_flat
        # which store floats; kept as-is for backward compatibility.
        corrs.update({f"NVC_for_{v}": f'{corr:.4f}'})

        # axs is a bare Axes when there is a single constraint
        ax = axs[i] if numV > 1 else axs

        ax = sns.scatterplot(x=NVC[x], y=NVC[y], ax=ax, color=palette[0])
        ax.set(title=f'NVC for {v} (corr = {corr:.2f})')
        sns.regplot(x=NVC[x], y=NVC[y], ax=ax, scatter=False, color=palette[1])

        # plot diagonal line
        limits = [
            np.min([ax.get_xlim(), ax.get_ylim()]),
            np.max([ax.get_xlim(), ax.get_ylim()]),
        ]
        ax.plot(limits, limits, '--k')

        util.equalize_axes_(ax)


    fig.suptitle(f"NVC for {sample['name']}", y=1.04)
    plt.tight_layout()

    return corrs, fig
|
|
1122
|
+
|
|
1123
|
+
def nrc(sample: dict) -> Tuple[dict, matplotlib.figure.Figure]:
    """Computes neighbouring solutions' ranks correlation (NRC).

    This feature produces as visual output a scatter plot of the ranks
    between neighbours. Ranks are based on Pareto ranks for objectives,
    violations, the combination of objectives and violations, and Deb
    feasibility rank. A regression line is plotted to indicate
    correlation. The plot is divided by a broken line through the origin
    such that points above the line are improving neighbours, those below
    are deteriorating neighbours, while those on the line are neutral
    neighbours. The numerical output for this feature is a set of
    Spearman's correlation coefficients for each scatter plot.

    Parameters
    ----------
    sample : dict
        A sample containing input files i.e `F`, `N`.

    Returns
    -------
    corr : dict
        Dictionary containing correlation between ranks of solutions and
        the ranks of their neighbours.
    fig : matplotlib.figure.Figure
        `matplotlib` figure whose axes each contain a scatter plot of
        ranks of solutions against the ranks of their neighbours.

    Examples
    --------
    >>> from pyxla import util, nrc
    >>> import matplotlib
    >>> sample = util.load_sample('nk_n14_k2_id5_F3_V2', test=True)
    >>> corrs, plot = nrc(sample)
    >>> type(corrs)
    <class 'dict'>
    >>> isinstance(plot, matplotlib.figure.Figure)
    True
    """

    # derive N from X if no N file was supplied (raises if neither exists)
    util.handle_missing_N_file(sample, True, warn=False)

    R = sample['R']
    # the 'feasible' column only exists in R when a V file was provided
    R = R if not util.present(sample, 'V') else R.drop('feasible', axis=1)
    numR = sample['numR']
    N = sample['N']

    # get all solutions represented in N file
    # (np.concatenate: the np.concat alias only exists in NumPy >= 2.0)
    X_in_N_id1 = N['id1'].unique()
    X_in_N_id2 = N['id2'].unique()
    X_in_N = np.unique(np.concatenate([X_in_N_id1, X_in_N_id2]))
    # filter R to only have Xs in N
    R = R.loc[X_in_N]
    assert len(R) == len(X_in_N)

    corrs = {}
    ncols = min(3, numR)
    nrows = math.ceil(numR / ncols)
    fig, axs = plt.subplots(ncols=ncols, nrows=nrows, figsize=(5 * ncols, 5 * nrows))
    # flatten the axes grid; axs is a bare Axes when there is one rank
    axs = axs.ravel() if numR > 1 else axs
    # change color palette
    sns.set_palette('deep')
    sns.set_palette(reversed(sns.color_palette()), 10)
    palette = sns.color_palette()

    for i, r in enumerate(tqdm(R.columns)):
        NRC = pd.DataFrame()
        x, y = f"{r} rank", f"neighbour {r} rank"
        NRC[x] = R.loc[N['id1'].to_numpy()][r].to_numpy()
        NRC[y] = R.loc[N['id2'].to_numpy()][r].to_numpy()

        corr, _ = scipy.stats.spearmanr(NRC[x], NRC[y])
        # spearmanr may return a 0-d numpy scalar; coerce to plain float
        corr = corr if isinstance(corr, float) else corr.item()
        # NOTE(review): stored as a formatted string, unlike nfc/n_flat
        # which store floats; kept as-is for backward compatibility.
        corrs.update({f"NRC_for_{r}_ranks": f'{corr:.4f}'})
        ax = axs[i] if numR > 1 else axs

        ax = sns.scatterplot(x=NRC[x], y=NRC[y], ax=ax, color=palette[0])
        ax.set(title=f'NRC for {r} ranks (corr = {corr:.2f})')
        sns.regplot(x=NRC[x], y=NRC[y], ax=ax, scatter=False, color=palette[1])

        # plot diagonal line
        limits = [
            np.min([ax.get_xlim(), ax.get_ylim()]),
            np.max([ax.get_xlim(), ax.get_ylim()]),
        ]
        ax.plot(limits, limits, '--k')

        util.equalize_axes_(ax)

    fig.suptitle(f"NRC for {sample['name']}", y=1.04)

    plt.tight_layout()

    return corrs, fig
|
|
1219
|
+
|
|
1220
|
+
def disp_best(sample: dict, init_percentage: int = 10, growth_factor: int = 2):
    """Dispersion of best solutions.

    This feature analyses the dispersion amongst best solutions. It
    produces as visual output scatter plots showing the distribution of
    pairwise distances between solutions for increasing sample sizes of
    best solutions, where 'best' is with respect to different objectives,
    constraints and ranks. The corresponding numerical outputs are the
    dispersion metrics [1]_ with respect to each objective, constraint
    and rank.
    Positive dispersion metric values indicate the presence of funnels
    while negative values indicate the presence of global structure.

    Parameters
    ----------
    sample : dict
        A sample containing input files i.e `F`, `V`.
    init_percentage : int
        Smallest sub-sample size as a percentage of the full sample,
        by default 10. Must be below 100.
    growth_factor : int
        Multiplier used to grow successive sub-sample percentages,
        by default 2.

    Returns
    -------
    disp_metrics : dict
        Dispersion metric for each rank column.
    fig : matplotlib.figure.Figure
        Figure with one scatter plot of pairwise distances per rank
        column.

    Raises
    ------
    Exception
        Raised if `init_percentage` is not less than 100.

    References
    ----------
    .. [1] M. Lunacek and D. Whitley, 'The dispersion metric and the CMA evolution strategy', in Proceedings of the 8th annual conference on Genetic and evolutionary computation, 2006, pp. 477-484.

    """

    if init_percentage >= 100: raise Exception('Initial percentage for sub-sampling must be less than 100%.')

    # derive D (pairwise distances) if no D file was supplied
    util.handle_missing_D_file(sample, True, warn=False)

    D = sample['D']
    R: pd.DataFrame = sample['R']
    numR = sample['numR']
    # the 'feasible' column only exists in R when a V file was provided
    R = R if not util.present(sample, 'V') else R.drop('feasible', axis=1)

    disp_metrics = {}
    ncols = min(3, numR)
    nrows = math.ceil(numR / ncols)
    fig, axs = plt.subplots(ncols=ncols, nrows=nrows, figsize=(ncols * 5, nrows * 5))
    axs = axs.ravel() if numR > 1 else axs

    # geometric schedule of sub-sample percentages: init, init*g, ... < 100
    sample_sizes = [init_percentage]
    i = 0
    while sample_sizes[i] * growth_factor < 100:
        sample_sizes.append(sample_sizes[i] * growth_factor)
        i += 1

    for i, r in enumerate(R.columns):
        # `ranked` (renamed from `sorted`, which shadowed the builtin)
        ranked = R.sort_values(by=r)
        pw_distances_avgs = []
        plot_data = pd.DataFrame(columns=['pairwise distances', 'sample size'])

        for n in sample_sizes:
            nth = int(n/100 * len(R))
            best_n = ranked[:nth].index.sort_values()
            # materialize the pair list before indexing D with it
            pairs = list(itertools.combinations(best_n, 2))
            pw_distances = D.loc[pairs]['d'].to_numpy()

            data = pd.DataFrame()
            data['pairwise distances'] = pw_distances
            data['sample size'] = n
            plot_data = data if plot_data.empty else pd.concat([plot_data, data])

            # NOTE(review): if nth < 2 there are no pairs and mean() of an
            # empty array is NaN — confirm callers never sub-sample that small
            pw_distances_avgs.append(pw_distances.mean())

        # dispersion metric: avg pairwise distance of the smallest
        # sub-sample minus that of the largest
        disp_metric = pw_distances_avgs[0] - pw_distances_avgs[-1]
        disp_metrics.update({r: disp_metric})

        # plot
        ax = axs[i] if numR > 1 else axs
        sns.scatterplot(x=plot_data['sample size'], y=plot_data['pairwise distances'], ax=ax)
        ax.set(title=f'disp_best for {r} (metric = {disp_metric:.4f})')

        # custom function scale matching growth factor
        forward = lambda x: np.log(x / init_percentage) / np.log(growth_factor)
        inverse = lambda x: init_percentage * (growth_factor ** x)
        ax.set_xscale('function', functions=(forward, inverse))

        # set ticks manually
        ax.set_xticks(sample_sizes)
        ax.set_xticklabels([str(t) for t in sample_sizes])

    # turn off unused axes
    if i + 1 < ncols * nrows:
        for unused_ax in axs[i + 1:]: unused_ax.set_axis_off()

    fig.suptitle(f"disp_best for {sample['name']}", y=1.04)

    plt.tight_layout()

    return disp_metrics, fig
|
|
1310
|
+
|
|
1311
|
+
|
|
1312
|
+
def X_imp(sample: dict, n_repeats=10, train_proportion = 0.7, binary: bool = False, seed: int = None):
    """Variable importance with respect to each objective.

    Computes (1) the point-biserial correlation between every variable in
    `X` and every objective in `F`, and (2) permutation importances of
    the variables under a `LassoLars` model fitted per objective on
    min-max-normalized variables.

    Parameters
    ----------
    sample : dict
        A sample containing input files i.e `X`, `F`.
    n_repeats : int
        Number of shuffles per variable for permutation importance,
        by default 10.
    train_proportion : float
        Fraction of solutions used for training, by default 0.7.
    binary : bool
        Currently unused in this implementation; kept for interface
        stability.
    seed : int
        Random state for the train/test split and permutation importance,
        by default None.

    Returns
    -------
    corr_matrix : pd.DataFrame
        Point-biserial correlations with objectives as rows and
        variables as columns.
    x_imp_ranks : dict
        Per-objective DataFrame of variables sorted (descending) by mean
        permutation importance, with std and rank columns.
    fig : matplotlib.figure.Figure
        Figure with correlation bar charts (top row) and importance bar
        charts with error bars (bottom row).

    Raises
    ------
    Exception
        Raised if no X file is present in the sample.
    """
    if not util.present(sample, 'X'): raise Exception('X is required. Please provide an X file.')

    # work on a copy so the caller's sample['X'] is never mutated by the
    # temporary addition/removal of the objective columns below
    X: pd.DataFrame = sample['X'].copy()
    F = sample['F']
    numF = sample['numF']

    x_imp_ranks = {}


    # add the objectives to the X file
    for f in F.columns:
        X[f] = F[f]

    # scipy's pointbiserialr takes (binary, continuous); swap args to fit
    # pandas' corr(method=...) calling convention
    pointbiserialr = lambda x, y: scipy.stats.pointbiserialr(y, x)[0]

    corr_matrix = X.corr(method=pointbiserialr)

    # remove F cols from X file
    X.drop(F.columns, axis=1, inplace=True)

    # keep only (objective row) x (variable column) entries
    corr_matrix = corr_matrix.drop(index=X.columns).drop(columns=F.columns)

    fig, axs = plt.subplots(ncols=numF, nrows=2, figsize=(numF * 5, 10))

    for i, f in enumerate(F.columns):
        ax = axs[0, i] if numF > 1 else axs[i]
        ax = sns.barplot(corr_matrix.loc[f], ax=ax)
        ax.set(title=f"Correlation of X and {f}")

    # min-max normalize once; this is loop-invariant so it is hoisted out
    # of the per-objective loop (re-normalizing normalized data was a
    # no-op). NOTE(review): a constant column yields 0/0 -> NaN here.
    X = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))

    for i, f in enumerate(F.columns):
        x_imp = pd.DataFrame()

        X_train, X_test, y_train, y_test = train_test_split(X, F[f], train_size=train_proportion, random_state=seed)

        model = LassoLars(alpha=0.001).fit(X_train, y_train)
        r2 = model.score(X_test, y_test)

        imp = permutation_importance(model, X_test, y_test,
                                     n_repeats=n_repeats,
                                     random_state=seed,
                                     scoring='r2')
        # sort importance means desc.
        imp_idxs = imp.importances_mean.argsort()[::-1]

        x_imp['X'] = X.columns[imp_idxs]
        x_imp['importance'] = imp.importances_mean[imp_idxs]
        x_imp['std'] = imp.importances_std[imp_idxs]
        x_imp['rank'] = x_imp.index.to_numpy() + 1
        x_imp_ranks[f] = x_imp

        ax = sns.barplot(x_imp, x='X', y='importance', ax=axs[1, i] if numF > 1 else axs[1])
        ax.errorbar(
            x=range(len(x_imp)),
            y=x_imp['importance'],
            yerr=x_imp['std'],
            fmt='none',
            c='black',
        )
        # plain '$R^2$' literal: the original nested-quote f-string
        # required Python >= 3.12 (PEP 701); output is identical
        ax.set(title=f"X_imp for {f} (Validation $R^2$ = {r2:.4f})")

    fig.suptitle(f"X_imp for {sample['name']}")

    plt.tight_layout()

    return corr_matrix, x_imp_ranks, fig
|