pyxla 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pyxla/__init__.py ADDED
@@ -0,0 +1,1380 @@
1
+
2
+ """The core functions of the library are defined here."""
3
+
4
+ import matplotlib.axes
5
+ import numpy as np
6
+ import pandas as pd
7
+ import seaborn as sns
8
+ import matplotlib.pyplot as plt
9
+ import statistics
10
+ import scipy
11
+ from scipy.stats import spearmanr
12
+ from scipy.spatial.distance import pdist
13
+ from typing import Tuple
14
+ import matplotlib
15
+ import math
16
+ from typing import Callable, Union, List, Iterable
17
+ from tqdm.auto import tqdm
18
+ import logging
19
+ import itertools
20
+ from sklearn.linear_model import Ridge, LassoLars
21
+ from sklearn.model_selection import train_test_split
22
+ from sklearn.inspection import permutation_importance
23
+
24
+ from . import util
25
+
26
+ logging.basicConfig(level=logging.INFO)
27
+
28
+ # allow importing `load_data` immediately from `pyxla`
29
+ from .util import load_data
30
+
31
# definition of the sample structure
# (the data files should include the global optima if known)
# This dict is the template/schema for a "sample": util.load_* presumably
# fills a copy of it from csv inputs — entries left as None are optional.
sample = {'name': None,  # sample identifier (used in plot titles)
          'size': 0,  # number of solutions in the sample
          # variable space
          'X': None, # dataframe
          #'Xd': 0, # int
          'Xcsv': None, # str
          # objective space
          'F': None,  # dataframe of objective values
          #'Fd': 0,
          'Fcsv': None,  # str
          'numF': 0,  # number of objectives
          'max': False, # we minimize by default
          # violation space
          'V': None,  # dataframe of constraint-violation values (+ 'feasible' column)
          #'Vd': 0,
          'Vcsv': None,  # str
          'numV': 0,  # number of constraints
          # neighborhood
          'N': None,  # dataframe of neighbour pairs (columns id1/id2)
          'Ncsv': None,  # str
          # distance
          'D': None,  # dataframe of pairwise distances, indexed by (id_a, id_b) with id_a < id_b
          'Dcsv': None,  # str
          'representation': 'continuous', # by default
          'd_metric_func': None, # distance metric function of the form dist(X1, X2) -> d
          'neighbourhood_func': None, # of the form f(X1, X2) -> bool
          'p': 2 # Euclidean distance by default
          }
61
+
62
+ def descriptive_stats(data, name):
63
+ result = dict()
64
+ result[str(name + '_min')] = min(data)
65
+ result[str(name + '_max')] = max(data)
66
+ result[str(name + '_mean')] = statistics.mean(data)
67
+ result[str(name + '_med')] = statistics.median(data)
68
+ result[str(name + '_q1')] = statistics.quantiles(data, n = 4)[0]
69
+ result[str(name + '_q3')] = statistics.quantiles(data, n = 4)[2]
70
+ result[str(name + '_sd')] = statistics.stdev(data)
71
+ result[str(name + '_skew')] = scipy.stats.skew(data).item()
72
+ result[str(name + '_kurt')] = scipy.stats.kurtosis(data).item()
73
+ return result
74
+
75
+
76
def distr_f(sample: dict, bins: Union[int, str] = 'auto') -> Tuple[dict, matplotlib.figure.Figure]:
    """
    **Distribution of objectives (fitness values)**

    The ``distr_f`` feature visualises the spread of objective values and computes
    some descriptive statistics of the objective values. Alongside a histogram of
    objective values and a histogram of their dense ranking, various descriptive statistics
    are computed, including: minimum, maximum, mean, median, quartiles (Q1, Q3),
    standard deviation, skewness, and kurtosis. Dense ranking is a ranking method
    provided by the `pandas <https://pandas.pydata.org>`_ package, which entails
    assigning to a group of equally valued solutions the least rank in the group
    and ensuring that rank increases by 1 from group to group.

    Parameters
    ----------
    sample : dict
        A sample containing the various input files i.e `F`, `V`.
    bins : int or str, optional
        Number of histogram bins for the objective-value plots, or a
        binning strategy name accepted by seaborn/numpy; by default 'auto'.

    Returns
    -------
    dict
        Descriptive statistics of objective values.
    matplotlib.figure.Figure
        Histograms of objective values and or ranks of objective values.

    Examples
    --------
    >>> from pyxla import util, distr_f
    >>> import matplotlib
    >>> sample = util.load_sample('cec2010_c01_2d_F1_V2', test=True)
    >>> feat, plot = distr_f(sample)
    >>> type(feat)
    <class 'dict'>
    >>> isinstance(plot, matplotlib.figure.Figure)
    True

    """
    F = sample['F']
    numF = sample['numF']
    R = pd.DataFrame()

    # dense ranks per objective; for maximization samples higher F is better,
    # so the ranking direction follows sample['max']
    for col in F:
        R[f"{col}_rank"] = F[col].rank(ascending = not sample['max'], method = 'min').astype(int)

    feat = dict()
    # stats for each objective column and its rank column (same position in R)
    for i in range(0, len(F.columns)):
        feat.update(descriptive_stats(F.iloc[:,i], F.columns[i]))
        feat.update(descriptive_stats(R.iloc[:,i], R.columns[i]))

    # plots: one extra column for the blended (all objectives) plot when numF > 1
    ncols = numF + (numF > 1)
    nrows = 2
    fig, axs = plt.subplots(ncols=ncols, nrows=nrows, figsize=(3 * ncols, 3 * nrows))
    palette = sns.color_palette()[-numF:] # pick last n colors in palette

    # with a single objective, plt.subplots returns a 1-D axes array
    ax = lambda i, j: axs[i, j] if numF > 1 else axs[i]
    for i, col in enumerate(F):
        sns.histplot(F[col], ax=ax(0, i), color=palette[i], bins=bins)
        sns.histplot(R[f"{col}_rank"], ax=ax(1, i), color=palette[i])

    # blend plots
    if numF > 1:
        sns.histplot(F, ax=ax(0, numF), palette=palette)
        sns.histplot(R, ax=ax(1, numF), palette=palette)

    fig.suptitle('Distribution of objective values/ranks')
    plt.tight_layout()

    return feat, fig
145
+
146
def distr_v(sample: dict) -> Tuple[dict, matplotlib.figure.Figure]:
    """
    **Distribution of violation values**

    The ``distr_v`` feature visualises the spread of violation values and
    computes some descriptive statistics of the violation values.
    Alongside a histogram of violation values and a histogram of their
    dense ranking, descriptive statistics are computed, including:
    minimum, maximum, mean, median, quartiles (Q1, Q3), standard deviation,
    skewness, and kurtosis. Additionally, the feasibility rate is computed
    per violation, and the overall feasibility, taking all constraints into
    account, is computed. Feasibility rate refers to the proportion of solutions
    that are feasible with respect to a constraint.

    Parameters
    ----------
    sample : dict
        A sample containing the various input files i.e `F`, `V`.

    Returns
    -------
    dict
        Descriptive statistics of violation values.
    matplotlib.figure.Figure
        Histograms of violation values and ranks of violation values.

    Raises
    ------
    ValueError
        If the sample has no V input.
    """
    V = sample['V']
    if V is None: raise ValueError('V is absent in the sample. Please provide input V first.')

    # violation columns only; 'feasible' is a derived boolean indicator
    V_ = V.drop('feasible', axis=1)
    numV = sample['numV']

    R = pd.DataFrame()

    # lower violation is always better, so ranks ascend with violation value
    for col in V_:
        R[f"{col}_rank"] = V_[col].rank(ascending = True, method = 'min').astype(int)

    # features
    feat = dict()
    # fix: select each violation column by NAME instead of by position in V
    # (the positional V.iloc[:, i] was only correct when 'feasible' happened
    # to be the last column of V)
    for col in V_:
        feat.update(descriptive_stats(V_[col], col))
        # feasibility rate per constraint: proportion of zero-violation solutions
        feat[f'{col}_feas_rate'] = (V_[col] == 0).mean().item()
    feat['overall_feas_rate'] = V['feasible'].mean().item()

    # plots: one extra column for the blended plot when there are several constraints
    ncols = numV + (numV > 1)
    nrows = 2
    fig, axs = plt.subplots(ncols=ncols, nrows=nrows, figsize=(3 * ncols, 3 * nrows))
    palette = sns.color_palette()[-numV:] # pick last n colors in palette

    # with a single constraint, plt.subplots returns a 1-D axes array
    ax = lambda i, j: axs[i, j] if numV > 1 else axs[i]

    for i, col in enumerate(V_):
        sns.histplot(V_[col], ax=ax(0, i), color=palette[i])
        sns.histplot(R[f"{col}_rank"], ax=ax(1, i), color=palette[i])

    # blend plots
    if numV > 1:
        sns.histplot(V_, ax=ax(0, numV), palette=palette)
        sns.histplot(R, ax=ax(1, numV), palette=palette)

    fig.suptitle('Distribution of violation values/ranks')
    plt.tight_layout()

    return feat, fig
214
+
215
def distr_Par(sample: dict) -> Tuple[dict, sns.axisgrid.FacetGrid]:
    """Distribution of Pareto ranks.

    Computes descriptive statistics for every Pareto-rank column found in
    the sample's rank table ``R`` and draws one histogram per column.

    Parameters
    ----------
    sample : dict
        A sample whose rank table ``R`` contains Pareto-rank columns.

    Returns
    -------
    dict
        Descriptive statistics of the Pareto ranks.
    matplotlib.figure.Figure
        Histograms of the Pareto ranks.

    Raises
    ------
    Exception
        If the rank table holds no Pareto-rank columns.
    """
    pareto = sample['R'].filter(like='pareto')
    n = len(pareto.columns)
    if not n:
        raise Exception('Pareto rank is not possible for this sample.')

    feat = {}
    for col in pareto.columns:
        feat.update(descriptive_stats(pareto[col], col))

    # one histogram per Pareto-rank column
    fig, axs = plt.subplots(ncols=n, figsize=(3 * n, 3))
    colors = sns.color_palette()[-n:]  # last n palette colors

    for idx, col in enumerate(pareto):
        # plt.subplots returns a bare Axes when there is a single column
        target_ax = axs[idx] if n > 1 else axs
        sns.histplot(pareto[col], ax=target_ax, color=colors[idx])

    fig.suptitle('Distribution of Pareto ranks')
    plt.tight_layout()

    return feat, fig
251
+
252
def distr_Deb(sample: dict) -> Tuple[dict, sns.axisgrid.FacetGrid]:
    """Distribution of Deb's feasibility-rule ranks.

    Plots a histogram of the sample's Deb ranks and computes their
    descriptive statistics.

    Parameters
    ----------
    sample : dict
        A sample whose rank table ``R`` contains a 'Deb' column.

    Returns
    -------
    dict
        Descriptive statistics of the Deb ranks.
    sns.axisgrid.FacetGrid
        Histogram of the Deb ranks.

    Raises
    ------
    Exception
        If the sample's rank table has no 'Deb' column.
    """
    R = sample['R']
    if 'Deb' not in R:
        raise Exception("Deb's feasibility rule ranking is not possible for this sample.")

    ranks = R['Deb']
    plot = sns.displot(ranks)
    plot.set(title=f"Deb\'s feasibility rule ranking distribution {sample['name']}")

    feat = dict(descriptive_stats(ranks, 'Deb'))
    return feat, plot
273
+
274
def annotate_with_corr_coefs(x, y, label=None, color=None, **kwargs) -> None:
    """Annotate the current axes with Spearman's correlation of *x* and *y*.

    Intended as a ``sns.PairGrid.map_upper`` callback: instead of drawing
    the data it writes the correlation coefficient onto the axes and
    records it in the mutable ``feat`` dict passed through ``kwargs``.

    Parameters
    ----------
    x, y : pd.Series
        The pair of value columns being compared.
    label : optional
        Feasibility hue supplied by the PairGrid when the sample has a V
        file (truthy for the feasible class); ``None`` when there is no
        feasibility split.
    color : optional
        Annotation color supplied by the PairGrid.
    """
    ax = plt.gca()
    cor, _ = spearmanr(x, y)
    # spearmanr yields NaN for constant input; report that explicitly
    cor = 'undefined' if math.isnan(cor) else f'{cor:.2f}'
    feat = kwargs['feat']
    # fix: identity comparison with None (`is not None`) instead of `!= None`
    if label is not None:
        # sample has a V file: one annotation per feasibility class,
        # stacked vertically so the two texts do not overlap
        feasibility = 'feasible' if label else 'infeasible'
        feat[f"{x.name}_{y.name} ({feasibility})"] = cor
        pos = (0.5, 0.5) if label else (0.5, 0.25)
        ax.annotate(f'corr = {cor} ({feasibility})', xy = pos, xycoords='axes fraction', ha = 'center', color = color)
    # else if no V file: a single annotation centred in the cell
    else:
        feat[f"{x.name}_{y.name}"] = cor
        ax.annotate(f'corr = {cor}', xy = (0.5, 0.5), xycoords='axes fraction', ha = 'center', color = color)
    ax.set_axis_off()
290
+
291
def corr(sample: dict) -> Tuple[dict, sns.PairGrid]:
    """Correlation of values

    Plots every objective (and, when present, every violation) column
    against each other in a pair grid — histograms on the diagonal,
    scatter plots with a regression line below it — and records the
    pairwise Spearman correlation coefficients in the upper triangle.

    Parameters
    ----------
    sample : dict
        A sample containing the various input files i.e `F`, `V`.

    Returns
    -------
    sns.PairGrid
        Grid of scatter plots of all objectives and violations against
        each other, with feasibility indicated per solution.
    feat: dict
        Spearman's correlation coefficients for all pairs of sets of
        values split by feasibility
    """
    if util.present(sample, 'V'):
        # join objectives and violations on the solution index so the
        # grid can be colored by the V table's 'feasible' column
        FV = pd.merge(sample['F'], sample['V'], left_index=True, right_index=True)
        g = sns.PairGrid(FV, hue = 'feasible', hue_order=[True, False])
    else:
        FV = sample['F']
        if sample['numF'] == 1: logging.warning('The sample has a single objective with no constraint, therefore this feature is meaningless.')
        g = sns.PairGrid(FV)
    # diagonal
    g.map_diag(sns.histplot)
    # lower triangle
    g.map_lower(sns.scatterplot, alpha = 0.5)
    g.map_lower(sns.regplot, scatter = False)
    # upper triangle: annotate_with_corr_coefs fills `feat` as a side effect
    feat = {}
    g.map_upper(annotate_with_corr_coefs, feat=feat)
    # legend
    g.add_legend()
    violation_txt = ' and violations' if util.present(sample, 'V') else ''
    g.figure.suptitle(f"Correlation of objectives{violation_txt}")
    return feat, g
330
+
331
def corr_ranks(sample: dict) -> Tuple[dict, sns.PairGrid]:
    """Correlation of ranks

    Plots every rank column of the sample's rank table ``R`` against the
    others in a pair grid — histograms on the diagonal, scatter plots
    with a regression line below it — and records the pairwise Spearman
    correlation coefficients in the upper triangle.

    Parameters
    ----------
    sample : dict
        A sample containing the various input files i.e `F`, `V`

    Returns
    -------
    sns.PairGrid
        Grid of scatter plots of all objective, violation, Pareto and Deb's
        ranks against each other, with feasibility indicated per solution
    feat: dict
        Spearman's correlation coefficients for all pairs of sets of ranks
        split by feasibility
    """
    if util.present(sample, 'V'):
        # split by feasibility when violation data is available
        g = sns.PairGrid(sample['R'], hue='feasible', hue_order=[True, False])
    else:
        g = sns.PairGrid(sample['R'])
        if sample['numF'] == 1: logging.warning('The sample has a single objective with no constraint, therefore this feature is meaningless.')

    # diagonal
    # (fix: g.map_diag(sns.histplot) was called twice, drawing every
    # diagonal histogram on top of itself)
    g.map_diag(sns.histplot)
    # lower triangle
    g.map_lower(sns.scatterplot, alpha=0.5)
    g.map_lower(sns.regplot, scatter=False)
    # upper triangle: annotate_with_corr_coefs fills `feat` as a side effect
    feat = {}
    g.map_upper(annotate_with_corr_coefs, feat=feat)
    # legend
    g.add_legend()
    g.figure.suptitle('Correlation of ranks')
    g.figure.tight_layout()
    return feat, g
370
+
371
def pw_dist(sample: dict, id_a: int, id_b: int, metric=None) -> float:
    """Calculates the distance between 2 solutions given their indices.

    The distance is looked up in the precomputed D file when available
    (stored only for index pairs with ``id_a < id_b``); otherwise it is
    computed on the fly from the X file.

    Parameters
    ----------
    sample : dict
        A sample containing the various input files i.e `X`, `D`.
    id_a : int
        Index of solution `a`
    id_b : int
        Index of solution `b`
    metric : str or Callable, optional
        Metric passed through to ``scipy``'s ``pdist``; when ``None``,
        Euclidean distance is used for continuous representations and
        Hamming distance otherwise.

    Returns
    -------
    float
        Distance between solution `a` and `b` as a float

    Raises
    ------
    ValueError
        If the sample provides neither a D file nor an X input.
    """
    if id_a == id_b:
        return 0
    D = sample['D']
    # fast path: D file provides precomputed pairwise distances
    if isinstance(D, pd.DataFrame):
        key = (id_a, id_b) if id_a < id_b else (id_b, id_a)
        return D.loc[key, 'd']
    # if D not provided calculate d on the fly
    # if no metric specified use representation-based default
    if metric is None:
        metric = 'euclidean' if sample['representation'] == 'continuous' else 'hamming'
    # fix: the template sample always contains the 'X' key, so the old
    # `'X' not in sample` check never fired; test for a missing value instead
    X = sample.get('X')
    if X is None:
        raise ValueError('Please provide either a D or an X input file.')
    a = X.iloc[id_a].to_numpy()
    b = X.iloc[id_b].to_numpy()
    # fix: pdist returns a length-1 condensed distance array — unwrap it
    # to a plain float so the declared return type holds
    return pdist([a, b], metric=metric)[0].item()
410
+
411
def fdc(sample: dict, compute_D_file: bool = True) -> Tuple[dict, matplotlib.axes.Axes]:
    """Computes objective-distance correlation

    For every objective, computes each solution's distance to the nearest
    rank-1 (best) solution, plots objective value against that distance,
    and reports the Spearman correlation between the two (fitness-distance
    correlation). The per-objective tables are also stored in
    ``sample['FDC']`` as a side effect.

    Parameters
    ----------
    sample : dict
        A sample containing the various input files i.e `F`, `V`.
    compute_D_file : bool, optional
        By default `True`; when there is no D file in the sample, if
        `compute_D_file` is set to `True`, the whole D file is
        calculated. Calculating the whole D file will eliminate redundant
        distance calculations in the future, but it can be time consuming.
        To speed up calculation of `fdc`, set `compute_D_file` to
        `False` so that only the required distances are calculated.

    Returns
    -------
    corr : dict
        Dictionary containing Spearman's correlation coefficients
        objective-distance correlation per objective.
    fig : matplotlib.axes.Axes
        `matplotlib` axes containing scatter plots of objective
        values against distance to the nearest best solution in
        sample per objective, for all solutions or for feasible
        solutions only.

    Raises
    ------
    Exception
        Raises an exception if both D and X inputs are absent. One of D
        or X is needed to compute distances between solution.
    """
    util.handle_missing_D_file(sample, compute_D_file)

    # NOTE: FDC is rebuilt column-by-column and kept on the sample dict
    sample['FDC'] = pd.DataFrame()
    FDC = sample['FDC']
    F = sample['F']
    R = sample['R']
    corr = {}

    fig, axs = plt.subplots(ncols=len(F.columns), figsize=(5 * len(F.columns), 5))
    for i, col in enumerate(F.columns):
        FDC[col] = F[col]
        FDC['distance'] = 0.0
        # pick F with rank 1,
        bestF = R.query(f'{col} == 1')
        # for rank 1 solution set distance to 0
        FDC.loc[bestF.index, 'distance'] = 0.0
        # compute distance only for non-rank-1 solutions
        for t_idx in FDC[~FDC.index.isin(bestF.index)].index:
            # start from the first best solution...
            d_nearest_best_f = pw_dist(sample, t_idx, bestF.index[0])
            # check for the nearest among all bestFs
            for b_idx in bestF[1:].index:
                d = pw_dist(sample, t_idx, b_idx)
                if d < d_nearest_best_f: d_nearest_best_f = d
            FDC.loc[t_idx, 'distance'] = d_nearest_best_f

        r, _ = scipy.stats.spearmanr(FDC[col], FDC['distance'])
        corr.update({f"fdc_{col}": r})

        # plt.subplots returns a bare Axes for a single objective
        ax = axs[i] if len(F.columns) > 1 else axs
        sns.scatterplot(data = FDC, x = 'distance', y = col, ax = ax)
        sns.regplot(data = FDC, x = 'distance', y = col, ax = ax, scatter=False)
        ax.set_title(f'FDC (corr = {r:.2f})')

    fig.tight_layout()

    return corr, fig
481
+
482
def vdc(sample: dict, compute_D_file: bool = True) -> Tuple[dict, matplotlib.axes.Axes]:
    """Calculates violation-distance correlation.

    This function calculates the violation distance
    correlation: the correlation between violation values and distance
    to the nearest feasible solution.

    Parameters
    ----------
    sample : dict
        A sample containing the at least input files V and D.
    compute_D_file : bool, optional
        By default `True`; when there is no D file in the sample, if
        `compute_D_file` is set to `True`, the whole D file is
        calculated. Calculating the whole D file will eliminate redundant
        distance calculations in the future, but it can be time consuming.
        To speed up calculation of `fdc`, set `compute_D_file` to
        `False` so that only the required distances are calculated.

    Returns
    -------
    corr : dict
        Dictionary containing correlation between constraints and
        distance to the nearest feasible solution.
    fig : matplotlib.axes.Axes
        `matplotlib` axes containing a scatter plot of violation
        values against distance to the nearest feasible solution.

    Examples
    --------
    >>> from pyxla import util, vdc
    >>> import matplotlib
    >>> sample = util.load_sample('cec2010_c01_2d_F1_V2', test=True)
    >>> corr, plot = vdc(sample)
    >>> type(corr)
    <class 'dict'>
    >>> isinstance(plot, matplotlib.figure.Figure)
    True
    """
    if not util.present(sample, 'V'): raise Exception('V is required. Please provide V.')

    util.handle_missing_D_file(sample, compute_D_file)

    V = sample['V']

    # at least one overall-feasible solution must exist for VDC to make sense
    if V[V['feasible'] == True].empty: raise Exception('VDC is undefined as there is no feasible solution.')

    corr = {}
    fig, axs = plt.subplots(ncols=len(V.columns[:-1]), figsize=(5 * len(V.columns[:-1]), 5))

    # NOTE(review): this assumes 'feasible' is the LAST column of V — confirm
    for i, col in tqdm(enumerate(V.columns[:-1]), total=sample['numV']): # exclude col 'feasible'
        # per-constraint table kept on the sample dict as a side effect
        sample[f'VDC_{col}'] = pd.DataFrame()
        VDC = sample[f'VDC_{col}']
        # separate feasible and infeasible solutions.
        feas_v, infeas_v = V.query(f'{col} == 0'), V.query(f'{col} != 0')

        VDC[col] = infeas_v[col]
        # for each infeasible solution:
        for infeas_idx in infeas_v.index:
            # start from the first feasible solution...
            d_nearest_feas = pw_dist(sample, infeas_idx, feas_v.index[0])
            # compute dist to all feas. solutions
            for feas_idx in feas_v.iloc[1:].index:
                d = pw_dist(sample, infeas_idx, feas_idx)
                if d < d_nearest_feas:
                    # look for the nearest feas. solution
                    d_nearest_feas = d
            VDC.loc[infeas_idx, 'distance'] = d_nearest_feas
        # no existing infeasible solutions
        if infeas_v.empty: VDC['distance'] = 0

        r, _ = scipy.stats.spearmanr(VDC[col], VDC['distance'])
        corr.update({f"vdc_{col}": r})

        # plt.subplots returns a bare Axes for a single constraint
        ax = axs[i] if len(V.columns[:-1]) > 1 else axs
        # use uniform color for infeasible solutions
        color = sns.color_palette()[1]
        sns.scatterplot(data = sample[f'VDC_{col}'], x = 'distance', y = col, ax = ax, color=color)
        ax.set_title(f'VDC (corr = {r:.2f})')

    return corr, fig
562
+
563
def rdc(sample: dict, compute_D_file: bool = True) -> Tuple[dict, matplotlib.axes.Axes]:
    """Rank distance correlation

    Scatter plots of ranks against distance to the nearest best solution
    (with rank 1) in sample. Ranks based on Pareto ranks for objectives,
    violations or the combination, or Deb feasibility rank.

    Parameters
    ----------
    sample : dict
        A sample containing the at least input files V and D.
    compute_D_file : bool, optional
        By default `True`; when there is no D file in the sample, if
        `compute_D_file` is set to `True`, the whole D file is
        calculated. Calculating the whole D file will eliminate redundant
        distance calculations in the future, but it can be time consuming.
        To speed up calculation of `fdc`, set `compute_D_file` to
        `False` so that only the required distances are calculated.

    Returns
    -------
    corr : dict
        Dictionary containing correlation between ranks and
        distance to the nearest best solution.
    fig : matplotlib.axes.Axes
        `matplotlib` axes containing a scatter plot of ranks
        against distance to the nearest best solution.

    """
    R: pd.DataFrame = sample['R']
    corrs = {}
    numR = sample['numR']
    ncols = numR
    fig, axs = plt.subplots(ncols=ncols, nrows=1, figsize=(5 * ncols, 5 * 1))
    # fix: with a single rank column plt.subplots returns a bare Axes,
    # which has no .ravel(); atleast_1d makes indexing uniform
    axs = np.atleast_1d(axs).ravel()
    # for each rank...
    for i, col in tqdm(enumerate(R.columns[:-1]), total=numR): # exclude col 'feasible'
        # per-rank table kept on the sample dict as a side effect
        sample[f'RDC_{col}'] = pd.DataFrame()
        RDC: pd.DataFrame = sample[f'RDC_{col}']
        RDC[col] = R[col]
        # check if col is a V (per-constraint) rank
        if sample['V'] is not None and col in sample['V'].columns:
            # determine feasibility per constraint
            # (fix: this was `= 0`, a chained assignment that zeroed out
            # the whole V column instead of comparing it to zero)
            RDC['feasible'] = sample['V'][col] == 0
        else:
            RDC['feasible'] = R['feasible']
        RDC['distance'] = None
        # get solutions with rank == 1
        best = R.query(f'{col} == 1')
        # for rank 1 solution set distance to 0
        RDC.loc[best.index, 'distance'] = 0
        # compute distance only for non-rank-1 solutions
        for t_idx in RDC.query('distance.isna()').index:
            d_nearest_best = pw_dist(sample, t_idx, best.index[0])
            for b_idx in best[1:].index:
                d = pw_dist(sample, t_idx, b_idx)
                if d < d_nearest_best: d_nearest_best = d
            RDC.loc[t_idx, 'distance'] = d_nearest_best

        sp_corr, _ = scipy.stats.spearmanr(RDC[col], RDC['distance'])
        corrs.update({f"rdc_{col}": sp_corr})

        ax = axs[i]
        sns.scatterplot(data = sample[f'RDC_{col}'], x = 'distance', y = col, hue='feasible', hue_order=[True, False], ax = ax)
        ax.set_title(f'RDC (corr = {sp_corr:.2f})')

    return corrs, fig
633
+
634
def pdc(sample: dict, metric: Union[Callable, str] = 'euclidean') -> Tuple[dict, matplotlib.figure.Figure]:
    """Computes pairwise distance correlation and produces scatter plots.

    This feature produces a visual output of scatter plots of pairwise
    distance on the solution space against distance on the objective
    space, violation space, and for each objective, constraint and rank
    individually. The numerical output is the pairwise distance
    Spearman's correlation coefficient.

    Parameters
    ----------
    sample : dict
        A sample containing the at least input files V and D.
    metric : Callable or str, optional
        A metric function or the name of a distance metric as listed
        ``scipy``'s ``pdist`` function
        <https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html>`_.
        If a metric function is defined it must take two solutions and computes distance
        between them of the form ``dist(Xa, Xb) -> d`` where ``Xa`` and ``Xb``
        are ``pandas`` Series representing solutions, by default ``None``.
        For example: ``lambda Xa, Xb: abs(Xa.sum() - Xb.sum())``.

    Returns
    -------
    corr : dict
        Dictionary containing correlation between pairwise distance on the
        solution space and pairwise distance on the objective space, on
        the violation space, and on each objective, violation and rank
        individually.
    fig : matplotlib.figure.Figure
        `matplotlib` axes containing a scatter plot of pairwise distance
        on the solution space against pairwise distance on the objective
        space, on the violation space, and on each objective, violation
        and rank individually.

    Examples
    --------
    >>> from pyxla import util, pdc
    >>> import matplotlib
    >>> sample = util.load_sample('cec2010_c01_2d_F1_V2', test=True)
    >>> corr, plot = pdc(sample)
    >>> type(corr)
    <class 'dict'>
    >>> isinstance(plot, matplotlib.figure.Figure)
    True
    """
    # pdc always needs the full pairwise-distance table
    util.handle_missing_D_file(sample, True, warn=False)

    F = sample['F']
    V = sample['V']
    D = sample['D']
    # using errors='ignore' as the `feasible` column is only present when we have V
    R = sample['R'].drop('feasible', axis=1, errors='ignore')
    PD = pd.DataFrame()

    # pairwise distance in the solution (X) space — the common x-axis
    PD['Xd'] = D['d']

    corrs = {}
    rows = []

    # objective space
    row1 = [{'x': F, 'name':'F'}]

    # violation space
    if util.present(sample, 'V'):
        V = V.drop('feasible', axis=1)
        row1.append({'x': V, 'name':'V'})

    rows.extend(row1)

    # objectives (individually, only when there is more than one)
    if sample['numF'] > 1:
        rows.extend([{'x': F[col], 'name': col} for col in F.columns])

    # violations (individually, only when there is more than one)
    if sample['numV'] > 1:
        rows.extend([{'x': V[col], 'name': col} for col in V])

    # ranks
    rows.extend([{'x': R[col], 'name': f'{col}-rank'} for col in R])

    # prepare figure: at most 3 plots per figure row
    ncols = min(3, len(rows)) #max(2, sample['numF'], sample['numV'], sample['numR'])
    nrows = math.ceil(len(rows) / ncols)
    fig, axs = plt.subplots(ncols=ncols, nrows=nrows, figsize=(5 * ncols, 5 * nrows))
    color = sns.color_palette()[2]
    # NOTE(review): .ravel() assumes len(rows) > 1 so axs is an ndarray — confirm
    axs = axs.ravel()
    unused_axs = []

    for i, plot in enumerate(rows):
        # for col, plot in enumerate(plots):
        ax_lbl = f"{plot['name']}_d"
        PD[ax_lbl] = util.calc_pairwise_dist(plot['x'], metric=metric, representation=sample['representation'])
        corr, _ = scipy.stats.spearmanr(PD['Xd'], PD[ax_lbl])
        corrs.update({f"pdc_{plot['name']}": corr}) # fix .item()

        ax = sns.scatterplot(x=PD['Xd'], y=PD[ax_lbl], ax=axs[i], color=color)
        ax.set(title=f"X_d vs {plot['name']}_d (corr = {corr:.2f})")

        # unused_axs.extend([(i, c) for c in range(col + 1, ncols)])

    # turn off unused axes (i is the index of the last plotted row)
    if i + 1 < ncols * nrows:
        for unused_ax in axs[i + 1:]: unused_ax.set_axis_off()
    # for ax in unused_axs: axs[ax].set_axis_off()

    fig.suptitle(f"PDC for {sample['name']}", y=1.04)
    plt.tight_layout()

    return corrs, fig
744
+
745
def nfc(sample: dict) -> Tuple[dict, matplotlib.axes.Axes]:
    """Computes neighbouring solutions' objective values correlation.

    This feature produces a scatter plot of the objective values between
    neighbours for each objective (fitness cloud), for all solutions or
    for feasible solutions only. The plot is divided a broken line through
    origin such that lines above line are improving neighbours, those
    below are deteriorating neighbours while those on the line are neutral
    neighbours. The numerical output produced is a list of Spearman's
    correlation coefficients for each scatter plot.

    Parameters
    ----------
    sample : dict
        A sample containing input files i.e `F`, `N`.

    Returns
    -------
    corr : dict
        Dictionary containing correlation between objective values of
        solutions and the objective values of their neighbours, for all
        solutions and for feasible solutions only.
    fig : matplotlib.axes.Axes
        `matplotlib` axes each containing a scatter plot of objective
        values of solutions against the objective values of their
        neighbours, for all solutions and for feasible solutions only.

    Raises
    ------
    Exception
        Raised if both N and X inputs are absent, as neighbourhood
        cannot be determined without having either.

    Examples
    --------
    >>> from pyxla import util, nfc
    >>> import matplotlib
    >>> sample = util.load_sample('nk_n14_k2_id5_F1_V1', test=True)
    >>> corrs, plot = nfc(sample)
    >>> type(corrs)
    <class 'dict'>
    >>> isinstance(plot, matplotlib.figure.Figure)
    True
    """

    util.handle_missing_N_file(sample, True, warn=False)

    F = sample['F']
    numF = sample['numF']
    V = sample['V']
    N = sample['N']

    corrs = {}
    # each entry of `rows` becomes one figure row: all solutions, then
    # (when available) feasible solutions only
    rows = []

    rows.append({'name': 'all', 'nfc': N})

    # for feasible solutions only
    if util.present(sample, 'V'):
        # restrict neighbour pairs to those whose FIRST endpoint is feasible
        N_indexed = N.set_index('id1', drop=False)
        V = V.loc[N_indexed.index]
        feas_idxs = V[V['feasible'] == True].index
        # add plot data to rows only if there are feasible solutions
        if len(feas_idxs) > 0:
            feas_N = N_indexed.loc[feas_idxs]
            feas_N.reset_index(drop=True, inplace=True)
            rows.append({'name': 'feas.', 'nfc': feas_N})

    ncols = sample['numF']
    nrows = len(rows)
    fig, axs = plt.subplots(ncols=ncols, nrows=nrows, figsize=(5 * ncols, 5 * nrows))

    # change color palette (NOTE: mutates seaborn's global palette state)
    sns.set_palette('deep')
    sns.set_palette(reversed(sns.color_palette()), 10)
    palette = sns.color_palette()

    for row, data in enumerate(rows):
        NFC = data['nfc']
        for col, f in enumerate(tqdm(F.columns)):
            x, y = f'{f}', f'neighbour {f}'
            # collate fitness values pairs for neighbour pairs
            NFC[x] = F[f].loc[NFC['id1']].tolist()
            NFC[y] = F[f].loc[NFC['id2']].tolist()

            corr, _ = scipy.stats.spearmanr(NFC[x], NFC[y])
            # spearmanr may return a numpy scalar; normalise to a Python float
            corr = corr if isinstance(corr, float) else corr.item()
            corrs.update({f"nfc_{data['name']}_X_for_{f}": corr})

            # determine axis coords. (axs shape depends on nrows/ncols)
            if len(rows) > 1:
                ax = axs[row, col] if numF > 1 else axs[row]
            else:
                ax = axs[col] if numF > 1 else axs

            ax = sns.scatterplot(x=NFC[x], y=NFC[y], ax=ax, color=palette[0])
            ax.set(title=f'NFC for {f} {data['name']} X (corr = {corr:.2f})')
            sns.regplot(x=NFC[x], y=NFC[y], ax=ax, scatter=False, color=palette[1])

            # common limits so the diagonal is a true y = x reference
            limits = [
                np.min([ax.get_xlim(), ax.get_ylim()]),
                np.max([ax.get_xlim(), ax.get_ylim()]),
            ]

            # plot diagonal line
            ax.plot(limits, limits, '--k')

            util.equalize_axes_(ax)

    fig.suptitle(f"NFC for {sample['name']}", y=1.04)
    plt.tight_layout()
    return corrs, fig
857
+
858
def ncf(sample: dict) -> Tuple[dict, matplotlib.axes.Axes]:
    """Neighbouring change in feasibility

    This feature produces as visual output a bar chart of proportion of feasible solutions
    with infeasible neighbours and infeasible solutions with feasible
    neighbours. The corresponding numerical outputs are the respective proportions.

    Parameters
    ----------
    sample : dict
        A sample containing input files i.e `F`, `N`.

    Returns
    -------
    proportions : dict
        Dictionary containing the numerical proportions as defined above.
    fig : matplotlib.axes.Axes
        `matplotlib` figure with the bar chart visually illustrating the
        proportions defined above.

    Raises
    ------
    Exception
        Raises an exception if no V file is provided as feasibility is
        undefined without the V file.
    Exception
        Raises an exception if the sample only has solutions belong to
        only one class of feasibility i.e if all solutions are infeasible
        or if all solutions are feasible.
    Exception
        Raised if both N and X inputs are absent, as neighbourhood
        cannot be determined without having either.

    Examples
    --------
    >>> from pyxla import util, ncf
    >>> sample = util.load_sample('nk_n14_k2_id5_F1_V1', test=True)
    >>> proportions, fig = ncf(sample)
    >>> proportions # doctest: +SKIP
    """

    if not util.present(sample, 'V'): raise Exception('V is required. Please provide V.')

    util.handle_missing_N_file(sample, True, warn=False)

    proportions = {}

    V = sample['V']

    N = sample['N']
    # look up feasibility for both endpoints of every neighbour pair
    N['id1_feasible'] = V.loc[N['id1']]['feasible'].to_numpy()
    N['id2_feasible'] = V.loc[N['id2']]['feasible'].to_numpy()

    # XOR counts pairs whose endpoints differ in feasibility
    inversions = (N['id1_feasible'] ^ N['id2_feasible']).sum()

    # NOTE(review): 'feasbility' typo below is a load-bearing dict key
    # (it is also the bar label) — fix only in a coordinated change
    proportions['discontiguous feasbility'] = inversions / len(N)
    proportions['contiguous feasibility'] = 1 - proportions['discontiguous feasbility']

    # sns.barplot returns an Axes (named `fig` here)
    fig = sns.barplot(x=proportions.keys(), y=proportions.values(), hue=proportions.keys())
    fig.set_title(f'NΔFeas for {sample['name']}')
    fig.set_ylim(0, 1)
    fig.set_ylabel('proportion')

    # write the proportion value onto each bar
    for container in fig.containers:
        fig.bar_label(container, label_type='center', padding=5)

    plt.tight_layout()

    return proportions, fig
927
+
928
def n_flat(sample: dict, bounds: Union[List[float], float] = 0) -> Tuple[dict, matplotlib.axes.Axes]:
    """Neutral degree of neighbours.

    The visual output of this feature is a scatterplot of the neutral
    degree of each solution against the neutral degree of its neighbours
    with respect to each objective function. The numerical output is the
    corresponding Spearman correlation coefficients for each scatterplot.
    The neutral degree of a solution is the number of its neutral
    neighbors [1]_.

    Parameters
    ----------
    sample : dict
        A sample containing input files i.e ``F``, ``N``.
    bounds : Union[List[float], float]
        Bound(s) for inferring equality of objective values, by default ``0``.
        A scalar is replicated for every objective.

    Returns
    -------
    corr : dict
        Dictionary containing correlation between the neutral degree of
        each solutions against neutral degree of neighbours for each
        objective.
    fig : matplotlib.figure.Figure
        ``matplotlib`` figure containing axes for each objective with each
        axis containing a scatterplot of the neutral degree of each
        solution against neutral degree of neighbours.

    References
    ----------
    .. [1] S. Verel, G. Ochoa, and M. Tomassini, 'Local Optima Networks of NK Landscapes With Neutrality', Evolutionary Computation, IEEE Transactions on, vol. 15, Jul. 2011.

    Examples
    --------
    >>> from pyxla import util, n_flat
    >>> import matplotlib
    >>> sample = util.load_sample('nk_n14_k2_id5_F1_V1', test=True)
    >>> corrs, plot = n_flat(sample, 0.01)
    >>> type(corrs)
    <class 'dict'>
    >>> isinstance(plot, matplotlib.figure.Figure)
    True
    """

    util.handle_missing_N_file(sample, True, warn=False)

    N = sample['N']
    F = sample['F']
    NDN = pd.DataFrame()  # neutral degree of neighbours
    N_indexed = N.set_index('id1')
    numF = sample['numF']

    # replicate equality range for each objective
    if not isinstance(bounds, Iterable): bounds = [bounds] * numF

    corrs = {}
    fig, axs = plt.subplots(ncols=numF, figsize=(5 * numF, 5))

    for i, f in enumerate(F.columns):
        n_degrees = pd.DataFrame()
        # get all soln. indices in the N file
        # (np.concatenate rather than the NumPy-2-only np.concat alias)
        n_degrees['id'] = np.unique(np.concatenate([N['id1'].unique(), N['id2'].unique()]))
        n_degrees['degree'] = 0
        n_degrees.set_index('id', inplace=True)

        # compute neutral degree for each soln. with a neighbour(s)
        for idx in N['id1'].unique():
            obj_val = F.loc[idx][f]
            # atleast_1d guards the single-neighbour case, where .loc
            # yields a scalar instead of a Series
            neighbours = np.atleast_1d(N_indexed.loc[idx]['id2'])
            # neutrality is judged on objective f ONLY; the previous
            # version compared all objective columns at once and then
            # dropna()'d, under-counting neutral neighbours whenever
            # the sample has more than one objective
            neighbour_vals = F[f].loc[neighbours]
            n_degrees.loc[idx, 'degree'] = int((abs(neighbour_vals - obj_val) <= bounds[i]).sum())

        x = 'neutral degree'
        y = 'neutral degree of neighbour'
        NDN[x] = n_degrees.loc[N['id1']]['degree'].to_numpy()
        NDN[y] = n_degrees.loc[N['id2']]['degree'].to_numpy()

        corr, _ = scipy.stats.spearmanr(NDN[x], NDN[y])
        corrs.update({f"n_flat_{f}": corr})

        ax = axs[i] if len(F.columns) > 1 else axs
        ax = sns.scatterplot(x=NDN[x], y=NDN[y], ax=ax, alpha=0.5)
        ax.set(title=f'N_flat for {f} X (corr = {corr:.2f})')
        sns.regplot(x=NDN[x], y=NDN[y], ax=ax, scatter=False)

    fig.suptitle(f"N_flat for {sample['name']}", y=1.04)

    plt.tight_layout()
    return corrs, fig
1020
+
1021
+
1022
def nvc(sample: dict) -> Tuple[dict, matplotlib.figure.Figure]:
    """Computes neighbouring solutions' violation values correlation (NVC)

    This feature produces as visual output, Scatter plot of the violation
    values between neighbours for each constraint, for infeasible solutions
    only. A regression line is plotted to indicate correlation. The plot
    is divided by a broken line through the origin such that points above the
    line are improving neighbours, those below are deteriorating neighbours
    while those on the line are neutral neighbours. The numerical output for
    this feature is a set of Spearman's correlation coefficients for each
    scatter plot.

    Parameters
    ----------
    sample : dict
        A sample containing input files i.e `V`, `N`.

    Returns
    -------
    corr : dict
        Dictionary containing correlation between violation values of
        solutions and the violation values of their neighbours, for
        infeasible solutions only.
    fig : matplotlib.figure.Figure
        `matplotlib` figure whose axes each contain a scatter plot of
        violation values of solutions against the violation values of their
        neighbours for infeasible solutions only.

    Raises
    ------
    Exception
        Raised if no V file is provided, as violation values are
        undefined without it.

    Examples
    --------
    >>> from pyxla import util, nvc
    >>> import matplotlib
    >>> sample = util.load_sample('nk_n14_k2_id5_F3_V2', test=True)
    >>> corrs, plot = nvc(sample)
    >>> type(corrs)
    <class 'dict'>
    >>> isinstance(plot, matplotlib.figure.Figure)
    True
    """

    if not util.present(sample, 'V'):
        raise Exception('V is required. Please provide V.')

    util.handle_missing_N_file(sample, True, warn=False)

    V = sample['V'].drop('feasible', axis=1)
    numV = sample['numV']
    N = sample['N']
    N_indexed = N.set_index('id1')

    # get all solutions represented in N file
    X_in_N_id1 = N['id1'].unique()
    X_in_N_id2 = N['id2'].unique()
    # np.concatenate rather than the NumPy-2-only np.concat alias
    X_in_N = np.unique(np.concatenate([X_in_N_id1, X_in_N_id2]))
    # filter V to only have Xs in N
    V = V.loc[X_in_N]
    assert len(V) == len(X_in_N)

    corrs = {}
    fig, axs = plt.subplots(ncols=numV, figsize=(5 * numV, 5))

    # change color palette
    sns.set_palette('deep')
    sns.set_palette(reversed(sns.color_palette()), 10)
    palette = sns.color_palette()

    for i, v in enumerate(tqdm(V.columns)):
        NVC = pd.DataFrame()
        # solutions (appearing as id1) that violate constraint v
        infeas = V.loc[X_in_N_id1].query(f"{v} != 0").index.to_numpy()
        # reduce N to have `id1's` that are infeasible
        N_filtered = N_indexed.loc[infeas].reset_index()
        x, y = f"{v}", f"neighbour {v}"
        NVC[x] = V.loc[N_filtered['id1'].to_numpy()][v].to_numpy()
        NVC[y] = V.loc[N_filtered['id2'].to_numpy()][v].to_numpy()
        assert len(NVC[x]) == len(NVC[y]) == len(N_filtered)

        corr, _ = scipy.stats.spearmanr(NVC[x], NVC[y])
        # spearmanr may return a 0-d array; normalize to a plain float
        corr = corr if isinstance(corr, float) else corr.item()
        corrs.update({f"NVC_for_{v}": f'{corr:.4f}'})

        ax = axs[i] if numV > 1 else axs

        ax = sns.scatterplot(x=NVC[x], y=NVC[y], ax=ax, color=palette[0])
        ax.set(title=f'NVC for {v} (corr = {corr:.2f})')
        sns.regplot(x=NVC[x], y=NVC[y], ax=ax, scatter=False, color=palette[1])

        # plot diagonal line spanning the joint extent of both axes
        limits = [
            np.min([ax.get_xlim(), ax.get_ylim()]),
            np.max([ax.get_xlim(), ax.get_ylim()]),
        ]
        ax.plot(limits, limits, '--k')

        util.equalize_axes_(ax)

    fig.suptitle(f"NVC for {sample['name']}", y=1.04)
    plt.tight_layout()

    return corrs, fig
1122
+
1123
def nrc(sample: dict) -> Tuple[dict, matplotlib.figure.Figure]:
    """Computes neighbouring solutions' ranks correlation (NRC)

    This feature produces as visual output, Scatter plot of the ranks
    between neighbours. Ranks are based on Pareto ranks for objectives,
    violations, the combination of objectives and violations, and Deb
    feasibility rank.
    A regression line is plotted to indicate correlation. The plot
    is divided by a broken line through the origin such that points above the
    line are improving neighbours, those below are deteriorating neighbours
    while those on the line are neutral neighbours. The numerical output for
    this feature is a set of Spearman's correlation coefficients for each
    scatter plot.

    Parameters
    ----------
    sample : dict
        A sample containing input files i.e `R`, `N`.

    Returns
    -------
    corr : dict
        Dictionary containing correlation between ranks of
        solutions and the ranks of their neighbours.
    fig : matplotlib.figure.Figure
        `matplotlib` figure whose axes each contain a scatter plot of
        ranks of solutions against the ranks of their neighbours.

    Examples
    --------
    >>> from pyxla import util, nrc
    >>> import matplotlib
    >>> sample = util.load_sample('nk_n14_k2_id5_F3_V2', test=True)
    >>> corrs, plot = nrc(sample)
    >>> type(corrs)
    <class 'dict'>
    >>> isinstance(plot, matplotlib.figure.Figure)
    True
    """

    util.handle_missing_N_file(sample, True, warn=False)

    R = sample['R']
    # 'feasible' is a flag column, not a rank; drop it when V was supplied
    R = R if not util.present(sample, 'V') else R.drop('feasible', axis=1)
    numR = sample['numR']
    N = sample['N']

    # get all solutions represented in N file
    X_in_N_id1 = N['id1'].unique()
    X_in_N_id2 = N['id2'].unique()
    # np.concatenate rather than the NumPy-2-only np.concat alias
    X_in_N = np.unique(np.concatenate([X_in_N_id1, X_in_N_id2]))
    # filter R to only have Xs in N
    R = R.loc[X_in_N]
    assert len(R) == len(X_in_N)

    corrs = {}
    ncols = min(3, numR)
    nrows = math.ceil(numR / ncols)
    fig, axs = plt.subplots(ncols=ncols, nrows=nrows, figsize=(5 * ncols, 5 * nrows))
    axs = axs.ravel() if numR > 1 else axs
    # change color palette
    sns.set_palette('deep')
    sns.set_palette(reversed(sns.color_palette()), 10)
    palette = sns.color_palette()

    for i, r in enumerate(tqdm(R.columns)):
        NRC = pd.DataFrame()
        x, y = f"{r} rank", f"neighbour {r} rank"
        NRC[x] = R.loc[N['id1'].to_numpy()][r].to_numpy()
        NRC[y] = R.loc[N['id2'].to_numpy()][r].to_numpy()

        corr, _ = scipy.stats.spearmanr(NRC[x], NRC[y])
        # spearmanr may return a 0-d array; normalize to a plain float
        corr = corr if isinstance(corr, float) else corr.item()
        corrs.update({f"NRC_for_{r}_ranks": f'{corr:.4f}'})
        ax = axs[i] if numR > 1 else axs

        ax = sns.scatterplot(x=NRC[x], y=NRC[y], ax=ax, color=palette[0])
        ax.set(title=f'NRC for {r} ranks (corr = {corr:.2f})')
        sns.regplot(x=NRC[x], y=NRC[y], ax=ax, scatter=False, color=palette[1])

        # plot diagonal line spanning the joint extent of both axes
        limits = [
            np.min([ax.get_xlim(), ax.get_ylim()]),
            np.max([ax.get_xlim(), ax.get_ylim()]),
        ]
        ax.plot(limits, limits, '--k')

        util.equalize_axes_(ax)

    # hide grid cells beyond numR (consistent with disp_best)
    if numR > 1:
        for unused_ax in axs[numR:]:
            unused_ax.set_axis_off()

    fig.suptitle(f"NRC for {sample['name']}", y=1.04)

    plt.tight_layout()

    return corrs, fig
1219
+
1220
def disp_best(sample: dict, init_percentage: int = 10, growth_factor: int = 2):
    """Dispersion of best solutions

    This feature analyses the dispersion amongst best solutions. It
    produces as visual output scatter plots showing the distribution of
    pairwise distances between solutions for increasing sample sizes of
    best solutions, where 'best' is with respect to different objectives,
    constraints and ranks. The corresponding numerical outputs are the
    dispersion metrics [1]_ with respect to each objective, constraint and
    rank.
    Positive dispersion metric values indicate the presence of funnels while
    negative values indicate the presence of global structure.

    Parameters
    ----------
    sample : dict
        A sample containing input files i.e `F`, `V`.
    init_percentage : int
        Smallest sub-sample size as a percentage of the sample, by default ``10``.
    growth_factor : int
        Multiplier between successive sub-sample percentages, by default ``2``.

    Returns
    -------
    disp_metrics : dict
        Dispersion metric per rank column.
    fig : matplotlib.figure.Figure
        Figure with one scatter plot of pairwise distances per rank column.

    Raises
    ------
    Exception
        Raised when `init_percentage` is not below 100.

    References
    ----------
    .. [1] M. Lunacek and D. Whitley, 'The dispersion metric and the CMA evolution strategy', in Proceedings of the 8th annual conference on Genetic and evolutionary computation, 2006, pp. 477-484.

    """

    if init_percentage >= 100:
        raise Exception('Initial percentage for sub-sampling must be less than 100%.')

    util.handle_missing_D_file(sample, True, warn=False)

    D = sample['D']
    R: pd.DataFrame = sample['R']
    numR = sample['numR']
    R = R if not util.present(sample, 'V') else R.drop('feasible', axis=1)

    disp_metrics = {}
    ncols = min(3, numR)
    nrows = math.ceil(numR / ncols)
    fig, axs = plt.subplots(ncols=ncols, nrows=nrows, figsize=(ncols * 5, nrows * 5))
    axs = axs.ravel() if numR > 1 else axs

    # geometric schedule of sub-sample percentages: init, init*g, ... < 100
    sample_sizes = [init_percentage]
    while sample_sizes[-1] * growth_factor < 100:
        sample_sizes.append(sample_sizes[-1] * growth_factor)

    for i, r in enumerate(R.columns):
        # renamed from `sorted` to avoid shadowing the builtin
        ranked = R.sort_values(by=r)
        pw_distances_avgs = []
        plot_data = pd.DataFrame(columns=['pairwise distances', 'sample size'])

        for n in sample_sizes:
            nth = int(n / 100 * len(R))
            best_n = ranked[:nth].index.sort_values()
            # materialize pairs explicitly; .loc on a lazy iterator is
            # unreliable across pandas versions
            pairs = list(itertools.combinations(best_n, 2))
            pw_distances = D.loc[pairs]['d'].to_numpy()

            data = pd.DataFrame()
            data['pairwise distances'] = pw_distances
            data['sample size'] = n
            plot_data = data if plot_data.empty else pd.concat([plot_data, data])

            pw_distances_avgs.append(pw_distances.mean())

        # dispersion metric: mean pairwise distance of the smallest
        # sub-sample minus that of the largest
        disp_metric = pw_distances_avgs[0] - pw_distances_avgs[-1]
        disp_metrics.update({r: disp_metric})

        # plot
        ax = axs[i] if numR > 1 else axs
        sns.scatterplot(x=plot_data['sample size'], y=plot_data['pairwise distances'], ax=ax)
        ax.set(title=f'disp_best for {r} (metric = {disp_metric:.4f})')

        # custom function scale matching growth factor
        forward = lambda x: np.log(x / init_percentage) / np.log(growth_factor)
        inverse = lambda x: init_percentage * (growth_factor ** x)
        ax.set_xscale('function', functions=(forward, inverse))

        # set ticks manually
        ax.set_xticks(sample_sizes)
        ax.set_xticklabels([str(t) for t in sample_sizes])

    # turn off unused axes ONCE, after all ranks are plotted; the
    # previous placement inside the loop hid axes that later
    # iterations still drew on
    if numR > 1:
        for unused_ax in axs[numR:]:
            unused_ax.set_axis_off()

    fig.suptitle(f"disp_best for {sample['name']}", y=1.04)

    plt.tight_layout()

    return disp_metrics, fig
1310
+
1311
+
1312
def X_imp(sample: dict, n_repeats=10, train_proportion = 0.7, binary: bool = False, seed: float = None):
    """Variable importance with respect to each objective.

    Produces (i) a point-biserial correlation matrix between the decision
    variables and each objective and (ii) permutation importances of the
    (min-max normalized) variables under a per-objective LassoLars
    surrogate model. The visual output is a 2-row grid of bar charts:
    correlations on top, permutation importances (with std error bars)
    below.

    Parameters
    ----------
    sample : dict
        A sample containing input files i.e `X`, `F`.
    n_repeats : int
        Number of permutation-importance shuffles, by default ``10``.
    train_proportion : float
        Fraction of the sample used to fit the surrogate, by default ``0.7``.
    binary : bool
        Currently unused.
    seed : float
        Random state passed to the split and the permutation importance.

    Returns
    -------
    corr_matrix : pd.DataFrame
        Point-biserial correlations, objectives as rows, variables as columns.
    x_imp_ranks : dict
        Per-objective DataFrame of variables sorted by permutation importance.
    fig : matplotlib.figure.Figure
        The 2 x numF grid of bar charts.

    Raises
    ------
    Exception
        Raised if no X file is provided.
    """
    if not util.present(sample, 'X'):
        raise Exception('X is required. Please provide an X file.')

    # work on a copy so the caller's sample['X'] is never mutated
    # (the previous version appended/dropped objective columns in place)
    X: pd.DataFrame = sample['X'].copy()
    F = sample['F']
    numF = sample['numF']

    x_imp_ranks = {}

    # temporarily append the objectives so one corr() call covers X-vs-F
    for f in F.columns:
        X[f] = F[f]

    # NOTE(review): point-biserial assumes one binary variable; the
    # `binary` flag is accepted but never consulted -- confirm intent
    pointbiserialr = lambda x, y: scipy.stats.pointbiserialr(y, x)[0]

    corr_matrix = X.corr(method=pointbiserialr)

    # remove F cols from the working copy again
    X.drop(F.columns, axis=1, inplace=True)

    # keep only objective rows x variable columns
    corr_matrix = corr_matrix.drop(index=X.columns).drop(columns=F.columns)

    fig, axs = plt.subplots(ncols=numF, nrows=2, figsize=(numF * 5, 10))

    for i, f in enumerate(F.columns):
        ax = axs[0, i] if numF > 1 else axs[i]
        ax = sns.barplot(corr_matrix.loc[f], ax=ax)
        ax.set(title=f"Correlation of X and {f}")

    # min-max normalize once; the operation is idempotent, so it is
    # hoisted out of the per-objective loop below
    X = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))

    # label built outside the f-string: nesting the same quote char
    # inside an f-string requires Python 3.12+
    r2_label = r"$R^2$"

    for i, f in enumerate(F.columns):
        x_imp = pd.DataFrame()

        X_train, X_test, y_train, y_test = train_test_split(
            X, F[f], train_size=train_proportion, random_state=seed)

        model = LassoLars(alpha=0.001).fit(X_train, y_train)
        r2 = model.score(X_test, y_test)

        imp = permutation_importance(model, X_test, y_test,
                                     n_repeats=n_repeats,
                                     random_state=seed,
                                     scoring='r2')
        # sort importance means desc.
        imp_idxs = imp.importances_mean.argsort()[::-1]

        x_imp['X'] = X.columns[imp_idxs]
        x_imp['importance'] = imp.importances_mean[imp_idxs]
        x_imp['std'] = imp.importances_std[imp_idxs]
        x_imp['rank'] = x_imp.index.to_numpy() + 1
        x_imp_ranks[f] = x_imp

        ax = sns.barplot(x_imp, x='X', y='importance', ax=axs[1, i] if numF > 1 else axs[1])
        ax.errorbar(
            x=range(len(x_imp)),
            y=x_imp['importance'],
            yerr=x_imp['std'],
            fmt='none',
            c='black',
        )
        ax.set(title=f"X_imp for {f} (Validation {r2_label} = {r2:.4f})")

    fig.suptitle(f"X_imp for {sample['name']}")

    plt.tight_layout()

    return corr_matrix, x_imp_ranks, fig