ddsimca 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ddsimca/ddsimca.py ADDED
@@ -0,0 +1,1309 @@
1
+ import math
2
+ import itertools
3
+ import pandas as pd
4
+ import numpy as np
5
+ import numpy.typing as npt
6
+ import matplotlib.patches as mpatches
7
+ import matplotlib.ticker as mticker
8
+ import matplotlib.pyplot as plt
9
+
10
+ from matplotlib.axes import Axes
11
+ from typing import TypeVar
12
+ from scipy.stats.distributions import chi2, norm, binom
13
+
14
# maps indices to names of roles and decisions
# (the position of a name in the list is the numeric code stored in the
# role / decision matrices of a results object)
ind_to_roles = ['regular', 'extreme', 'outlier', 'alien', 'external']
ind_to_decisions = ['out', 'in']

# annotations for correct typing
Array2D = npt.NDArray[np.float64]

# default plot colors and markers
MAIN_COLOR = "tab:blue"

# colors for figures of merit, keyed by the short name of the figure of merit
COLORS_FOMS = {
    "eff": "tab:purple",
    "sens": "tab:blue",
    "spec": "tab:cyan",
    "sel": "tab:cyan",
    "acc": "gray"
}

# colors for object roles, keyed by the names listed in `ind_to_roles`
COLORS_ROLES = {
    "regular": "tab:blue",
    "extreme": "tab:orange",
    "outlier": "tab:red",
    "alien": "tab:blue",
    "external": "tab:red"
}

# colors for decisions, keyed by the names listed in `ind_to_decisions`
COLORS_DECISIONS = {
    "in": "tab:blue",
    "out": "tab:red",
}

MARKERS = ['o', 's', '^', 'd', '>', 'h', 'p', 'v']

# a small number to add to sd
# (also used as a lower bound for squared means and variances)
EPSILON = 1e-12
49
+
50
+
51
+
52
+ #######################################
53
+ # Auxiliary methods for plots #
54
+ #######################################
55
+
56
def get_group_colors(groups:list) -> dict:
    """
    Build a mapping from group name to plot color.

    One, two or three groups get fixed, hand-picked palettes. Up to ten groups are
    colored from the "tab10" colormap and up to twenty from "tab20".

    Raises `ValueError` if more than 20 groups are provided.
    """
    count = len(groups)

    if count > 20:
        raise ValueError("Number of groups is too large (>20) for distingushing them with colors.")

    fixed_palettes = {
        1: ['tab:blue'],
        2: ['tab:blue', 'tab:red'],
        3: ['tab:blue', 'tab:orange', 'tab:red'],
    }

    if count in fixed_palettes:
        palette = fixed_palettes[count]
    else:
        # the colormap is requested with exactly `count` entries
        cmap_name = "tab10" if count <= 10 else "tab20"
        cmap = plt.get_cmap(cmap_name, count)
        palette = [cmap(i) for i in range(count)]

    return dict(zip(groups, palette))
78
+
79
def plot_axes(ax:Axes, type:str = "p") -> None:
    """
    Draw thin gray reference lines through the (0, 0) origin of a plot.

    The horizontal line is always drawn, the vertical one only if `type = "p"`.
    """
    line_style = {"color": "#a0a0a0", "linewidth": 0.75, "zorder": 1}

    ax.axhline(0, **line_style)
    if type == "p":
        ax.axvline(0, **line_style)
86
+
87
def plot_grid(ax:Axes) -> None:
    """
    Show a light gray dashed grid and keep it below the other plot elements.
    """
    grid_style = {"linestyle": "--", "color": "#e0e0e0"}
    ax.set_axisbelow(True)
    ax.grid(True, **grid_style)
93
+
94
def plot_labels(ax:Axes, x:npt.NDArray[np.float64], y:npt.NDArray[np.float64],
                labels:list, dy:float = 0 ) -> None:
    """
    Put a gray, horizontally centered text label at every point (x, y + dy).
    """
    for idx, x_pos in enumerate(x):
        y_pos = y[idx] + dy
        ax.text(x_pos, y_pos, labels[idx], color="gray", ha = "center")
101
+
102
def plot_compstats(ax:Axes, y:npt.NDArray, color:str = MAIN_COLOR,
                   marker:str = "o", label:str = "") -> None:
    """
    Draw a line-and-marker plot of a statistic against the number of components.

    Values in `y` are plotted against 1, 2, ..., len(y); markers are drawn with a
    white face and an edge of the line color.
    """
    n_values = len(y)
    components = list(range(1, n_values + 1))

    ax.plot(
        components, y,
        color = color,
        marker = marker,
        markeredgecolor = color,
        markerfacecolor = "#ffffff",
        label = label,
    )

    # leave half-empty slots on both sides of the component axis
    ax.set_xlim((0, n_values + 1))
    ax.set_xlabel("Number of PCs")
    plot_grid(ax)
114
+
115
+
116
+
117
+ #######################################
118
+ # Auxiliary methods for calculations #
119
+ #######################################
120
+
121
def get_limits(u0:float, Nu:float, CLe:float = 0.95, CLo:float = 0.9983) -> tuple[float, float]:
    """
    Return critical limits for extreme objects and for outliers.

    Each limit is the chi-square quantile with `Nu` degrees of freedom taken at the
    corresponding confidence level (`CLe` for extremes, `CLo` for outliers) and
    rescaled by `u0 / Nu`.
    """
    def scaled_quantile(level:float) -> float:
        return float(chi2.ppf(level, Nu)) * u0 / Nu

    extreme_limit = scaled_quantile(CLe)
    outlier_limit = scaled_quantile(CLo)
    return (extreme_limit, outlier_limit)
128
+
129
def get_distparams(U: npt.NDArray[np.float64], type:str = 'classic') -> tuple[npt.NDArray[np.float64],
                                                                           npt.NDArray[np.float64]]:
    """
    Computes parameters of a scaled chi-square distribution that approximate the distribution of the distance values.

    Parameters:
    -----------
    `U` : A matrix (2D NumPy array) of distances, parameters are computed for each column.
    `type` : `'classic'` for the method of moments (mean and variance), any other value selects
        the robust estimator based on the median and the interquartile range.

    Returns
    -------
    tuple
        A tuple containing two arrays with one value per column of `U`:
        - `u0`: The scaling factor.
        - `Nu`: The estimated number of degrees of freedom.

    Raises
    ------
    `ValueError`
        If `U` is not a 2D array.

    Notes
    -----
    Classic: `Nu = round(2 * u0^2 / vu)` where `u0` is the column mean and `vu` the sample variance.
    Both `u0^2` and `vu` are bounded from below by `EPSILON` and `Nu` is limited to [1, 250].

    Robust: `Nu` is derived from the ratio IQR / median. Ratios above 2.685592117 give `Nu = 1`,
    ratios below 0.194565995 give `Nu = 100`. Columns where the ratio is undefined (0 / 0) also
    get `Nu = 1`, so that the chi-square quantiles used to compute `u0` stay finite.
    """

    if len(U.shape) != 2:
        raise ValueError("Argument U must be a matrix (2D array).")

    if type == 'classic':
        u0 = U.mean(axis = 0)
        vu = U.std(axis = 0, ddof = 1) ** 2

        # lower bounds protect the ratio below from division by zero
        u02 = np.maximum(u0 ** 2, EPSILON)
        vu = np.maximum(vu, EPSILON)

        Nu = np.clip(np.round(2 * u02 / vu), 1, 250)
        return (u0, Nu)

    Mu = np.median(U, axis = 0)
    q25, q75 = np.quantile(U, [0.25, 0.75], axis = 0)
    Su = q75 - q25

    # zero medians are handled by the masks below, no need to warn about them
    with np.errstate(divide = "ignore", invalid = "ignore"):
        RM = Su / Mu

    # default of 1 covers large ratios (> 2.685592117) and undefined ones
    Nu = np.ones(len(RM))

    # the formula is only evaluated where its result is actually used
    mid = (RM >= 0.194565995) & (RM <= 2.685592117)
    Nu[mid] = np.round(np.exp((1.380948 * np.log(2.68631 / RM[mid])) ** 1.185785))
    Nu[RM < 0.194565995] = 100

    tQ2 = chi2.ppf(0.50, Nu)
    tIQR = chi2.ppf(0.75, Nu) - chi2.ppf(0.25, Nu)
    u0 = 0.5 * Nu * (Mu / tQ2 + Su / tIQR)

    return (u0, Nu)
187
+
188
+ def process_members(f:npt.NDArray[np.float64], eCrit:float, oCrit:float, roles:npt.NDArray[np.int16],
189
+ ind:npt.NDArray[np.int64]|None) -> tuple[int, int]:
190
+ """
191
+ Process objects as target class members by assigning corresponding roles.
192
+
193
+ Parameters:
194
+
195
+ `f`: vector with full distance values for each object.
196
+ `eCrit`: critical f-value for extreme objects.
197
+ `oCrit`: critical f-value for outliers.
198
+ `roles`: list with numeric indices for roles (will be changed in place).
199
+ `ind`: array with logical values which point on class members.
200
+
201
+ Returns:
202
+ --------
203
+ Tuple with number of true positives (TP) and false negatives (FN). In addition to that the function modifies list with roles by assigning `0` to regular objects, `1` to extreme and `2` to outliers.
204
+ """
205
+ if ind is None or ind.sum() < 1:
206
+ return (0, 0)
207
+
208
+
209
+ reg_ind = ind & (f <= eCrit)
210
+ out_ind = ind & (f > oCrit)
211
+ ext_ind = ind & (f > eCrit) & (f < oCrit)
212
+
213
+ roles[reg_ind] = 0
214
+ roles[ext_ind] = 1
215
+ roles[out_ind] = 2
216
+
217
+ TP = reg_ind.sum()
218
+ FN = ind.sum() - TP
219
+ return (TP, FN)
220
+
221
+ def process_strangers(f:npt.NDArray[np.float64], k:float, eCrit:float, roles:npt.NDArray[np.int16],
222
+ ind:npt.NDArray[np.bool_]|None) -> tuple[int, int, float, float, float, float, float, float, float, float]:
223
+ """
224
+ Process objects as non-members by assigning corresponding roles.
225
+
226
+ Parameters:
227
+ -----------
228
+ `f` - vector with full distance values for each object.
229
+ `k` - number of degrees of freedom for full distance (Nf).
230
+ `eCrit` - critical f-value for extreme objects.
231
+ `roles` - list with numeric indices for roles (will be changed in place).
232
+ `ind` - array with logical values which point on non-members.
233
+
234
+ Returns:
235
+ --------
236
+ Tuple with following values:
237
+ * `TN` - number of true negatives.
238
+ * `FP` - number of false positives.
239
+ * `beta` - probability to make Type II error.
240
+ * `s`, `f0`, `hz`, `Mz`, `Sz`, `k`, `m` - parameters used to fit a non-central chi-square distribution.
241
+
242
+ In addition to that, the function modifies list with roles by assigning `3` to alien objects and `4` to external objects.
243
+ """
244
+
245
+ if ind is None or ind.sum() < 1:
246
+ return (0, 0, 0., 0., 0., 0., 0., 0., 0., 0.)
247
+
248
+ ind_in = ind & (f <= eCrit)
249
+ ind_out = ind & (f > eCrit)
250
+ TN = ind_out.sum()
251
+ FP = ind_in.sum()
252
+
253
+ # by default all non-members are aliens
254
+ roles[ind] = 3
255
+
256
+ # Step 1. Sort all distances
257
+ fs = f[ind]
258
+ indv = fs.argsort()
259
+ fp = fs[indv]
260
+ ind_num = np.where(ind)[0]
261
+
262
+ # Step 2. Try to fit the non-central chi-square
263
+ I = len(indv)
264
+ Disc = -1
265
+ n = 0
266
+ m = 0
267
+ d = 0
268
+ M1 = 0
269
+
270
+ while Disc < 0:
271
+ if n > 0:
272
+ # sample with largest f does not fit, so we change its rolw to "external"
273
+ # and amend the number of aliens and externals
274
+ roles[ind_num[indv[I - 1]]] = 4
275
+
276
+ # then we remove this sample from the temporary vector and
277
+ # assess the next biggest
278
+ I = I - 1
279
+ indv = indv[:-1]
280
+ fp = fp[:-1]
281
+
282
+ # compute parameters for moments squared equation and
283
+ # its discriminant
284
+ m = fp.mean()
285
+ d = fp.var(ddof = 1)
286
+ M1 = d / (m * m)
287
+ Disc = 4 - 2 * k * M1
288
+ n = n + 1
289
+
290
+ # Step 3. Calculate x by Eq. (18)
291
+ x = (2 + math.sqrt(Disc)) / M1
292
+
293
+ # Calculate s and f'0 by Eq. (19)
294
+ s = (x - k)
295
+ f0 = m / x
296
+
297
+
298
+ # Calculate: z, hz, r, p, Mz, Sz by Eq. (20)
299
+ z = eCrit / f0
300
+ hz = 1 - 2 * (k + s) * (k + 3 * s) / (3 * (k + 2 * s)**2)
301
+ r = (hz - 1) * (1 - 3 * hz)
302
+ p = (k + 2 * s) / (k + s)**2
303
+ Mz = 1 + hz * p * (hz - 1 - 0.5 * (2 - hz) * r * p)
304
+ Sz = hz * math.sqrt(2 * p) * (1 + 0.5 * r * p)
305
+
306
+ # Step 4. If α is given then calculate β by Eq. (21)
307
+ beta = norm.cdf((math.pow(z / (k + s), hz) - Mz) / Sz)
308
+ return (TN, FP, beta, s, f0, hz, Mz, Sz, k, m)
309
+
310
+
311
+ ####################################
312
+ # DDSIMCARes class #
313
+ ####################################
314
+
315
class DDSIMCARes:
    """
    A class to hold and process results from the DDSIMCA model predictions. Do not use it manually, it is used by the 'predict' method from the DDSIMCA class.

    Methods
    -------
    `select_ncomp()`
        Sets (selects) optimal number of components.
    `summary()`
        Prints a summary with main figures of merits and corresponding statistics.
    `as_df()`
        Returns data frame with role, decision and distances for every object.
    `plotFoM()`
        Show a plot with selected figure of merit vs. number of components.
    `plotDistance()`
        Shows a plot with score, orthogonal or full distance vs object index.
    `plotAcceptance()`
        Shows the acceptance plot.
    `plotScores()`
        Shows a scores plot.
    `plotExtremes()`
        Shows extremes plot.
    `plotAliens()`
        Shows aliens plot.
    `plotSelectivity()`
        Shows expected sensitivity vs. (1 - selectivity) curve.
    """

    def __init__(self, target_class, hParams, qParams, fParams, center, scale, alpha, gamma,
                 lim_type, I, H, Q, T, E, classes, labels, ncomp_selected):
        """
        Store distances and compute roles, decisions and figures of merit for every number of components.

        Parameters:
        -----------
        `target_class`: name of the target class.
        `hParams`, `qParams`, `fParams`: tuples `(u0, Nu)` with distribution parameters of score,
            orthogonal and full distances, each indexable by component.
        `center`, `scale`: stored as is.
        `alpha`: significance level for extremes (`CLe = 1 - alpha`).
        `gamma`: significance level for outliers (`CLo = (1 - gamma)^(1 / I)`).
        `lim_type`: name of the limit type, stored as is and shown in the summary.
        `I`: exponent denominator for `CLo` (presumably number of objects in the training set - confirm with caller).
        `H`, `Q`: matrices (nrows x ncomp) with score and orthogonal distances.
        `T`, `E`: scores and residuals, stored as is.
        `classes`: reference class labels for every object or `None`/empty if they are unknown.
        `labels`: object labels.
        `ncomp_selected`: selected (optimal) number of components.
        """

        nrows, ncomp = Q.shape
        h0, Nh = hParams
        q0, Nq = qParams
        f0, Nf = fParams

        self.nrows = nrows
        self.ncomp = ncomp
        self.ncomp_selected = ncomp_selected
        self.center = center
        self.scale = scale

        self.T = T
        self.E = E
        self.Q = Q
        self.H = H
        self.F = (H / h0) * Nh + (Q / q0) * Nq

        self.hParams = hParams
        self.qParams = qParams
        self.fParams = fParams
        self.labels = labels

        self.has_classes = classes is not None and len(classes) > 0
        self.classes = np.unique(classes) if self.has_classes else []
        self.class_labels = classes
        self.target_class = target_class

        # confidence levels and critical limits
        self.alpha = alpha
        self.gamma = gamma
        self.lim_type = lim_type

        self.CLe = 1 - alpha
        self.CLo = (1 - gamma)**(1.0 / I)

        if self.has_classes:
            # np.asarray makes the comparison element-wise even for a plain list of labels
            ind_members = np.asarray(classes) == target_class
            ind_strangers = ~ind_members
            ind_unknowns = np.full(nrows, False)
            num_members = int(ind_members.sum())
            num_strangers = int(ind_strangers.sum())
            num_unknowns = 0
        else:
            ind_members = np.full(nrows, False)
            ind_strangers = np.full(nrows, False)
            ind_unknowns = np.full(nrows, True)
            num_members = 0
            num_strangers = 0
            num_unknowns = nrows

        # outcomes
        outcomes = [None] * ncomp
        self.R = np.zeros((self.nrows, self.ncomp), dtype=np.int16) # matrix with roles
        self.D = np.full((self.nrows, self.ncomp), False)           # matrix with decisions

        for a in range(ncomp):
            eCrit, oCrit = get_limits(f0[a], Nf[a], self.CLe, self.CLo)
            f = self.F[:, a]
            roles = np.zeros(nrows, dtype=np.int16)

            # same (inclusive) rule as used for counting TP and FP below
            decisions = f <= eCrit

            TP, FN = process_members(f, eCrit, oCrit, roles, ind_members)
            TN, FP, beta, s, f0t, hz, Mz, Sz, k, m = process_strangers(f, Nf[a], eCrit, roles, ind_strangers)

            if num_unknowns > 0:
                # called only to assign roles, statistics are not used
                _ = process_strangers(f, Nf[a], eCrit, roles, ind_unknowns)

            sens = TP / (TP + FN) if num_members > 0 else 0

            if num_strangers > 0:
                spec = TN / (TN + FP)
                sel = 1 - beta
            else:
                spec = 0
                sel = 0

            if num_strangers > 0 and num_members > 0:
                eff = math.sqrt(sens * spec)
                acc = (TP + TN) / nrows
            else:
                eff = 0
                acc = 0

            num_in = np.sum(decisions)
            num_out = nrows - num_in
            outcomes[a] = {"PCs": a + 1, "eCrit": eCrit, "oCrit": oCrit, "TP": TP, "FN": FN, "TN": TN, "FP": FP,
                           "beta": beta, "s": s, "f0t": f0t, "hz": hz, "Mz": Mz, "Sz": Sz, "k": k, "m": m,
                           "in": num_in, "out": num_out, "sens": sens, "spec": spec, "sel": sel, "acc": acc,
                           "eff": eff}

            self.R[:, a] = roles
            self.D[:, a] = decisions

        self.num_members = num_members
        self.num_strangers = num_strangers
        self.num_unknowns = num_unknowns

        self.ind_members = ind_members
        self.ind_strangers = ind_strangers
        self.ind_unknowns = ind_unknowns

        self.outcomes = pd.DataFrame(outcomes)

    def _resolve_ncomp(self, ncomp:int|None) -> int:
        """ Return `ncomp` (or the selected number of components if it is `None`) after a range check. """
        if ncomp is None:
            ncomp = self.ncomp_selected

        if (ncomp < 1) or (ncomp > self.ncomp):
            raise ValueError(f"Wrong value for parameter 'ncomp' (should be between 1 and {self.ncomp}).")

        return ncomp

    def select_ncomp(self, ncomp:int):
        """ Change (select) optimal number of components """
        if ncomp < 1 or ncomp > self.ncomp:
            raise ValueError(f"Wrong value for 'ncomp' parameter (must be between 1 and {self.ncomp}).")
        self.ncomp_selected = ncomp

    def summary(self):
        """
        Prints a summary of the statistics of the model predictions.

        Shows main settings, number of objects (and members/strangers if class labels were provided)
        and a table with critical limits, decisions and available figures of merit for each number
        of components.
        """

        # title is printed in bold using ANSI escape codes
        print('\033[1m', end = "")
        print("DDSIMCA results:\n")
        print('\033[0m', end = "")

        print(f"- number of components (total): {self.ncomp}")
        print(f"- number of components (selected): {self.ncomp_selected}")
        print(f"- limit type: {self.lim_type}")
        print(f"- alpha: {self.alpha:.3f}")
        print(f"- gamma: {self.gamma:.3f}\n")

        if (self.num_unknowns > 0):
            print( "- class labels: not provided")
            print(f"- number of objects: {self.nrows}")
        else:
            print( "- class labels: provided")
            print(f"- number of objects: {self.nrows}")
            print(f"- number of members: {self.num_members}")
            print(f"- number of strangers: {self.num_strangers}")

        print("")

        columns = ["PCs", "eCrit", "oCrit", "in", "out"]

        if self.num_members > 0:
            columns.extend(["TP", "FN", "sens"])
        if self.num_strangers > 0:
            columns.extend(["TN", "FP", "spec", "sel"])
        if self.num_members > 0 and self.num_strangers > 0:
            columns.extend(["acc", "eff"])

        out = self.outcomes[columns].round(3)
        print(out.to_string(index = False))

    def as_df(self, ncomp:int|None = None):
        """
        Return distance values, decisions and roles for each object as data frame
        (for the selected number of components if `ncomp` is not provided).
        """

        ncomp = self._resolve_ncomp(ncomp)
        a = ncomp - 1

        return pd.DataFrame({
            "class": self.class_labels,
            "decision": ["in" if v else "out" for v in self.D[:, a]],
            "role": [ind_to_roles[int(v)] for v in self.R[:, a]],
            "h": self.H[:, a],
            "q": self.Q[:, a],
            "f": self.F[:, a]
        }, index = self.labels)

    def plotFoM(self, ax:Axes, fom = 'sens', color:str|None = None, marker:str = 'o', label:str|None = None,
                show_ci:bool = False):
        """
        Show a plot with selected figure of merit vs. number of components.

        Parameters:
        -----------
        `ax`: matplotlib Axes instance.
        `fom`: figure of merit to show (`"sens"`, `"spec"`, `"sel"`, `"acc"` or `"eff"`).
        `color`, `marker`, `label`: appearance of the series (default color depends on `fom`, default label is `fom`).
        `show_ci`: logical, show or not 95% interval for expected sensitivity (only for `fom = "sens"`).

        Raises `ValueError` if reference class labels were not provided or `fom` is wrong.
        """

        if not self.has_classes:
            raise ValueError("This results object does not have figures of merit as reference class labels were not provided.")

        if fom not in ["sens", "spec", "sel", "acc", "eff"]:
            raise ValueError("Wrong value for parameter 'fom'.")

        if color is None:
            color = COLORS_FOMS[fom]

        if label is None:
            label = fom

        plot_compstats(ax, self.outcomes[fom], color = color, marker = marker, label = label)

        # show confidence interval for sensitivity; sensitivity is computed for
        # class members only, so their number is the number of binomial trials
        if fom == "sens" and show_ci and self.num_members > 0:
            n = self.num_members
            p = 1 - self.alpha
            lo = binom.ppf(0.025, n, p) / n
            up = binom.ppf(0.975, n, p) / n
            rect = mpatches.Rectangle((0, lo), self.ncomp + 1, up - lo, facecolor = "#00000020")
            ax.add_patch(rect)

        ax.set_title("Figures of merit")
        ax.set_ylim((0, 1.1))
        ax.set_ylabel('')
        ax.legend()

    def plotDistance(self, ax:Axes, ncomp:int|None = None, distance:str="q",
                     colors:dict|None = None, show_labels:bool = False, show_crit:bool = True):
        """
        Plots the specified type of distance for the data points as a bar plot.

        Parameters
        ----------
        `ax`: matplotlib Axes instance.
        `ncomp`: number of components (selected number is used by default).
        `distance`: which distance to show, `"q"` (residual), `"h"` (score) or `"f"` (full).
        `colors`: dictionary with color for every class (only used if class labels are available).
        `show_labels`: logical, show or not object labels on top of each bar.
        `show_crit`: logical, show or not critical limit for extremes (only for full distance).

        Raises `ValueError` for wrong `distance` or `ncomp` values or if number of colors is too small.
        """

        if distance not in ('q', 'h', 'f'):
            raise ValueError("Invalid distance type specified. Choose 'q', 'h', or 'f'.")

        ncomp = self._resolve_ncomp(ncomp)

        if self.has_classes:
            nclasses = len(self.classes)
            if colors is None:
                colors = get_group_colors(self.classes)
            elif len(colors) < nclasses:
                raise ValueError(f"Colors for each of the {nclasses} must be provided.")

        distances = {'q': self.Q, 'h': self.H, 'f': self.F}[distance][:, ncomp - 1]
        title_map = {'q': "Residual", 'h': "Score", 'f': "Full"}

        x = np.arange(len(distances))

        if self.has_classes:
            g = np.asarray(self.class_labels)
            for c in self.classes:
                members = g == c
                ax.bar(x[members], distances[members], color = colors[c], label = c)
            # legend makes sense only when series have labels
            ax.legend()
        else:
            ax.bar(x, distances, color = MAIN_COLOR)

        if show_labels:
            dy = np.max(distances) * 0.05
            plot_labels(ax, x, distances, self.labels, dy)

        if show_crit and distance == "f":
            fCrit = self.outcomes["eCrit"][ncomp - 1]
            ax.axhline(fCrit, color="#a0a0a0", linewidth = 0.75, zorder = 1)

        ax.set_title(f"{title_map[distance]} distance")
        ax.set_ylabel(f"{distance}-distance")
        ax.set_xlabel("Objects")

    def plotAcceptance(self, ax:Axes, ncomp:int|None=None, do_log:bool=False,
                       show_labels:bool = False, show:str = ""):
        """
        Plots an acceptance graph showing scaled explained and residual distances and the decision boundary.

        Parameters
        ----------
        ax : matplotlib Axes
            Axes to draw the plot on.
        ncomp : int, optional
            Number of components (selected number is used by default).
        do_log : bool, optional
            Whether to plot the original distances or log-transformed (log(1 + u)).
        show_labels : bool, optional
            Logical, show or not object labels next to the points.
        show : str, optional
            Which objects to show: "all", "members" or "strangers". By default it is chosen
            based on which kinds of objects are available.

        Raises
        ------
        ValueError
            If `show` or `ncomp` has a wrong value or there are no objects of the requested kind.

        """

        if show == "":
            if self.num_unknowns > 0 or (self.num_members > 0 and self.num_strangers > 0):
                show = "all"
            elif self.num_members > 0:
                show = "members"
            else:
                show = "strangers"

        if show not in ["all", "members", "strangers"]:
            raise ValueError("Wrong value for parameter 'show', use: 'all', 'members' or 'strangers'")

        ncomp = self._resolve_ncomp(ncomp)
        a = ncomp - 1

        marker = "s" if self.num_unknowns > 0 else "o"

        roles = [ind_to_roles[int(i)] for i in self.R[:, a]]
        decisions = [ind_to_decisions[int(i)] for i in self.D[:, a]]

        h0 = self.hParams[0][a]
        q0 = self.qParams[0][a]
        Nh = self.hParams[1][a]
        Nq = self.qParams[1][a]

        h = self.H[:, a]
        q = self.Q[:, a]

        h_scaled = np.log1p(h / h0) if do_log else h / h0
        q_scaled = np.log1p(q / q0) if do_log else q / q0

        if show == "all" or self.num_unknowns > 0:
            # groups are classes, or decisions when class labels are unknown
            x = h_scaled
            y = q_scaled
            g = self.class_labels if self.num_unknowns == 0 else decisions
            l = self.labels
            col = get_group_colors(self.classes) if self.has_classes else COLORS_DECISIONS
            show_outliers_boundary = False
        elif show == "members" or (self.num_members > 0 and self.num_strangers == 0):
            x = h_scaled[self.ind_members]
            y = q_scaled[self.ind_members]
            g = list(itertools.compress(roles, self.ind_members))
            l = list(itertools.compress(self.labels, self.ind_members))
            col = COLORS_ROLES
            show_outliers_boundary = True
        else:
            x = h_scaled[self.ind_strangers]
            y = q_scaled[self.ind_strangers]
            g = list(itertools.compress(roles, self.ind_strangers))
            l = list(itertools.compress(self.labels, self.ind_strangers))
            col = COLORS_ROLES
            show_outliers_boundary = False

        if len(x) == 0:
            raise ValueError("There are no objects to show for the selected value of parameter 'show'.")

        gu = np.unique(g)
        dy = (np.max(y) - np.min(y)) * 0.05

        for c in gu:
            class_points = [(x[j], y[j], l[j]) for j, gl in enumerate(g) if gl == c]
            cx, cy, cl = zip(*class_points)
            ax.scatter(cx, cy, label=c, marker=marker, edgecolors=col[c], facecolors='none')
            if show_labels:
                plot_labels(ax, cx, cy, cl, dy)

        # decision and outliers boundaries: straight lines Nh * x + Nq * y = fCrit
        fCritE = self.outcomes["eCrit"][a]
        xqe = np.linspace(0, fCritE / Nh, 200)
        yqe = (fCritE - xqe * Nh) / Nq

        fCritO = self.outcomes["oCrit"][a]
        xqo = np.linspace(0, fCritO / Nh, 200)
        yqo = (fCritO - xqo * Nh) / Nq

        if do_log:
            xqe = np.log1p(xqe)
            yqe = np.log1p(yqe)
            xqo = np.log1p(xqo)
            yqo = np.log1p(yqo)

        ax.plot(xqe, yqe, 'k--', linewidth=0.5)

        if show_outliers_boundary:
            ax.plot(xqo, yqo, 'k:', linewidth=0.5)

        plot_grid(ax)
        ax.legend()
        ax.set_title(f"Acceptance plot (A = {ncomp})")

        if do_log:
            ax.set_xlabel("Explained distance, log(1 + h/h0)")
            ax.set_ylabel("Residual distance, log(1 + q/q0)")
        else:
            ax.set_xlabel("Explained distance, h/h0")
            ax.set_ylabel("Residual distance, q/q0")

        y_max = np.max([np.max(y), np.max(yqo)])
        ax.set_ylim((0, y_max * 1.15))

    def plotScores(self, ax:Axes, comp:tuple = (1,), type = "p", color:str = 'tab:blue',
                   marker:str = 'o', show_labels:bool = False, label:str|None = None):
        """
        Shows scores plot.

        Parameters:
        -----------
        `ax`: matplotlib Axis instance (e.g. from `subplot()`)
        `comp`: which components to show scores for (tuple with two numbers or just one value).
        `type`: plot type (`"p"` for scatter, `"l"` for line, `"h"` for barplot).
        `color`: color of the plot series elements.
        `marker`: marker symbol for scatter plot.
        `show_labels`: logical, show or not object labels as data points labels.
        `label`: label of the series (for line and bar plots the component name is used by default).

        Raises `ValueError` if `type` has a wrong value.
        """

        if type not in ("p", "l", "h"):
            raise ValueError("Wrong value for parameter 'type', use: 'p', 'l' or 'h'.")

        if not isinstance(comp, tuple):
            comp = (comp, )

        if type == "p" and len(comp) > 1:
            # scores vs. scores
            x = self.T[:, comp[0] - 1]
            y = self.T[:, comp[1] - 1]
            xlab = f"PC{comp[0]}"
            ylab = f"PC{comp[1]}"
        else:
            # scores vs. object number
            x = list(range(1, self.nrows + 1))
            y = self.T[:, comp[0] - 1]
            xlab = "Objects"
            ylab = f"PC{comp[0]}"

        if type == "p":
            ax.plot(x, y, linestyle='None', marker = marker, markeredgecolor = color,
                    markerfacecolor = "#ffffff00", label = label)
        else:
            if label is None:
                label = f"PC{comp[0]}"
            if type == "l":
                ax.plot(x, y, color = color, label = label)
            else:
                ax.bar(x, y, color = color, label = label)

        ax.set_xlabel(xlab)
        ax.set_ylabel(ylab)

        plot_grid(ax)
        plot_axes(ax, type = type)

        if show_labels:
            dy = (np.max(y) - np.min(y)) * 0.05
            plot_labels(ax, x, y, self.labels, dy)
            # make room for labels of the topmost points
            ylim = ax.get_ylim()
            ax.set_ylim((ylim[0], ylim[1] + dy * 2))
        ax.set_title("Scores")

    def plotExtremes(self, ax:Axes, ncomp:int|None = None, label:str = "", show_ellipse:bool = True,
                     color = "#ffffff00", edgecolors:str = MAIN_COLOR, marker:str = "o"):
        """
        Show extremes plot (number of observed extremes vs expected) for class members.

        Raises `ValueError` if there are no target class members or `ncomp` is wrong.
        """

        ncomp = self._resolve_ncomp(ncomp)

        if self.num_members < 1:
            raise ValueError("This result object does not have target class members.")

        f = self.F[self.ind_members, ncomp - 1]
        n = len(f)

        # expected number of extremes i = 1..n corresponds to significance level i / n
        expected = np.arange(1, n + 1)
        alpha = expected / n

        # tolerance area: expected +/- 2 * sqrt(expected * (1 - alpha))
        D = 2 * np.sqrt(expected * (1 - alpha))
        Nm = expected - D
        Np = expected + D

        # an object is extreme for given alpha if its upper tail probability is below alpha
        Nf = self.fParams[1][ncomp - 1]
        q = 1 - chi2.cdf(f, Nf)
        observed = np.sum(q[:, None] < alpha[None, :], axis=0)

        if show_ellipse:
            line_color = "#00000010"
            ax.plot(expected, Nm, color = line_color)
            ax.plot(expected, Np, color = line_color)
            ax.plot(expected, expected, color = line_color)
            # drawn on `ax`, not on whatever axes happen to be current in pyplot
            ax.vlines(expected, Nm, Np, color = line_color)

        ax.scatter(expected, observed, marker = marker, color = color, edgecolors = edgecolors,
                   label = label)

        ax.set_xlabel("Number of extremes (expected)")
        ax.set_ylabel("Number of extremes (observed)")
        ax.set_title(f"Extremes (A = {ncomp})")
        plot_grid(ax)

    def plotAliens(self, ax:Axes, ncomp:int|None = None, label:str = "", show_ellipse:bool = True,
                   color = "#ffffff00", edgecolors:str = MAIN_COLOR, marker:str = "o"):
        """
        Show aliens plot (number of observed aliens vs expected) for objects from non-target classes.

        Only objects with role "alien" are used (externals are excluded).

        Raises `ValueError` if there are no objects from non-target classes or `ncomp` is wrong.
        """

        if self.num_strangers < 1:
            raise ValueError("This result object does not have objects from non-target classes.")

        ncomp = self._resolve_ncomp(ncomp)

        roles = self.R[:, ncomp - 1]
        f = self.F[(self.ind_strangers) & (roles == 3), ncomp - 1]
        n = len(f)

        # expected number of accepted aliens i = 1..n corresponds to beta = i / n
        expected = np.arange(1, n + 1)
        beta = expected / n

        # tolerance area: expected +/- 2 * sqrt(expected * (1 - beta))
        D = 2 * np.sqrt(expected * (1 - beta))
        Nm = expected - D
        Np = expected + D

        m = self.outcomes["m"][ncomp - 1]
        Mz = self.outcomes["Mz"][ncomp - 1]
        hz = self.outcomes["hz"][ncomp - 1]
        Sz = self.outcomes["Sz"][ncomp - 1]
        zb = norm.ppf(beta)

        # critical limits which give the required beta values
        # (np.power is used as np.pow is only available in NumPy 2.0 and newer)
        eCrit = m * np.power(Sz * zb + Mz, 1.0 / hz)
        observed = np.sum(f[:, None] < eCrit[None, :], axis=0)

        if show_ellipse:
            line_color = "#00000010"
            ax.plot(expected, Nm, color = line_color)
            ax.plot(expected, Np, color = line_color)
            ax.plot(expected, expected, color = line_color)
            # drawn on `ax`, not on whatever axes happen to be current in pyplot
            ax.vlines(expected, Nm, Np, color = line_color)

        ax.scatter(expected, observed, marker = marker, color = color, edgecolors = edgecolors,
                   label = label)

        ax.set_xlabel("Number of aliens (expected)")
        ax.set_ylabel("Number of aliens (observed)")
        ax.set_title(f"Aliens (A = {ncomp})")
        plot_grid(ax)

    def plotSelectivity(self, ax:Axes, ncomp:int|None = None):
        """
        Show expected sensitivity (1 - alpha) vs. probability of Type II error (beta) curve
        and report the area under this curve in the plot title.

        Raises `ValueError` if there are no objects from non-target classes or `ncomp` is wrong.
        """

        if self.num_strangers < 1:
            raise ValueError("This result object does not have objects from non-target classes.")

        ncomp = self._resolve_ncomp(ncomp)
        a = ncomp - 1

        s = self.outcomes["s"][a]
        f0t = self.outcomes["f0t"][a]
        k = self.outcomes["k"][a]
        Mz = self.outcomes["Mz"][a]
        hz = self.outcomes["hz"][a]
        Sz = self.outcomes["Sz"][a]

        norm1 = 1. / (k + s)
        norm2 = 1. / Sz

        # beta as a function of alpha on a regular grid with step 0.001
        step = 0.001
        alpha = np.arange(0, 1, step)
        fcrit = chi2.ppf(1 - alpha, k)
        z = fcrit / f0t
        beta = norm.cdf((np.power(z * norm1, hz) - Mz) * norm2)

        # integral of beta over alpha is the area to the left of the curve
        auc = 1 - np.sum(beta * step)

        plot_grid(ax)
        ax.plot(beta, 1 - alpha)
        ax.set_xlabel("1 - selectivity, β")
        ax.set_ylabel("Expected sensitivity, 1 - ɑ")
        ax.set_title(f"Selectivity (A = {ncomp}, AUC = {auc:.4f})")
949
+
950
+
951
+ ####################################
952
+ # DDSIMCA class #
953
+ ####################################
954
+
955
+ class DDSIMCA:
956
+ """
957
+ A Data Driven SIMCA model class.
958
+
959
+ Parameters:
960
+ -----------
961
+ `target_class`: the name of the target class the model should be trained for.
962
+ """
963
+
964
+ def __init__(self, target_class:str):
965
+ # set main model parameters
966
+ self.target_class = target_class
967
+ self.status = "init"
968
+
969
+
970
def get_distances(self, X:Array2D) -> tuple[Array2D, Array2D, Array2D, Array2D]:
    """
    Preprocess X, project it to PC space and compute score and orthogonal distances
    for each number of components.

    Parameters:
    -----------
    `X`: matrix (2D array) with data values.

    Returns:
    --------
    tuple with four matrices (2D arrays):
    `H`: score distances (nrows x ncomp).
    `Q`: orthogonal distances (nrows x ncomp).
    `T`: scores.
    `E`: residuals computed with all `ncomp` components.
    """
    # center and scale with the values kept in the model
    X = (X - self.center_values) / (self.scale_values + EPSILON)
    T = X @ self.V

    n_objects = X.shape[0]
    H = np.zeros((n_objects, self.ncomp))
    Q = np.zeros((n_objects, self.ncomp))

    # first component
    E = X - T[:, :1] @ self.V[:, :1].T
    H[:, :1] = T[:, :1] * T[:, :1] / self.eigenvals[0]
    Q[:, 0] = (E ** 2).sum(axis=1)

    # remaining components (zero-based index): score distance is accumulated,
    # residuals are recomputed with one more component on every step
    for idx in range(1, self.ncomp):
        used = idx + 1
        E = X - T[:, :used] @ self.V[:, :used].T
        H[:, idx] = H[:, idx - 1] + T[:, idx] * T[:, idx] / self.eigenvals[idx]
        Q[:, idx] = (E ** 2).sum(axis=1)

    return (H, Q, T, E)
1007
+
1008
+
1009
+ def train(self, data:pd.DataFrame, ncomp:int, center:bool = True, scale:bool = False):
1010
+ """
1011
+ Train DDSIMCA model.
1012
+
1013
+ Parameters:
1014
+ -----------
1015
+ `data`: data frame (Pandas) with training set, first column should contain target class label.
1016
+ `ncomp`: number of components to compute (optimal number can be selected later).
1017
+ `center`: logical, mean center or not data variables.
1018
+ `scale`: logical, standardize or not data variables.
1019
+
1020
+ Raises:
1021
+ -------
1022
+ `ValueError`: if data frame has wrong dimension, no column with class labels or wrong values for this column.
1023
+
1024
+ """
1025
+
1026
+ class_labels = data.iloc[:, 0]
1027
+ classes = sorted(class_labels.unique())
1028
+ if len(classes) != 1 or classes[0] != self.target_class:
1029
+ raise ValueError(f"First column of data frame must content target class name ('{self.target_class}').")
1030
+
1031
+ X = data.iloc[:, 1:].values.astype(np.float64)
1032
+ nrows = X.shape[0]
1033
+ ncols = X.shape[1]
1034
+
1035
+ if ncomp < 1 or ncomp > ncols or ncomp > nrows - 1:
1036
+ raise ValueError(f"Dataset size {nrows}x{ncols} does not match the number of components ({self.ncomp}).")
1037
+
1038
+ self.ncomp = ncomp
1039
+ self.center_values = X.mean(axis=0) if center else np.zeros(X.shape[1])
1040
+ self.scale_values = X.std(axis=0, ddof = 1.) if scale else np.ones(X.shape[1]) - EPSILON
1041
+
1042
+ self.varlabels = data.columns[1:]
1043
+
1044
+ varvalues = pd.to_numeric(self.varlabels, errors="coerce")
1045
+ self.varvalues = varvalues if varvalues.notna().all() else range(1, ncols + 1)
1046
+
1047
+ _, s, V = np.linalg.svd((X - self.center_values) / (self.scale_values + EPSILON), full_matrices=False)
1048
+ self.V = np.transpose(V)[:, :ncomp]
1049
+ self.eigenvals = s[:ncomp]**2 / (nrows - 1)
1050
+
1051
+ H, Q, _, _ = self.get_distances(X)
1052
+
1053
+ h0c, Nhc = get_distparams(H, type = "classic")
1054
+ q0c, Nqc = get_distparams(Q, type = "classic")
1055
+ h0r, Nhr = get_distparams(H, type = "robust")
1056
+ q0r, Nqr = get_distparams(Q, type = "robust")
1057
+
1058
+ Nfc = Nhc + Nqc
1059
+ Nfr = Nhr + Nqr
1060
+ f0c = Nfc
1061
+ f0r = Nfr
1062
+
1063
+ self.center = center
1064
+ self.scale = scale
1065
+ self.ncomp_selected = self.ncomp
1066
+
1067
+ self.nrows = nrows
1068
+ self.ncols = ncols
1069
+ self.hParams = {"classic": (h0c, Nhc), "robust": (h0r, Nhr)}
1070
+ self.qParams = {"classic": (q0c, Nqc), "robust": (q0r, Nqr)}
1071
+ self.fParams = {"classic": (f0c, Nfc), "robust": (f0r, Nfr)}
1072
+ self.status = "trained"
1073
+
1074
+
1075
+ def predict(self, data:pd.DataFrame, lim_type:str = 'classic', alpha:float=0.05,
1076
+ gamma:float=0.01) -> DDSIMCARes:
1077
+ """
1078
+ Apply DDSIMCA model to a new dataset.
1079
+
1080
+ Parameters
1081
+ ----------
1082
+ data_path : str
1083
+ Path to the dataset with images on which to perform predictions.
1084
+ alpha : float, optional
1085
+ Significance level to define expected sensitivity of the model.
1086
+ crit : str, optional
1087
+ Which distance to use for classification (use 'f': full, 'q': residual, 'h': explained).
1088
+
1089
+ Returns
1090
+ -------
1091
+ VAESIMCARes
1092
+ A VAESIMCARes object containing the predictions and statistical analysis.
1093
+ """
1094
+
1095
+ if alpha < 0.00001 or alpha > 0.999999:
1096
+ raise ValueError("Wrong value for parameter 'alpha' (must be between 0.00001 and 0.999999).")
1097
+
1098
+ if data.shape[1] < self.ncols or data.shape[1] > self.ncols + 1:
1099
+ raise ValueError(f"Wrong number of columns in the data frame (expected to be {self.ncols + 1} if first column contains class labels or {self.ncols} if not).")
1100
+
1101
+ has_classes = data.shape[1] == self.ncols + 1
1102
+
1103
+ if has_classes:
1104
+ classes = data.iloc[:, 0]
1105
+ X = data.iloc[:, 1:].values.astype(np.float64)
1106
+ else:
1107
+ classes = None
1108
+ X = data.values.astype(np.float64)
1109
+
1110
+ labels = list(data.index)
1111
+ H, Q, T, E = self.get_distances(X)
1112
+
1113
+
1114
+ return DDSIMCARes(
1115
+ self.target_class,
1116
+ self.hParams[lim_type],
1117
+ self.qParams[lim_type],
1118
+ self.fParams[lim_type],
1119
+ self.center,
1120
+ self.scale,
1121
+ alpha,
1122
+ gamma,
1123
+ lim_type,
1124
+ self.nrows,
1125
+ H, Q, T, E,
1126
+ classes,
1127
+ labels,
1128
+ self.ncomp_selected
1129
+ )
1130
+
1131
+ def select_ncomp(self, ncomp:int):
1132
+ """ Change (select) optimal number of components """
1133
+ if ncomp < 1 or ncomp > self.ncomp:
1134
+ raise ValueError(f"Wrong value for 'ncomp' parameter (must be between 1 and {self.ncomp}")
1135
+ self.ncomp_selected = ncomp
1136
+
1137
+
1138
+ def summary(self, lim_type:str = 'classic'):
1139
+ """ Show summary information for given type of limits estimator ('classic' or 'robust') """
1140
+
1141
+ if (lim_type != 'classic' and lim_type != 'robust'):
1142
+ raise ValueError("Wrong value for 'lim_type' parameter (must be either 'classic' or 'robust'")
1143
+
1144
+ print('\033[1m', end = "")
1145
+ print("DDSIMCA model:\n")
1146
+ print('\033[0m', end = "")
1147
+
1148
+ if self.status == "init":
1149
+ print("- model has not been trained yet.")
1150
+ return
1151
+
1152
+ scaling_ind = int(self.center) * 2 + int(self.scale)
1153
+ scaling_str = ["none", "standardization", "mean centering", "autoscaling (mean centering + standardization)"]
1154
+
1155
+ print(f"- target class: {self.target_class}")
1156
+ print(f"- number of components (total): {self.ncomp}")
1157
+ print(f"- number of components (optimal): {self.ncomp_selected}")
1158
+ print(f"- number of training samples: {self.nrows}")
1159
+ print(f"- number of variables: {self.ncols}")
1160
+ print(f"- preprocessing: {scaling_str[scaling_ind]}")
1161
+
1162
+ print(f"\nParameters for {lim_type} estimators:\n")
1163
+
1164
+ print('\033[4m', end = "")
1165
+ print("PCs Nh Nq eigenvals")
1166
+ print('\033[0m', end = "")
1167
+
1168
+ _, Nq = self.qParams[lim_type]
1169
+ _, Nh = self.hParams[lim_type]
1170
+ for a in range(self.ncomp):
1171
+ if a == self.ncomp_selected - 1:
1172
+ print('\033[1m', end = "")
1173
+ print(f"{(a + 1):3d} {Nh[a]:5.0f} {Nq[a]:5.0f} {self.eigenvals[a]:11.3f}")
1174
+ if a == self.ncomp_selected - 1:
1175
+ print('\033[0m', end = "")
1176
+
1177
+
1178
+ def plotLoadings(self, ax:Axes, comp:tuple = (1,), type = "p", color:str = 'tab:blue',
1179
+ marker:str = 'o', show_labels:bool = False):
1180
+ """
1181
+ Shows loadings plot.
1182
+
1183
+ Parameters:
1184
+ -----------
1185
+ `ax`: matplotlib Axis instance (e.g. from `subplot()`)
1186
+ `comp`: which components to show loadings for (tuple with two numbers or just one value).
1187
+ `type`: plot type (`"p"` for scatter, `"l"` for line, `"h"` for barplot).
1188
+ `color`: color of the plot series elements.
1189
+ `marker`: marker symbol for scatter plot.
1190
+ `show_labels`: logical, show or not variable names as data points labels.
1191
+ """
1192
+
1193
+
1194
+ if not isinstance(comp, tuple):
1195
+ comp = (comp, )
1196
+
1197
+ if type == "p":
1198
+
1199
+ if len(comp) > 1:
1200
+ x = self.V[:, comp[0] - 1]
1201
+ y = self.V[:, comp[1] - 1]
1202
+ xlab = f"PC{comp[0]}"
1203
+ ylab = f"PC{comp[1]}"
1204
+ else:
1205
+ x = self.varvalues
1206
+ y = self.V[:, comp[0] - 1]
1207
+ xlab = f"Variables"
1208
+ ylab = f"PC{comp[0]}"
1209
+
1210
+ ax.grid(True, zorder=-1, linestyle = "--", color = "#e0e0e0")
1211
+ ax.plot(x, y,linestyle='None', marker = marker, markeredgecolor = color, markerfacecolor = "#ffffff00")
1212
+
1213
+ elif type == "l":
1214
+ x = self.varvalues
1215
+ y = self.V[:, comp[0] - 1]
1216
+ xlab = f"Variables"
1217
+ ylab = f"PC{comp[0]}"
1218
+ ax.plot(x, y, color = color, label = f"PC{comp[0]}")
1219
+
1220
+ elif type == "h":
1221
+ x = range(self.ncols)
1222
+ y = self.V[:, comp[0] - 1]
1223
+ xlab = f"Variables"
1224
+ ylab = f"PC{comp[0]}"
1225
+ ax.bar(x, y, color = color, label = f"PC{comp[0]}")
1226
+
1227
+ ax.set_xlabel(xlab)
1228
+ ax.set_ylabel(ylab)
1229
+
1230
+ plot_grid(ax)
1231
+ plot_axes(ax, type = type)
1232
+
1233
+ if show_labels:
1234
+ plot_labels(ax, x, y, self.varlabels, 0.01)
1235
+
1236
+ ax.set_title("Loadings")
1237
+
1238
+
1239
+ def plotDoF(self, ax, dof:str="Nh", lim_type:str="classic", color:str|None = None, marker:str = 'o',
1240
+ label:str|None = None):
1241
+ """
1242
+ Shows plot with degrees of freedom (Nh, Nq or Nf) vs number of components.
1243
+ """
1244
+
1245
+ colors = {'Nh': 'tab:purple', 'Nq': 'tab:blue', 'Nf':'tab:cyan'}
1246
+
1247
+ if dof not in ["Nh", "Nq", "Nf"]:
1248
+ raise ValueError("Wrong value for parameter 'dof'.")
1249
+
1250
+ if color is None:
1251
+ color = colors[dof]
1252
+
1253
+ if label is None:
1254
+ label = dof
1255
+
1256
+ if dof == "Nh":
1257
+ y = self.hParams[lim_type][1]
1258
+ elif dof == "Nq":
1259
+ y = self.qParams[lim_type][1]
1260
+ else:
1261
+ y = self.fParams[lim_type][1]
1262
+
1263
+ plot_compstats(ax, y, color = color, marker = marker, label = label)
1264
+ ax.set_title(f"Degrees of freedom")
1265
+ ax.set_ylabel('')
1266
+ ax.legend()
1267
+
1268
+
1269
+ def plotEigenvals(self, ax, do_log:bool = False, color:str = 'tab:blue', marker:str = 'o'):
1270
+ """
1271
+ Shows plot with eigenvalus vs number of components.
1272
+ """
1273
+
1274
+ if do_log:
1275
+ y = np.log10(self.eigenvals)
1276
+ ylab = "log10(λ)"
1277
+ else:
1278
+ y = self.eigenvals
1279
+ ylab = "λ"
1280
+
1281
+ plot_compstats(ax, y, color = color, marker = marker, label = '')
1282
+ ax.set_title(f"Eigenvalues")
1283
+ ax.set_ylabel(ylab)
1284
+
1285
+
1286
+
1287
def ddsimca(data:pd.DataFrame, ncomp:int, center:bool=True, scale:bool=False):
    """
    Train Data Driven SIMCA model

    Parameters:
    -----------
    `data`: data frame (Pandas) with training set, first column should contain target class label.
    `ncomp`: number of components to compute (optimal number can be selected later).
    `center`: logical, mean center or not data variables.
    `scale`: logical, standardize or not data variables.

    Returns:
    --------
    Trained `DDSIMCA` model, the target class name is taken from the first column of `data`.

    Raises:
    -------
    `ValueError`: if the first column of `data` does not contain exactly one distinct label
    (other checks are made by `DDSIMCA.train()`).

    The model will compute distance parameters based on the training set for both classic and robust estimators. You can select which estimator to use later, when apply model to a new dataset.

    """
    # only the number of distinct labels matters, so the labels are not sorted:
    # sorting labels of mixed types (or with missing values) fails with TypeError
    classes = data.iloc[:, 0].unique()
    if len(classes) != 1:
        raise ValueError("First column of data frame must contain only target class name")

    target_class = classes[0]
    m = DDSIMCA(target_class)
    m.train(data, ncomp, center = center, scale = scale)
    return m